��Ѷ��

�ĵ��鷴��̨

��/��/��ţ

��

使用Python实现机器学习特征选择的4种方法

��Դ�� - ��THU

作者:Sugandha Lahoti

��룺��

У�ԣ��

��Լ3500�֣���Ķ�13��ӡ�

��У��ǽ��о��ݼ��ѡ��Ĳ�ͬ��;ͬʱͨ��ʹ��Python��Scikit-learn (sklearn)��ʵ��ѡ��㷨��͡�

ע��Ľ�ѡ��Ankit Dixit��ġ��ɻ��ѧϰ��(Ensemble Machine Learning)һ�顣�Ȿ��ǿ��Ļ��ѧϰ�㷨��Ż�ģ�ͣ��Ϊ��ѧ�ߵ�ָ�ϡ�

在本文中,我们将研究从数据集中选择特征的不同方法;同时通过使用Python中Scikit-learn (sklearn)库实现讨论了特征选择算法的类型:

单变量选择

递归特征消除(RFE)

主成分分析(PCA)

选择重要特征(特征重要度)

��Ǽ�Ҫ��ǰ��㷨��ʵ�֡�Ȼ��ǽ��ϸ��ݿ�ѧ��й㷺ʹ�õ�ѡ��Ҫ��(��Ҫ��)��ֵ��ݡ�

单变量选择

ͳ�Ʋ��Կ��ѡ��Щ��ϵ��ǿ��

scikit-learn��ṩ��SelectKBest�࣬��һ�鲻ͬ��ͳ�Ʋ��һ��ʹ�ã��ѡ��ض��

��ʹ��chi?�Ǹ��ͳ�Ʋ��ԣ��Ƥ��ӡ�ڰ��򲡷��ݼ��ѡ��ĸ��õ��:

# Feature extraction with univariate statistical tests
# (chi-squared for classification).

# Import pandas to read csv
import pandas
# Import numpy for array related operations
import numpy
# Import sklearn's feature selection algorithm
from sklearn.feature_selection import SelectKBest
# Import chi2 for performing chi square test
from sklearn.feature_selection import chi2

# URL for loading the dataset
# NOTE(review): confirm this URL is still being served; otherwise point
# `url` at a local copy of pima-indians-diabetes.data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

# Define the attribute names
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Create pandas data frame by loading the data from URL
dataframe = pandas.read_csv(url, names=names)

# Create array from data values
array = dataframe.values

# Split the data into input (first 8 columns) and target (last column)
X = array[:, 0:8]
Y = array[:, 8]

# We will select the 4 best features using chi square
test = SelectKBest(score_func=chi2, k=4)

# Fit the function for ranking the features by score
fit = test.fit(X, Y)

# Summarize scores
numpy.set_printoptions(precision=3)
print(fit.scores_)

# Apply the transformation on to dataset
features = fit.transform(X)

# Summarize selected features (first five rows)
print(features[0:5, :])

��Կ��ÿ��ĵ÷֣��Լ��ѡ��ĸ��(�÷��ߵ�):plas��test��mass��age��

ÿ��ķ��Ϊ��

1. [111.52 1411.887 17.605 53.108 2175.565 127.669 5.393

2. 181.304]

��ѡ��ǣ�

1. [[148. 0. 33.6 50. ]

2. [85. 0. 26.6 31. ]

3. [183. 0. 23.3 32. ]

4. [89. 94. 28.1 21. ]

5. [137. 168. 43.1 33. ]]

递归特征消除(RFE)

RFE�Ĺ��ʽ�ǵݹ��ɾ��ڱ��Ĳ��Ϲ��ģ�͡��ʹ��ģ�;��ж��Щ��(�Լ��Ե��)��Ԥ��Ŀ��scikit-learn��ĵ��˽��RFE��Ϣ��

��ʾ��ʹ��RFE��logistic�ع��㷨��ѡ��ǰ��㷨��ѡ�񲢲��Ҫ��ֻ��Ҫ��һ��:

# Recursive feature elimination (RFE) with a logistic regression model.

# Import pandas to read csv
import pandas
# Import numpy for array related operations
import numpy
# Import sklearn's feature selection algorithm
from sklearn.feature_selection import RFE
# Import LogisticRegression, the estimator RFE uses to rank the features
from sklearn.linear_model import LogisticRegression

# URL for loading the dataset
# NOTE(review): confirm this URL is still being served; otherwise point
# `url` at a local copy of pima-indians-diabetes.data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

# Define the attribute names
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

# Create pandas data frame by loading the data from URL
dataframe = pandas.read_csv(url, names=names)

# Create array from data values
array = dataframe.values

# Split the data into input (first 8 columns) and target (last column)
X = array[:, :8]
Y = array[:, 8]

# Feature extraction
model = LogisticRegression()
# Pass the feature count by keyword: newer scikit-learn releases no longer
# accept it positionally, and the keyword form also works on older ones.
rfe = RFE(model, n_features_to_select=3)
fit = rfe.fit(X, Y)

print("Num Features: %d" % fit.n_features_)
print("Selected Features: %s" % fit.support_)
print("Feature Ranking: %s" % fit.ranking_)

ִ��ǿ��Եõ�:

1. Num Features: 3

2. Selected Features: [ True False False False False True True False]

3. Feature Ranking: [1 2 3 5 6 1 1 4]

��Կ��RFEѡ��ǰ��ԣ��preg��mass��pedi��Щ��support_��б��ΪTrue��ranking_��б��Ϊ��ѡ��Ϊ1��

主成分分析

PCAʹ��Դ��ݼ�ת��Ϊѹ��ʽ��ͨ��Ϊ��һ��Լ��PCA��һ��ǣ��ѡ��ת��е�ά��ɷֵ��

�ڽ��У��ʹ��PCA��ѡ��ɷ�:

# Principal component analysis (PCA) on the diabetes dataset.

# Import pandas to read csv
import pandas
# Import numpy for array related operations
import numpy
# Import sklearn's PCA algorithm
from sklearn.decomposition import PCA

# URL for loading the dataset
# NOTE(review): confirm this URL is still being served; otherwise point
# `url` at a local copy of pima-indians-diabetes.data.
url = "https://archive.ics.uci.edu/ml/machine-learning-databases/pima-indians-diabetes/pima-indians-diabetes.data"

# Define the attribute names
names = ['preg', 'plas', 'pres', 'skin', 'test', 'mass', 'pedi', 'age', 'class']

dataframe = pandas.read_csv(url, names=names)

# Create array from data values
array = dataframe.values

# Split the data into input (first 8 columns) and target (last column)
X = array[:, 0:8]
Y = array[:, 8]

# Feature extraction: keep the first 3 principal components
pca = PCA(n_components=3)
fit = pca.fit(X)

# Summarize components
# The % formatting must happen inside print(): on Python 3 print() returns
# None, so `print("...") % value` raises TypeError.
print("Explained Variance: %s" % fit.explained_variance_ratio_)
print(fit.components_)

��Կ��ת��ݼ�(��ɷ�)��Դ��ݼ��û��֮��:

选择重要特征(特征重要度)

��Ҫ��һ��ѵ��õ��мල��ѡ��ļ��ѵ��(��)ʱ��Ǽ��ÿ��Դ��ָ�;��ǿ��ʹ��Ϊ��ѡ��ϸ�˽�һ�¡�

��ɭ��ԽϺõ�׼ȷ�ԡ�³��Ժ��Զ��Ϊ��ܻ�ӭ�Ļ��ѧϰ��֮һ��ǻ��ṩ��ּ��е��ѡ�񷽷��ֵ��ʺ;�ֵ��׼ȷ�ȡ�

��ɭ��ɡ��е�ÿ��ڵ㶼��һ��ڵ��Ŀ��ǽ��ݼ��ָ��Ա��Ƶ��Ӧֵ��ճ��ͬ�ļ��С�ѡ��(�ֲ�)��Ķ������ڷ��⣬��ͨ��ǻ��ʻ��Ϣ��/�أ��ڻع��Ƿ����ˣ��ѵ��һ��ʱ��ͨ��ÿ��ٵ��м�Ȩ��ʵĶ��㡣��ɭ�֣��Զ�ÿ��ʼ��ƽ��ݸ÷��

下面我们可以一起使用随机森林分类器进行特征选择,并比较特征选择前后的准确性。我们将使用Otto数据集,该数据集可从kaggle免费获得(需要注册kaggle才能下载该数据集)。可以从 https://www.kaggle.com/c/otto-group-product-classification-challenge/data 下载训练集train.csv.zip,然后将解压后的train.csv文件放在您的工作目录中。

��ݼ��˳��61,000��Ʒ��93��ģ��ϸ�ڣ��Щ��Ʒ��ֳ�10��Ʒ��(��磬ʱ��ࡢ��Ӳ�Ʒ��)��ĳ��͵Ĳ�ͬ�¼��ļ��

ѵ��Ŀ��Ƕ��²�Ʒ��Ϊ10��ÿһ��ĸ��Ԥ�⣬��ʹ�ö༶��ʧ��Ҳ��Ϊ��أ��ģ�ͽ��

��ǽ��ӵ��п⿪ʼ:

# Import the supporting libraries

# Import pandas to load the dataset from csv file
from pandas import read_csv
# Import numpy for array based operations and calculations
import numpy as np
# Import Random Forest classifier class from sklearn
from sklearn.ensemble import RandomForestClassifier
# Import feature selector class select model of sklearn
# (one statement; the listing had it broken across two lines)
from sklearn.feature_selection import SelectFromModel

��һ��ڽ��ǵ��ݼ��Ϊѵ��ݺͲ��ݣ��ǽ��ѵ��ݲ��ֶ��ݼ��ѵ��ݲ��ֽ��ѵ��ģ�͵��:

# Function to create Train and Test set from the original dataset
def getTrainTestData(dataset, split):
    """Split *dataset* row-wise into a leading training part and the rest.

    Args:
        dataset: Array-like of rows (e.g. a 2-D numpy array or list of lists).
        split: Fraction of rows, between 0 and 1, placed in the training
            set. The row count is rounded down.

    Returns:
        A ``(training, testing)`` tuple of numpy arrays. The first
        ``floor(split * n_rows)`` rows go to ``training`` and the remaining
        rows to ``testing``. Both are copies, not views of ``dataset``.

    Raises:
        ValueError: If ``split`` is outside the range [0, 1].
    """
    if not 0 <= split <= 1:
        raise ValueError("split must be between 0 and 1, got %r" % (split,))

    rows = np.asarray(dataset)
    # The listing read an undefined `shape` here (the statement deriving it
    # from the dataset was lost) and cast the count to uint16, which wraps
    # around once the training part exceeds 65535 rows. A plain int is safe.
    trainlength = int(np.floor(split * rows.shape[0]))

    # np.array() copies, matching the original append-then-convert loops.
    training = np.array(rows[:trainlength])
    testing = np.array(rows[trainlength:])
    return training, testing

��Ҫ��һ��ģ�͵�׼ȷ�ԣ��Ԥ��ʵ��Ϊ��룬��׼ȷ�ʰٷֱȣ�

# Function to evaluate model performance
def getAccuracy(pre, ytest):
    """Return the fraction of entries in *ytest* matched by *pre*.

    Args:
        pre: Sequence of predicted labels.
        ytest: Sequence of true labels; its length is the denominator.

    Returns:
        A float in [0, 1]: the number of positions ``i`` with
        ``ytest[i] == pre[i]`` divided by ``len(ytest)``. Returns 0.0 when
        ``ytest`` is empty instead of dividing by zero.

    Raises:
        IndexError: If ``pre`` has fewer entries than ``ytest``.
    """
    total = len(ytest)
    # len() rather than truthiness: numpy arrays have no unambiguous
    # truth value.
    if total == 0:
        return 0.0
    if len(pre) < total:
        # zip() would silently truncate; the index-based original raised.
        raise IndexError("pre has fewer entries than ytest")
    matches = sum(1 for predicted, actual in zip(pre, ytest) if actual == predicted)
    return float(matches) / total

��Ҫ��ݼ��ǽ��train.csv�ļ��ļ��61,000��ѵ��ʵ��ǵ�ʾ��ʹ��50000��ʵ��ʹ��35,000��ʵ��ѵ��ʹ��15,000��ʵ��Է��:

# Load dataset as pandas data frame
data = read_csv('train.csv')

# Extract attribute names from the data frame
feat = data.keys()
# Index.get_values() no longer exists in pandas >= 1.0; .values yields the
# same array of column labels on old and new pandas alike.
feat_labels = feat.values

# Extract data values from the data frame
dataset = data.values

# Shuffle the dataset
# NOTE(review): the statement for this step is missing from the listing
# (its numbering jumps from 8 to 10); an in-place row shuffle is the
# presumed original -- confirm.
np.random.shuffle(dataset)

# We will select 50000 instances to train the classifier
inst = 50000

# Extract 50000 instances from the dataset
dataset = dataset[0:inst, :]

# Create Training and Testing data for performance evaluation
train, test = getTrainTestData(dataset, 0.7)

# Split data into input and output variable with selected features
# NOTE(review): columns 0:94 include the leading `id` column (it tops the
# importance ranking printed later in the article); confirm that feeding a
# row identifier to the model is intended.
Xtrain = train[:, 0:94]
ytrain = train[:, 94]
shape = np.shape(Xtrain)

print("Shape of the dataset ", shape)

# Print the size of Data in MBs
print("Size of Data set before feature selection: %.2f MB" % (Xtrain.nbytes / 1e6))

ע��ݴ�С��ǵ��ݼ��Լ35000��ѵ��ʵ��94��ǵ��ݼ��ǳ��һ�£�

1. Shape of the dataset (35000, 94)

2. Size of Data set before feature selection: 26.32 MB

��ǵ��ݼ��35000�к�94�У��ݴ�С��26MB��

��һ��У��ǽ��ǵ��ɭ�ַ��ǻ�ʹ��250��Ϊ30��Ϊ7��sklearn��Ĭ��ֵ:

# Lets select the test data for model evaluation purpose
Xtest = test[:, 0:94]
ytest = test[:, 94]

# Create a random forest classifier with the following Parameters
trees = 250
max_feat = 7
max_depth = 30
min_sample = 2
clf = RandomForestClassifier(n_estimators=trees, max_features=max_feat, max_depth=max_depth, min_samples_split=min_sample, random_state=0, n_jobs=-1)

# Train the classifier and calculate the training time
import time
start = time.time()
clf.fit(Xtrain, ytrain)
end = time.time()

# Lets Note down the model training time
print("Execution time for building the Tree is: %f" % (float(end) - float(start)))
pre = clf.predict(Xtest)

# Sample output reported by the article (it was pasted into the listing as
# if it were code):
#   Execution time for building the Tree is: 2.913641

# Evaluate the model performance for the test data
acc = getAccuracy(pre, ytest)

print("Accuracy of model before feature selection is %.2f" % (100 * acc))

ģ�͵ľ�ȷ��ǣ�

1. Accuracy of model before feature selection is 98.82

��ģ��ǻ��˷ǳ��õľ�ȷ�ȣ��Ϊ��ǽ��99%�Ĳ��ݷ��Ϊ��ȷ��ζ��15,000��ʵ��жԴ��14,823��ʵ��ȷ�ķ��ࡣ

��ԣ��ǣ��Ӧ�ý�һ��Ľ��𣿺ðɣ�Ϊʲô��أ��ܵĻ��һ��Ҫ��и��ĸĽ����ǽ�ʹ��Ҫ��ѡ��֪��Ľ��У��ʹ��ʶ��ѡ��ڵ㡣ѡ��ٵĲ��ֵ��Ϊ��еĽڵ㡣��ǿ��ʹ��Ƶı�׼��ѡ��ǿ��Ը��ʸ��ٵ��Ҫ�ȣ��ʹ��sklearn��feature_importances_��ʵ�֡��һ��ÿ��Ҫ��:

# Once we have trained the model we will rank all the features
# (the `for` header had been swallowed into the comment line above it)
for feature in zip(feat_labels, clf.feature_importances_):
    print(feature)

# Sample output reported by the article (only the first row survives in
# the listing; the remaining rows were lost):
#   ('id', 0.33346650420175183)

��㿴��ģ�ÿ��в�ͬ��Ҫ�ȣ��ȡ��Ԥ��Ĺ��ֵ��

��ǽ�ʹ��Щ��Ҫ��ǵ��;�ڽ��Ĳ��У��ǽ�ѡȡ��Ҫ�ȴ��0.01��ģ��ѵ��

# Keep the features that contribute most to the final prediction: build
# the selector around the trained forest with a 0.01 importance threshold
# and fit it on the training data in one chained call (fit() returns the
# selector itself).
sfm = SelectFromModel(clf, threshold=0.01).fit(Xtrain, ytrain)

����ǽ��ѡ��ת��ݼ��һ��У��ǻ�ת��ݼ��Ȼ��ǽ��ݼ��Ĵ�С��״:

# Transform input dataset
Xtrain_1 = sfm.transform(Xtrain)
Xtest_1 = sfm.transform(Xtest)

# Let's see the size and shape of new dataset
# The label originally read "before feature selection" although Xtrain_1
# is the reduced, post-selection matrix.
print("Size of Data set after feature selection: %.2f MB" % (Xtrain_1.nbytes / 1e6))
shape = np.shape(Xtrain_1)
print("Shape of the dataset ", shape)

# Sample output reported by the article (size label corrected to match the
# print statement above):
#   Size of Data set after feature selection: 5.60 MB
#   Shape of the dataset (35000, 20)

��ݼ��״��𣿾��ѡ��ֻʣ��20��ʹ��ݿ�Ĵ�С��26MB��ٵ��5.60 MB��ԭ��ݼ��80%��ҡ�

��һ��У��ǽ�ʹ��ǰ��ͬ�ĳ��ѵ��һ��µ��ɭ�ַ��ڲ��Լ��Ͻ��˲��ԡ��޸�ѵ��õ��ľ�ȷ��Ƕ��٣�

# Model training time: refit the same classifier on the reduced feature set
start = time.time()
clf.fit(Xtrain_1, ytrain)
end = time.time()
print("Execution time for building the Tree is: %f" % (float(end) - float(start)))

# Let's evaluate the model on test data
# (the unused `count = 0` from the listing is dropped; getAccuracy keeps
# its own counter)
pre = clf.predict(Xtest_1)
acc2 = getAccuracy(pre, ytest)
print("Accuracy after feature selection %.2f" % (100 * acc2))

# Sample output reported by the article:
#   Execution time for building the Tree is: 1.711518
#   Accuracy after feature selection 99.97

��ʹ��޸ĺ��ݼ��ǻ��99.97%��׼ȷ�ʣ��ζ��ǰ�14,996��ʵ��ֵ��ȷ��𣬶�֮ǰ��ֻ��ȷ�ط��14,823��ʵ��

��ѡ��ȡ�õľ޴��ǿ��Խ��еĽ��ܽ��±��

�ϱ��ʾ��ѡ��ʵ��ơ��Կ��ؼ��ģ�͵ĸ��Ժ��ݼ��ά�ȡ��ڼ�Сά�Ⱥ��Ҫ��ٵ�ѵ��ʱ�䣬��ǿ˷��˹��ϵ��⣬��˱��ǰ��ߵľ�ȷ�ȡ�

��ǹ�̽��˻��ѧϰ��ѡ��4�ַ��

��㷢��ƪ��º��ã��Ķ��ɻ��ѧϰ��һ�飬�˽��ڵ��ӷ��ĸ��Ϣ��

原文标题:

4 ways to implement feature selection in Python for machine learning

https://hub.packtpub.com/4-ways-implement-feature-selection-python-machine-learning/

�༭��ݼ

У�ԣ��

��߼��

����й��ۿƼ��ѧ��ѧ˶ʿ��ҵ��α��ʦ��ѧ�й��۽��ѧ��ѧԺ ��ݿ�ѧϵ��̡�ϲ��ݿ�ѧ��ϲ��Ķ��ϲ��о��ֹ��ϣ��һֱ��ѧϰ��״̬�Ͷ��Ȱ��ÿ�춼��ֶ��н��~

��: 2019-04-122019-04-12 19:02:43
ԭ��https://kuaibao.qq.com/s/20190412A0KLYV00?refer=cp_1026
��Ѷ��Ѷ�ƿ��Ѷ��ݿ��ƽ̨�ʺţ��ţ��֮һ��Ѷ��ݿ��ƽ̨��Э�顷ת�ط��ݡ�
��Ȩ��ϵ cloudcommunity@tencent.com ɾ��

��Ѷ

ɨ��

��վ�� Ⱥ

��ȡר�� 10Ԫ��ż�ȯ

˽�� ��ɻ�

ɨ��뿪��Ⱥ