Python sklearn.feature_selection.RFE Examples
The following are 22 code examples of sklearn.feature_selection.RFE(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_selection, or try the search function.
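Before the project-specific examples, here is a minimal, self-contained sketch of the call pattern most of them share. The synthetic data, the LinearSVC estimator, and the choice of five features are illustrative assumptions, not taken from any of the projects below.

# Minimal RFE sketch on synthetic data (illustrative values only).
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

# 20 features, of which only a handful are informative.
X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)

# RFE wraps an estimator that exposes coef_ or feature_importances_ and
# recursively drops the weakest features until n_features_to_select remain.
selector = RFE(estimator=LinearSVC(dual=False), n_features_to_select=5, step=1)
selector.fit(X, y)

print(selector.support_)              # boolean mask of the selected features
print(selector.ranking_)              # rank 1 = selected; higher ranks were eliminated earlier
print(selector.transform(X).shape)    # (200, 5)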
Example #1
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 7 votes |
def ReducedFeaturesDF(X, y):
    '''
    Returns a dataframe with only a subset of features/columns retained
    '''
    from sklearn.feature_selection import RFE
    est = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto')
    # selectK = SelectKBest(score_func = f_classif, k=45)
    selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
    selectK = selectRFE

    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = feature_names[selectK_mask]
    print("reduced RFE features:")
    print(K_featnames)
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    # Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df

# ReducedFeaturesDF(X,y)
# z=pd.DataFrame(data=X_SGD,index=y)
# z.to_csv('REDUCED_Feat.csv')
Example #2
Source File: train.py From 3D-Human-Body-Shape with MIT License | 6 votes |
def rfe_multiprocess(i, dets, deform, body_num, x, measure, k_features):
    sys.stdout.write('>> calc rfe map NO.%d\n' % (i))
    y = np.array(dets).reshape(body_num, 1)
    model = LinearRegression()

    # recursive feature elimination
    rfe = RFE(model, k_features)
    rfe.fit(x, y.ravel())
    # mask.append(rfe.support_)
    flag = np.array(rfe.support_).reshape(utils.M_NUM, 1)
    flag = flag.repeat(body_num, axis=1)

    # calculate linear mapping mat
    S = np.array(deform)
    S.shape = (S.size, 1)
    m = np.array(measure[flag])
    m.shape = (k_features, body_num)
    M = build_equation(m, 9)
    MtM = M.transpose().dot(M)
    MtS = M.transpose().dot(S)
    ans = np.array(scipy.sparse.linalg.spsolve(MtM, MtS))
    ans.shape = (9, k_features)
    return [ans, rfe.support_]
Example #3
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 6 votes |
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('RFE: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = RFE(self.__classifier, self.GetSelectedFeatureNumber(), step=0.05)
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    self._rank = fs.ranking_

    return feature_index.tolist()
Example #4
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 6 votes |
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Gets best features using chosen method
    (K-best, RFE, RFECV, 'L1' (RandomizedLogisticRegression), 'Tree' (ExtraTreesClassifier), mrmr),
    then prints top K features' names (from featNames).
    If reduceMatrix = True, then also returns X reduced to the K best features.

    Available methods' names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression',
    'K-best', 'ExtraTreesClassifier'. Note that effectively, any scikit-learn method
    could be used, if correctly imported.
    '''
    # est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test;
    more advanced options can be implemented).
    Saves the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv"
    '''
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names = lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:', X.shape)
    print("K_featnames: %s" % (K_featnames))
    if reduceMatrix == True:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

# WORKS! But unreadable with too many features!
Example #5
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.feature_selection.GenericUnivariateSelect,
                  fs.GenericUnivariateSelect)
    self.assertIs(df.feature_selection.SelectPercentile,
                  fs.SelectPercentile)
    self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
    self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
    self.assertIs(df.feature_selection.SelectFromModel, fs.SelectFromModel)
    self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
    self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
    self.assertIs(df.feature_selection.RFE, fs.RFE)
    self.assertIs(df.feature_selection.RFECV, fs.RFECV)
    self.assertIs(df.feature_selection.VarianceThreshold,
                  fs.VarianceThreshold)
Example #6
Source File: feature_selection.py From default-credit-card-prediction with MIT License | 6 votes |
def rfe_selection(X, y, n_features):
    """
    Performs the Recursive Feature Elimination method and selects the top ranking features

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    n_features -- n best ranked features
    """
    if verbose:
        print '\nPerforming Feature Selection based on the Recursive Feature Elimination method ...'

    clf = RandomForestClassifierWithCoef(n_estimators=10, n_jobs=-1)
    fs = RFE(clf, n_features, step=1)
    fs = fs.fit(X, y)
    ranks = fs.ranking_

    feature_indexes = []
    for i in xrange(len(ranks)):
        if ranks[i] == 1:
            feature_indexes += [i]

    return X[:, feature_indexes[0:n_features]], feature_indexes[0:n_features]  # return selected features and original index features
Example #7
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0 | 5 votes |
def compute_ranks(self):
    if self._algorithm == FeatureSelector.SELECT_K_BEST:
        scores = self._selector.scores_
        sorted_scores = sorted(scores, reverse=True)
        ranks = [sorted_scores.index(i) + 1 for i in scores]
    elif self._algorithm == FeatureSelector.SELECT_PERCENTILE:
        scores = self._selector.scores_
        sorted_scores = sorted(scores, reverse=True)
        ranks = [sorted_scores.index(i) + 1 for i in scores]
    elif self._algorithm == FeatureSelector.RECURSIVE_ELIMINATION:
        n_selected = self._selector.n_features_
        support = self._selector.support_
        ranking = self._selector.ranking_
        # RFE and RFECV do not provide feature scores. Instead, they
        # provide a list of features which have been selected (support)
        # and an ascending list indicating when each other feature was
        # eliminated. Use these two to construct feature ranks, though
        # acknowledge that RFE and RFECV do not actually distinguish between
        # the weights of selected features.
        ranks = [0] * len(support)
        selected_count = 0
        for i in range(len(ranking)):
            if support[i]:
                # All selected features in ranking receive rank 1, so need
                # to iterate through list and add incrementing values so
                # that features ranked 1, 1, 1 become 1, 2, 3.
                ranks[i] = ranking[i] + selected_count
                selected_count += 1
            else:
                # Even if there are 5 selected features, the 6th feature
                # in ranking is given rank 2, so add (n_selected - 1).
                ranks[i] = ranking[i] + (n_selected - 1)

    return ranks
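As a worked illustration of that rank reconstruction, with hypothetical support and ranking values rather than anything taken from the CDSS project:

# Hypothetical RFE outputs: 3 of 5 features selected (not from CDSS).
support = [True, True, False, True, False]
ranking = [1, 1, 3, 1, 2]          # RFE gives every selected feature rank 1
n_selected = 3

ranks = [0] * len(support)
selected_count = 0
for i in range(len(ranking)):
    if support[i]:
        ranks[i] = ranking[i] + selected_count    # 1, 1, 1 becomes 1, 2, 3
        selected_count += 1
    else:
        ranks[i] = ranking[i] + (n_selected - 1)  # eliminated features follow the selected ones

print(ranks)  # [1, 2, 5, 3, 4]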
Example #8
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 5 votes |
def plot_RFE(X, y):
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    from sklearn.datasets import make_classification
    from sklearn.metrics import zero_one_loss
    import pylab as pl
    import matplotlib.pylab as pl

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear", class_weight="auto", cache_size=1200, shrinking=True)
    svc = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto', multi_class='ovr')
    # SGD = SGDClassifier(penalty='elasticnet', class_weight='auto', n_jobs=-1, n_iter=10, l1_ratio=0.15)
    ## rfecv = RFECV(estimator=svc, step=0.1, cv=StratifiedKFold(y, 5), scoring='roc_auc')
    rfecv = RFECV(estimator=svc, step=0.2, cv=StratifiedKFold(y, 2), scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)
    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (nb of misclassifications)")
    pl.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    pl.show()
    print('RFE Opt.shapes features CV score:')
    CV_multi_stats(X_RFE, y, svc)
    return (X_RFE, rfecv)
Example #9
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 5 votes |
def plotRFECV(X, y, stepSize=0.05, scoring='f1'):
    '''
    Plot recursive feature elimination example with automatic tuning of the
    number of features selected with cross-validation.
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear")
    svc = SVC(kernel="linear", class_weight='auto', cache_size=1400)

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2), scoring=scoring)
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
Example #10
Source File: wine.py From UCI-Data-Analysis with Apache License 2.0 | 5 votes |
def recursiveFeatureSelector(classifier_model, train_data, train_labels, test_data, number_of_features):
    rfe = RFE(classifier_model, number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data

# Defines the recursive feature selector for choosing the best feature using Cross Validation
Example #11
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0 | 5 votes |
def _eliminate_recursively(self, k=None):
    if self._problem == FeatureSelector.CLASSIFICATION:
        estimator = RandomForestClassifier(random_state=self._random_state)
    else:
        estimator = LassoCV(random_state=self._random_state)

    # If k is not specified, then use RFECV to automatically decide on
    # optimal number of features. If specified, then use RFE.
    if k is None:
        self._selector = RFECV(estimator)
    else:
        self._selector = RFE(estimator, n_features_to_select=k, step=0.05)
Example #12
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 5 votes |
def __init__(self, selected_feature_number=1, classifier=SVC(kernel='linear')):
    super(FeatureSelectByRFE, self).__init__(name='RFE',
                                             selected_feature_number=selected_feature_number)
    self.__classifier = classifier
    self._rank = None
    self._selected_features = []
Example #13
Source File: classifiers.py From Sarcasm-Detection with MIT License | 5 votes |
def feature_selection(x_train, y_train, x_test, y_test):
    print("Feature selection with LinearSVC")
    model = LinearSVC(C=0.1, penalty='l2')
    rfe = RFE(model, 5)
    best_features_model = rfe.fit(x_train, y_train)
    y_hat = best_features_model.predict(x_test)
    utils.print_statistics(y_test, y_hat)
Example #14
Source File: rfe.py From lale with Apache License 2.0 | 5 votes |
def __init__(self, estimator, n_features_to_select=None, step=1, verbose=0):
    self._hyperparams = {
        'estimator': estimator,
        'n_features_to_select': n_features_to_select,
        'step': step,
        'verbose': verbose}
    self._wrapped_model = SKLModel(**self._hyperparams)
Example #15
Source File: ABIDEParser.py From population-gcn with GNU General Public License v3.0 | 5 votes |
def feature_selection(matrix, labels, train_ind, fnum):
    """
        matrix    : feature matrix (num_subjects x num_features)
        labels    : ground truth labels (num_subjects x 1)
        train_ind : indices of the training samples
        fnum      : size of the feature vector after feature selection

    return:
        x_data    : feature matrix of lower dimension (num_subjects x fnum)
    """
    estimator = RidgeClassifier()
    selector = RFE(estimator, fnum, step=100, verbose=1)

    featureX = matrix[train_ind, :]
    featureY = labels[train_ind]
    selector = selector.fit(featureX, featureY.ravel())
    x_data = selector.transform(matrix)

    print("Number of labeled samples %d" % len(train_ind))
    print("Number of features selected %d" % x_data.shape[1])

    return x_data

# Make sure each site is represented in the training set when selecting a subset of the training set
Example #16
Source File: modeling.py From kddcup2015 with GNU General Public License v2.0 | 5 votes |
def lr_with_fs():
    """
    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print(auc_score(clf, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('lr', clf)]), 'lr_with_fs_0620_02')
Example #17
Source File: model_recommendation.py From DIVE-backend with GNU General Public License v3.0 | 5 votes |
def recursive_feature_elimination(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms,
                         table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    estimator = SVR(kernel='linear')
    selector = RFE(estimator, 5, step=1)
    selector = selector.fit(X, y)

    logger.info(selector.support_)
    logger.info(selector.ranking_)
    return
Example #18
Source File: model_recommendation.py From DIVE-backend with GNU General Public License v3.0 | 5 votes |
def get_initial_regression_model_recommendation(project_id, dataset_id, dependent_variable_id=None,
                                                recommendation_type=MRT.LASSO.value,
                                                table_layout=MCT.LEAVE_ONE_OUT.value,
                                                data_size_cutoff=current_app.config['ANALYSIS_DATA_SIZE_CUTOFF'],
                                                categorical_value_limit=current_app.config['ANALYSIS_CATEGORICAL_VALUE_LIMIT']):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    if len(df) > data_size_cutoff:
        df = df.sample(data_size_cutoff)

    field_properties = db_access.get_field_properties(project_id, dataset_id)
    quantitative_field_properties = [fp for fp in field_properties if fp['general_type'] == 'q']

    dependent_variable = next((f for f in field_properties if f['id'] == dependent_variable_id), None) \
        if dependent_variable_id \
        else np.random.choice(quantitative_field_properties, size=1)[0]

    independent_variables = []
    for fp in field_properties:
        if (fp['name'] != dependent_variable['name']):
            if (fp['general_type'] == 'c' and (fp['is_unique'] or len(fp['unique_values']) > categorical_value_limit)):
                continue
            independent_variables.append(fp)

    recommendationTypeToFunction = {
        MRT.FORWARD_R2.value: forward_r2,
        MRT.LASSO.value: lasso,
        MRT.RFE.value: recursive_feature_elimination,
        MRT.FORWARD_F.value: f_regression
    }

    result = recommendationTypeToFunction[recommendation_type](df, dependent_variable, independent_variables)

    return {
        'recommended': True,
        'table_layout': table_layout,
        'recommendation_type': recommendation_type,
        'dependent_variable_id': dependent_variable['id'],
        'independent_variables_ids': [x['id'] for x in result],
    }
Example #19
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 5 votes |
def GetDescription(self):
    text = "Before building the model, we used recursive feature elimination (RFE) to select features. " \
           "The goal of RFE is to select features based on a classifier by recursively considering " \
           "smaller sets of features."
    return text
Example #20
Source File: c10.py From abu with GNU General Public License v3.0 | 4 votes |
def sample_1033_2():
    """
    10.3.3 Ranking feature importance and grading feature support
    :return:
    """
    global g_with_date_week_noise
    g_with_date_week_noise = True

    train_x, train_y_regress, train_y_classification, pig_three_feature, \
        test_x, test_y_regress, test_y_classification, kl_another_word_feature_test = sample_1031_1()

    # noinspection PyShadowingNames
    def importances_coef_pd(estimator):
        """
        Feature importance
        """
        if hasattr(estimator, 'feature_importances_'):
            # Estimators with feature_importances_ are sorted via sort_values
            return pd.DataFrame(
                {'feature': list(pig_three_feature.columns[1:]),
                 'importance': estimator.feature_importances_}).sort_values('importance')
        elif hasattr(estimator, 'coef_'):
            # Estimators with coef_ are sorted by coef
            return pd.DataFrame(
                {"columns": list(pig_three_feature.columns)[1:],
                 "coef": list(estimator.coef_.T)}).sort_values('coef')
        else:
            print('estimator not hasattr feature_importances_ or coef_!')

    # Use a random forest classifier
    from sklearn.ensemble import RandomForestClassifier
    estimator = RandomForestClassifier(n_estimators=100)
    # Fit the model on the training data
    estimator.fit(train_x, train_y_classification)
    # Judge the importance of the trained model's features, ordered from least to most important, as shown in Table 10-4
    print('importances_coef_pd(estimator):\n', importances_coef_pd(estimator))

    from sklearn.feature_selection import RFE

    # noinspection PyShadowingNames
    def feature_selection(estimator, x, y):
        """
        Feature support grading
        """
        selector = RFE(estimator)
        selector.fit(x, y)
        print('RFE selection')
        print(pd.DataFrame(
            {'support': selector.support_, 'ranking': selector.ranking_},
            index=pig_three_feature.columns[1:]))

    print('feature_selection(estimator, train_x, train_y_classification):\n',
          feature_selection(estimator, train_x, train_y_classification))
Example #21
Source File: modeling.py From kddcup2015 with GNU General Public License v2.0 | 4 votes |
def sgd():
    """
    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    sgd = SGDClassifier(n_iter=50, n_jobs=-1)

    params = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
                 'squared_loss', 'huber', 'epsilon_insensitive',
                 'squared_epsilon_insensitive']
    }

    grid = GridSearchCV(sgd, param_grid=params, cv=StratifiedKFold(y, 5),
                        scoring='roc_auc', n_jobs=-1)
    grid.fit(X_new, y)

    logger.debug('Best score (E_val): %f', grid.best_score_)

    sgd = grid.best_estimator_

    logger.debug('E_in: %f', auc_score(sgd, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('sgd', sgd)]), 'sgd_0620_03')
Example #22
Source File: genericmodelclass.py From easyML with BSD 3-Clause "New" or "Revised" License | 4 votes |
def recursive_feature_elimination(self, nfeat=None, step=1, inplace=False):
    """A method to implement recursive feature elimination on the model.
    Note that CV is not performed in this function. The method will
    continue to eliminate some features (specified by the step parameter)
    at each step until the specified number of features is reached.

    Parameters
    __________
    nfeat : int or None, default=None
        The number of top features to select. If None, half of the features
        are selected.

    step : int or float, default=1
        If int, then step corresponds to the number of features to remove
        at each iteration. If float and within (0.0, 1.0), then step
        corresponds to the percentage (rounded down) of features to remove
        at each iteration. If float and greater than one, then the integral
        part will be considered as an integer input.

    inplace : bool, default=False
        If True, the predictors of the class are modified to those
        selected by the RFE procedure.

    Returns
    _______
    selected : A series object containing the selected features as
        index and their rank in selection as values
    """
    rfe = RFE(self.alg, n_features_to_select=nfeat, step=step)
    rfe.fit(
        self.datablock.train[self.predictors],
        self.datablock.train[self.datablock.target]
    )
    ranks = pd.Series(rfe.ranking_, index=self.predictors)
    selected = ranks.loc[rfe.support_]
    if inplace:
        self.set_predictors(selected.index.tolist())
    return selected
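The step semantics in the docstring mirror scikit-learn's own RFE parameter: an integer removes that many features per iteration, while a float in (0.0, 1.0) removes int(step * n_features) features per iteration. A minimal sketch of the two forms, assuming synthetic regression data and a linear SVR (neither comes from the easyML project):

# Illustrative comparison of integer vs. fractional step (assumed setup, not from easyML).
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X, y = make_regression(n_samples=120, n_features=40, n_informative=8, random_state=0)

# step=5 removes exactly five features per elimination round.
rfe_int = RFE(SVR(kernel='linear'), n_features_to_select=10, step=5).fit(X, y)

# step=0.25 removes int(0.25 * 40) = 10 features per round.
rfe_frac = RFE(SVR(kernel='linear'), n_features_to_select=10, step=0.25).fit(X, y)

print(rfe_int.n_features_, rfe_frac.n_features_)  # both stop at 10 selected features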