Python sklearn.feature_selection.SelectPercentile() Examples
The following are 17 code examples of sklearn.feature_selection.SelectPercentile().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.feature_selection, or try the search function.
Example #1
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0 | 6 votes |
def test_export_pipeline():
    """Assert that export_pipeline() generates the expected source code for a fixed pipeline.

    The pipeline combines a DecisionTreeClassifier and a SelectPercentile
    (via CombineDFs) and feeds the result into a KNeighborsClassifier.
    """
    # String form of the individual, parsed below against the TPOT primitive set.
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )

    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy of the file -- confirm the
    # exact layout against what export_pipeline() actually emits.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)),
        SelectPercentile(score_func=f_classif, percentile=20)
    ),
    KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    # The exported text must match the expected source exactly.
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
Example #2
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 6 votes |
def find_best_feature_selections(X,y):
    """Try several univariate feature-selection settings and evaluate each one.

    Every selector is fitted on ``(X, y)`` and the reduced matrices are handed,
    in order, to ``train_and_test``. The shape of the first reduced matrix is
    printed before any evaluation starts.
    """
    # select the best features using different techniques
    # NOTE(review): the original carried the remark "this one has the best
    # performance" next to the two f_classif entries at k=80 / percentile=20;
    # it is unclear which of the two it referred to.
    candidates = [
        SelectKBest(chi2, k=80),
        SelectPercentile(chi2, percentile=20),
        SelectKBest(f_classif, k=80),
        SelectPercentile(f_classif, percentile=20),
        SelectKBest(f_classif, k=70),
        SelectKBest(f_classif, k=60),
    ]
    reduced_matrices = [selector.fit_transform(X,y) for selector in candidates]

    print (reduced_matrices[0].shape)

    #selection_parameters_for_classfier(X_new,y)
    #print (y.shape)
    for reduced in reduced_matrices:
        train_and_test(reduced,y)

#X,y = _dataset_sample()

################################PARAMETER Selected################################
#TODO some problem happens when using the parameter max_leaf_nodes in Dtree and RandomForest
Example #3
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    """Check that the ``feature_selection`` accessor returns the very same objects as ``fs``."""
    df = pdml.ModelFrame([])
    # Attribute names that must resolve to the identical object on both sides.
    mapped_names = (
        'GenericUnivariateSelect',
        'SelectPercentile',
        'SelectKBest',
        'SelectFpr',
        'SelectFromModel',
        'SelectFdr',
        'SelectFwe',
        'RFE',
        'RFECV',
        'VarianceThreshold',
    )
    for attr_name in mapped_names:
        self.assertIs(getattr(df.feature_selection, attr_name), getattr(fs, attr_name))
Example #4
Source File: text_models.py From mindmeld with Apache License 2.0 | 6 votes |
def _get_feature_selector(self):
    """Build the feature selector named by the ``feature_selector`` model setting.

    Returns:
        (Object): the selector registered under the configured key ("l1" or
            "f"), or None when there are no model settings or the key is
            missing/unknown.
    """
    settings = self.config.model_settings
    selector_type = settings.get("feature_selector") if settings is not None else None

    # Both candidates are instantiated up front; the lookup picks one (or None).
    available_selectors = {
        "l1": SelectFromModel(LogisticRegression(penalty="l1", C=1)),
        "f": SelectPercentile(),
    }
    return available_selectors.get(selector_type)
Example #5
Source File: feature_selection.py From dataiku-contrib with Apache License 2.0 | 5 votes |
def univariate_feature_selection(mode,predictors,target):
    """Return the per-feature p-values of a univariate statistical test.

    Args:
        mode: name of the scoring function; one of ``'f_regression'``,
            ``'f_classif'`` or ``'chi2'``.
        predictors: feature matrix passed to the selector's ``fit``.
        target: target values passed to the selector's ``fit``.

    Returns:
        The ``pvalues_`` attribute of the fitted ``SelectPercentile``.

    Raises:
        ValueError: if ``mode`` is not one of the supported names (previously
            this surfaced as an UnboundLocalError on ``fselect``).
    """
    supported_modes = ('f_regression', 'f_classif', 'chi2')
    if mode not in supported_modes:
        raise ValueError(
            "Unknown mode {!r}; expected one of {}".format(mode, ", ".join(supported_modes))
        )

    score_funcs = {
        'f_regression': f_regression,
        'f_classif': f_classif,
        'chi2': chi2,
    }
    # ``percentile`` must be passed by keyword: recent scikit-learn releases
    # reject it as a positional argument. 100 keeps every feature; only the
    # p-values are of interest here.
    fselect = SelectPercentile(score_funcs[mode], percentile=100)
    # Fitting is enough to populate ``pvalues_``; the transformed matrix was
    # never used.
    fselect.fit(predictors, target)
    return fselect.pvalues_
Example #6
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0 | 5 votes |
def test_export_pipeline_3():
    """Assert that export_pipeline() generates the expected source code for a fixed simple pipeline with a preprocessor.

    The pipeline is a SelectPercentile step feeding a DecisionTreeClassifier.
    """
    # String form of the individual, parsed below against the TPOT primitive set.
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy of the file -- confirm the
    # exact layout against what export_pipeline() actually emits.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    # The exported text must match the expected source exactly.
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
Example #7
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0 | 5 votes |
def test_export_pipeline_6():
    """Assert that export_pipeline() honours the random_state and data_file_path arguments.

    With ``random_state=42`` and ``data_file_path='test_path'`` the expected
    source reads the data from 'test_path', seeds train_test_split with 42 and
    calls set_param_recursive on the pipeline steps.
    """
    # String form of the individual, parsed below against the TPOT primitive set.
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy of the file -- confirm the
    # exact layout against what export_pipeline() actually emits.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    exported_code = export_pipeline(pipeline, tpot_obj.operators,
                                    tpot_obj._pset, random_state=42,
                                    data_file_path='test_path')

    # The exported text must match the expected source exactly.
    assert expected_code == exported_code
Example #8
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0 | 5 votes |
def test_operator_export():
    """Assert that a TPOT operator with a callable parameter exports to the expected call string."""
    expected_export = "SelectPercentile(score_func=f_classif, percentile=5)"

    # The operator's declared argument types must match the reference list.
    assert list(TPOTSelectPercentile.arg_types) == TPOTSelectPercentile_args
    # Exporting with percentile 5 must render the score function by name.
    assert TPOTSelectPercentile.export(5) == expected_export
Example #9
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0 | 5 votes |
def test_get_by_name():
    """Assert that get_by_name() resolves operator names against the TPOT operator list."""
    # NOTE(review): the comparison is made on ``__class__`` of both operands,
    # not on the operators themselves -- confirm that this is the intended
    # strength of the check.
    expectations = (
        ("SelectPercentile", TPOTSelectPercentile),
        ("SelectFromModel", TPOTSelectFromModel),
    )
    for operator_name, reference_operator in expectations:
        resolved = get_by_name(operator_name, tpot_obj.operators)
        assert resolved.__class__ == reference_operator.__class__
Example #10
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0 | 5 votes |
def test_pipeline_score_save():
    """Assert that the TPOTClassifier can generate a scored pipeline export correctly.

    When ``pipeline_score`` is supplied, the expected source contains an
    "Average CV score on the training set was" comment with that value.
    """
    # A fresh classifier local to this test; its progress bar is disabled.
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    # String form of the individual, parsed below against the TPOT primitive set.
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    # NOTE(review): the line breaks and indentation inside this literal were
    # reconstructed from a whitespace-collapsed copy of the file -- confirm the
    # exact layout against what export_pipeline() actually emits.
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    # The exported text, including the score comment, must match exactly.
    assert_equal(expected_code, export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, pipeline_score=0.929813743))
Example #11
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0 | 5 votes |
def _select_percentile(self, percentile):
    """Create the percentile-based univariate selector and store it on ``self._selector``.

    The scoring function is ``f_classif`` when ``self._problem`` equals
    ``FeatureSelector.CLASSIFICATION`` and ``f_regression`` otherwise.

    Args:
        percentile: percentage of features to keep, forwarded to
            ``SelectPercentile``.
    """
    # Original author's note: the algorithm is conservative. It defaults to
    # keeping features if the percentile corresponds to a fractional number
    # of features. For example, percentile=18 on a 20-feature matrix implies
    # keeping 3.6 features; in that case it keeps 4 features.
    if self._problem == FeatureSelector.CLASSIFICATION:
        score = f_classif
    else:
        score = f_regression

    # ``percentile`` is passed by keyword: recent scikit-learn releases no
    # longer accept it positionally.
    self._selector = SelectPercentile(score, percentile=percentile)
Example #12
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 5 votes |
def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Plot cross-validated performance of an estimator (default: SVM-Anova)
    while varying the percentile of features kept by an F-test selector.

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See Also: (Similar but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb
    '''
    anova_selector = SelectPercentile(f_classif)
    pipeline = Pipeline([('anova', anova_selector), ('est', est)])

    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 99)
    # percentiles = (1,5,10,25,50,75,90)

    # Run one cross-validation per percentile and keep the raw fold scores.
    scores_per_percentile = []
    for pct in percentiles:
        pipeline.set_params(anova__percentile=pct)
        splitter = StratifiedShuffleSplit(y, n_iter=7, test_size=0.3)
        fold_scores = cross_val_score(pipeline, X, y, cv=splitter, n_jobs=-1)
        scores_per_percentile.append(fold_scores)

    # Summarise each run by its mean and standard deviation.
    score_means = [fold_scores.mean() for fold_scores in scores_per_percentile]
    score_stds = [fold_scores.std() for fold_scores in scores_per_percentile]

    print("Outputting Graph:")

    # Plot the cross-validation score as a function of percentile of features
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title(
        'Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
Example #13
Source File: memm.py From mindmeld with Apache License 2.0 | 5 votes |
def _get_feature_selector(selector_type):
    """Look up a feature selector instance by its short type name.

    Args:
        selector_type: "l1" for a SelectFromModel wrapping an L1-penalised
            LogisticRegression, "f" for a default SelectPercentile.

    Returns:
        (Object): the matching selector instance, or None when
            ``selector_type`` is not one of the known keys.
    """
    l1_selector = SelectFromModel(LogisticRegression(penalty="l1", C=1))
    percentile_selector = SelectPercentile()
    selectors_by_type = {"l1": l1_selector, "f": percentile_selector}
    return selectors_by_type.get(selector_type)
Example #14
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 5 votes |
def plot_BestKFeatures(X_train, y_train):
    '''
    http://nbviewer.ipython.org/github/gmonce/scikit-learn-book/blob/master/Chapter%204%20-%20Advanced%20Features%20-%20Feature%20Engineering%20and%20Selection.ipynb

    Find the best percentile of features to use, using cross-validation on
    the training set.

    For each percentile in range(1, 95, 5) the features are filtered with an
    F-test (f_classif) SelectPercentile, a random forest is scored with
    4-fold cross-validation, and the mean score is recorded. The best
    percentile and the mean scores are printed, and the score curve is drawn
    on a new figure (the figure is not shown or saved here).

    Returns None.
    '''
    from sklearn import feature_selection
    try:
        from sklearn.model_selection import cross_val_score
    except ImportError:
        # Older scikit-learn releases only ship the legacy module.
        from sklearn.cross_validation import cross_val_score
    import matplotlib.pylab as pl

    dt = RandomForestClassifier(n_jobs=2, bootstrap=True, n_estimators=250, criterion='gini')
    dt = dt.fit(X_train, y_train)

    percentiles = range(1, 95, 5)
    mean_scores = []
    for i in percentiles:
        fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=i)
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_val_score(dt, X_train_fs, y_train, cv=4)
        mean_scores.append(scores.mean())
    results = np.array(mean_scores)

    # argmax yields one plain integer position (first maximum on ties); the
    # previous np.where(...) array could not be used to index ``percentiles``.
    optimal_percentil = int(np.argmax(results))
    print (("Optimal number of features:{0}".format(percentiles[optimal_percentil])), "\n")

    # Plot percentile of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation accuracy)")
    pl.plot(percentiles,results)
    print ("Mean scores:",results)
Example #15
Source File: email_preprocess.py From machine-learning with GNU General Public License v3.0 | 4 votes |
def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl", percentile=10):
    """
        Load pickled email texts and author labels and turn them into
        feature matrices ready for sklearn.

        Steps performed:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps the top ``percentile`` percent of features
               according to f_classif

        Args:
            words_file: path to the pickled list of email texts.
            authors_file: path to the pickled list of author labels.
            percentile: percentage of features kept by SelectPercentile.

        Returns:
            (features_train, features_test, labels_train, labels_test), where
            the two feature items are the dense arrays produced by
            ``.toarray()`` on the selected tf-idf matrices.
    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    # NOTE: unpickling can execute arbitrary code; only point these paths at
    # trusted files.
    # Pickle data is binary, so the files are opened in "rb" mode, and the
    # context managers close them even if loading fails.
    with open(authors_file, "rb") as authors_file_handler:
        authors = pickle.load(authors_file_handler)

    with open(words_file, "rb") as words_file_handler:
        word_data = cPickle.load(words_file_handler)

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)

    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed = vectorizer.transform(features_test)

    ### feature selection, because text is super high dimensional and
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed = selector.transform(features_test_transformed).toarray()

    ### info on the data
    # Formatted into a single string so the output is the same whether print
    # is a statement or a function.
    chris_count = sum(labels_train)
    print("no. of Chris training emails: {0}".format(chris_count))
    print("no. of Sara training emails: {0}".format(len(labels_train) - chris_count))

    return features_train_transformed, features_test_transformed, labels_train, labels_test
Example #16
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 4 votes |
def call_GridParamSearch_featfilt(X, y) :
    '''
    (Currently just a cut & paste from "main".)
    Filters the feature matrix in several ways, printing the resulting
    shapes, then calls GridParamSearch on the SVM-filtered matrix.
    Used to try different ML models and then get their optimal parameters.
    '''
    print("SPARSE (L1) EXT gridparam scores:")
    # clf = Pipeline([
    #   ('feature_selection', LinearSVC(penalty="l1", loss='l1',dual=False, class_weight='auto')),
    #   ('classification', ExtraTreesClassifier(n_jobs=3)
    # )])

    # Sparse; L1 penalized features selection prior to RF fitting/prediction
    sparse_svm = LinearSVC(penalty="l1", loss='l2', dual=False, class_weight='auto')
    clf_logit = LogisticRegression(penalty="l1", dual=False, class_weight='auto')

    # http://scikit-learn.org/0.13/auto_examples/plot_feature_selection.html
    print('Original features matrix:')
    print(X.shape)

    # Univariate feature selection keeping the 20% best-scoring features.
    # selector = SelectPercentile(f_classif, percentile=20)
    percentile_filter = SelectPercentile(chi2, percentile=20)
    X_filtered = percentile_filter.fit_transform(X, y)

    print(
        'New (2 f_classif) Using statistical feature selection: features matrix is:')
    print(X_filtered.shape)

    # lda = LDA(n_components=10)
    # X_lda = lda.fit_transform(X, y)
    # print('New LDA filtered features matrix:')
    # print(X_lda.shape)

    # Get sparse feature selections from the L1 SVM.
    X_sparse = sparse_svm.fit_transform(X, y)
    # print(clf.feature_importances_ )
    print('New sparse (SVM filtered) features matrix:')
    print(X_sparse.shape)

    print("Res of SVM fitting of (F scores filtered =2) for more feature selection:")
    X_double_filtered = sparse_svm.fit_transform(X_filtered, y)
    print(X_double_filtered.shape)

    print("param search on sparse features matrix")
    GridParamSearch(param_dist=Tree_param_dist, clf=clf_EXT, X=X_sparse, y=y)
Example #17
Source File: feature_selection.py From ecg-classification with GNU General Public License v3.0 | 4 votes |
def run_feature_selection(features, labels, feature_selection, best_features):
    """Reduce ``features`` with the requested feature-selection method.

    Args:
        features: 2-D array indexed as ``features[:, column_indices]``.
        labels: target values passed to the selector's ``fit``.
        feature_selection: method name, one of ``'select_K_Best'``,
            ``'slct_percentile'`` or ``'LassoCV'``.
        best_features: number of top-scoring columns to keep for the two
            univariate methods (not used by ``'LassoCV'``).

    Returns:
        A tuple ``(features_selected, features_index_sorted)``.
        For the univariate methods, ``features_index_sorted`` holds all column
        indices ordered by descending f_classif score and
        ``features_selected`` holds the first ``best_features`` of those
        columns. For ``'LassoCV'``, ``features_selected`` is the matrix
        returned by ``SelectFromModel.transform`` and
        ``features_index_sorted`` holds the indices of the retained columns.

    Raises:
        ValueError: if ``feature_selection`` is not a supported method name
            (previously this surfaced as an UnboundLocalError).
    """
    univariate_methods = ('select_K_Best', 'slct_percentile')

    if feature_selection in univariate_methods:
        # Univariate feature selection: rank the features with a univariate
        # statistical test. Only ``scores_`` is read afterwards, so the k /
        # percentile settings of the selector do not affect the result; the
        # top ``best_features`` columns are taken in score order instead.
        if feature_selection == 'select_K_Best':
            # score_func=chi2 would only work for non-negative features
            selector = SelectKBest(score_func=f_classif, k=4)
        else:
            selector = SelectPercentile(f_classif, percentile=10)
        selector.fit(features, labels)

        scores = selector.scores_
        features_index_sorted = np.argsort(-scores)
        features_selected = features[:, features_index_sorted[0:best_features]]

    elif feature_selection == 'LassoCV':
        # SelectFromModel with LassoCV as base estimator, since the L1 norm
        # promotes sparsity of features. Importance threshold is 0.95.
        sfm = SelectFromModel(LassoCV(), threshold=0.95)
        sfm.fit(features, labels)
        # Keep the transformed matrix itself (the old code stored only its
        # column count, which broke the print and return below) and report
        # which columns were retained.
        features_selected = sfm.transform(features)
        features_index_sorted = sfm.get_support(indices=True)

    else:
        raise ValueError(
            "Unknown feature_selection method {!r}; expected one of "
            "'select_K_Best', 'slct_percentile', 'LassoCV'".format(feature_selection)
        )

    print("Selected only " + str(features_selected.shape) + " features ")

    return features_selected, features_index_sorted