Python Examples of sklearn.feature_selection.SelectPercentile

Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0

6 votes

def test_export_pipeline():
    """Assert that exported_pipeline() generated a compile source file as expected given a fixed pipeline."""

    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )

    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)),
        SelectPercentile(score_func=f_classif, percentile=20)
    ),
    KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)

Source File: GetMLPara.py From dr_droid with Apache License 2.0

6 votes

def find_best_feature_selections(X,y):

    #select the best features usin different technique
    X_new = SelectKBest(chi2, k=80).fit_transform(X,y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X,y)

    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X,y) #this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X,y)

    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X,y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X,y)

    print (X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print (y.shape)
    train_and_test(X_new,y)
    train_and_test(X_new1,y)
    train_and_test(X_new2,y)
    train_and_test(X_new22,y)
    train_and_test(X_new3,y)
    train_and_test(X_new4,y)
    #X,y = _dataset_sample()

################################PARAMETER  Selected################################
#TODO some problem happens when using the parameter max_leaf_nodes in Dtree and RandomForest

Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

6 votes

def test_objectmapper(self):
        df = pdml.ModelFrame([])
        self.assertIs(df.feature_selection.GenericUnivariateSelect,
                      fs.GenericUnivariateSelect)
        self.assertIs(df.feature_selection.SelectPercentile,
                      fs.SelectPercentile)
        self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
        self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
        self.assertIs(df.feature_selection.SelectFromModel,
                      fs.SelectFromModel)
        self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
        self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
        self.assertIs(df.feature_selection.RFE, fs.RFE)
        self.assertIs(df.feature_selection.RFECV, fs.RFECV)
        self.assertIs(df.feature_selection.VarianceThreshold,
                      fs.VarianceThreshold)

Source File: text_models.py From mindmeld with Apache License 2.0

6 votes

def _get_feature_selector(self):
        """Get a feature selector instance based on the feature_selector model
        parameter

        Returns:
            (Object): a feature selector which returns a reduced feature matrix, \
                given the full feature matrix, X and the class labels, y
        """
        if self.config.model_settings is None:
            selector_type = None
        else:
            selector_type = self.config.model_settings.get("feature_selector")
        selector = {
            "l1": SelectFromModel(LogisticRegression(penalty="l1", C=1)),
            "f": SelectPercentile(),
        }.get(selector_type)
        return selector

Source File: feature_selection.py From dataiku-contrib with Apache License 2.0

5 votes

def univariate_feature_selection(mode,predictors,target):
    
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, 100)
        
    if mode == 'f_classif':
        fselect = SelectPercentile(f_classif, 100)
        
    if mode == 'chi2':
        fselect = SelectPercentile(chi2, 100)
        
    fselect.fit_transform(predictors, target)
    
    return fselect.pvalues_

Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0

5 votes

def test_export_pipeline_3():
    """Assert that exported_pipeline() generated a compile source file as expected given a fixed simple pipeline with a preprocessor."""

    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)

    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)

Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0

5 votes

def test_export_pipeline_6():
    """Assert that exported_pipeline() generated a compile source file with random_state and data_file_path."""

    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier
from tpot.export_utils import set_param_recursive

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('test_path', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=42)

exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)
# Fix random state for all the steps in exported pipeline
set_param_recursive(exported_pipeline.steps, 'random_state', 42)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    exported_code = export_pipeline(pipeline, tpot_obj.operators,
                                    tpot_obj._pset, random_state=42,
                                    data_file_path='test_path')

    assert expected_code == exported_code

Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0

5 votes

def test_operator_export():
    """Assert that a TPOT operator can export properly with a callable function as a parameter."""
    assert list(TPOTSelectPercentile.arg_types) == TPOTSelectPercentile_args
    export_string = TPOTSelectPercentile.export(5)
    assert export_string == "SelectPercentile(score_func=f_classif, percentile=5)"

Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0

5 votes

def test_get_by_name():
    """Assert that the Operator class returns operators by name appropriately."""

    assert get_by_name("SelectPercentile", tpot_obj.operators).__class__ == TPOTSelectPercentile.__class__
    assert get_by_name("SelectFromModel", tpot_obj.operators).__class__ == TPOTSelectFromModel.__class__

Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0

5 votes

def test_pipeline_score_save():
    """Assert that the TPOTClassifier can generate a scored pipeline export correctly."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert_equal(expected_code, export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset, pipeline_score=0.929813743))

Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0

5 votes

def _select_percentile(self, percentile):
        # Algorithm is conservative. Defaults to keeping features if
        # percentile specifies a value that corresponds to a floating number
        # of features. For example, if percentile=18 on a 20-feature matrix
        # implies keeping 3.6 features. In that case, keeps 4 features.
        if self._problem == FeatureSelector.CLASSIFICATION:
            score = f_classif
        else:
            score = f_regression

        self._selector = SelectPercentile(score, percentile)

Source File: PipeTasks.py From ProFET with GNU General Public License v3.0

5 votes

def PlotPerfPercentFeatures(X,y,est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova)
    varying the percentile of features selected (F-test) .

    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See Also: (Similar but with model seelction from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb

    '''
    transform = SelectPercentile(f_classif)

    clf = Pipeline([('anova', transform), ('est', est)])
    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1,2,3,5,7,10,13,15,20,25,33,50,65,75,90, 99)
    # percentiles = (1,5,10,25,50,75,90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3), n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())
    print("Outputting Graph:")

    plt.errorbar(percentiles, score_means, np.array(score_stds))

    plt.title(
        'Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()

Source File: memm.py From mindmeld with Apache License 2.0

5 votes

def _get_feature_selector(selector_type):
        """Get a feature selector instance based on the feature_selector model
        parameter.

        Returns:
            (Object): A feature selector which returns a reduced feature matrix, \
                given the full feature matrix, X and the class labels, y.
        """
        selector = {
            "l1": SelectFromModel(LogisticRegression(penalty="l1", C=1)),
            "f": SelectPercentile(),
        }.get(selector_type)
        return selector

Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0

5 votes

def plot_BestKFeatures (X_train, y_train):
    '''
    http://nbviewer.ipython.org/github/gmonce/scikit-learn-book/blob/master/Chapter%204%20-%20Advanced%20Features%20-%20Feature%20Engineering%20and%20Selection.ipynb
    Find the best percentile of features to use,
    using cross-validation on the training set and get K best feats
    '''
    from sklearn import cross_validation
    from sklearn import feature_selection
    from sklearn import tree
    dt = tree.DecisionTreeClassifier(criterion='entropy')
    dt = RandomForestClassifier(n_jobs=2, bootstrap=True, n_estimators=250, criterion='gini')
    dt = dt.fit(X_train, y_train)

    percentiles = range(1, 95, 5)
    results = []
    for i in range(1, 95, 5):
        fs = feature_selection.SelectPercentile(feature_selection.chi2, percentile=i) #Original
        fs = feature_selection.SelectPercentile(feature_selection.f_classif, percentile=i) # alt
        X_train_fs = fs.fit_transform(X_train, y_train)
        scores = cross_validation.cross_val_score(dt, X_train_fs, y_train, cv=4)
        #print i,scores.mean()
        results = np.append(results, scores.mean())

    optimal_percentil = np.where(results == results.max())[0]
    print (("Optimal number of features:{0}".format(percentiles[optimal_percentil])), "\n")

    # Plot number of features VS. cross-validation scores
    import pylab as pl
    import matplotlib.pylab as pl
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation accuracy)")
    pl.plot(percentiles,results)
    print ("Mean scores:",results)
    return

Source File: email_preprocess.py From machine-learning with GNU General Public License v3.0

4 votes

def preprocess(words_file = "../tools/word_data.pkl", authors_file="../tools/email_authors.pkl", percentile=10):
    """ 
        this function takes a pre-made list of email texts (by default word_data.pkl)
        and the corresponding authors (by default email_authors.pkl) and performs
        a number of preprocessing steps:
            -- splits into training/testing sets (10% testing)
            -- vectorizes into tfidf matrix
            -- selects/keeps most helpful features

        after this, the feaures and labels are put into numpy arrays, which play nice with sklearn functions

        4 objects are returned:
            -- training/testing features
            -- training/testing labels

    """

    ### the words (features) and authors (labels), already largely preprocessed
    ### this preprocessing will be repeated in the text learning mini-project
    authors_file_handler = open(authors_file, "r")
    authors = pickle.load(authors_file_handler)
    authors_file_handler.close()

    words_file_handler = open(words_file, "r")
    word_data = cPickle.load(words_file_handler)
    words_file_handler.close()

    ### test_size is the percentage of events assigned to the test set (remainder go into training)
    features_train, features_test, labels_train, labels_test = cross_validation.train_test_split(word_data, authors, test_size=0.1, random_state=42)



    ### text vectorization--go from strings to lists of numbers
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5,
                                 stop_words='english')
    features_train_transformed = vectorizer.fit_transform(features_train)
    features_test_transformed  = vectorizer.transform(features_test)



    ### feature selection, because text is super high dimensional and 
    ### can be really computationally chewy as a result
    selector = SelectPercentile(f_classif, percentile=percentile)
    selector.fit(features_train_transformed, labels_train)
    features_train_transformed = selector.transform(features_train_transformed).toarray()
    features_test_transformed  = selector.transform(features_test_transformed).toarray()

    ### info on the data
    print "no. of Chris training emails:", sum(labels_train)
    print "no. of Sara training emails:", len(labels_train)-sum(labels_train)
    
    return features_train_transformed, features_test_transformed, labels_train, labels_test

Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0

4 votes

def call_GridParamSearch_featfilt(X, y) :
    '''
        (def is Currently just a cut & paste from "main".)
        Calles def GridParamSearch , (which uses randomized CV to find odel param)
    Used to try different ml models, then get their optimal paramters
    '''
    print("SPARSE (L1) EXT gridparam scores:")
    #   clf = Pipeline([
    #       ('feature_selection', LinearSVC(penalty="l1", loss='l1',dual=False, class_weight='auto')),
    # ('classification', ExtraTreesClassifier(n_jobs=3)
    #   )])
    'Sparse; L1 penalized features selection prior to RF fitting/prediction'
    clf_svm = LinearSVC(penalty="l1", loss='l2', dual=False, class_weight='auto')
    clf_logit = LogisticRegression(penalty="l1", dual=False, class_weight='auto')

    'http://scikit-learn.org/0.13/auto_examples/plot_feature_selection.html'
    print('Original features matrix:')
    print(X.shape)
    # Univariate feature selection with F-test for feature scoring
    # We use the default selection function: the 20% most significant features
    # selector = SelectPercentile(f_classif, percentile=20)
    selector = SelectPercentile(chi2, percentile=20)
    X_anova = selector.fit_transform(X, y)
    print(
        'New (2 f_classif) Using statistical feature selection: features matrix is:')
    print(X_anova.shape)

    # lda = LDA(n_components=10)
    # X_lda = lda.fit_transform(X, y)
    # print('New LDA filtered features matrix:')
    # print(X_lda.shape)

    X_svm = clf_svm.fit_transform(X, y)  #Get Sparse feature selections..
    # print(clf.feature_importances_ )
    print('New sparse (SVM filtered) features matrix:')
    print(X_svm.shape)

    print("Res of SVM fitting of (F scores filtered =2) for more feature selection:")
    X_doubleFilt_svm_f = clf_svm.fit_transform(X_anova, y)
    print(X_doubleFilt_svm_f.shape)
    print("param search on sparse features matrix")
    GridParamSearch(param_dist=Tree_param_dist, clf=clf_EXT, X=X_svm, y=y)

Source File: feature_selection.py From ecg-classification with GNU General Public License v3.0

4 votes

def run_feature_selection(features, labels, feature_selection, best_features):
    
    if feature_selection == 'select_K_Best':
        # feature extraction
        selector = SelectKBest(score_func=f_classif, k=4) # score_func=chi2 : only for non-negative features
        selector.fit(features, labels)
        # summarize scores
        scores = selector.scores_
        features_index_sorted = np.argsort(-scores)
        features_selected = features[:, features_index_sorted[0:best_features]]

    # SelectFromModel and LassoCV
    
    # We use the base estimator LassoCV since the L1 norm promotes sparsity of features.
    if feature_selection == 'LassoCV':
        clf = LassoCV()

        # Set a minimum threshold of 0.25
        sfm = SelectFromModel(clf, threshold=0.95)
        sfm.fit(features, labels)
        features_selected = sfm.transform(features).shape[1]

        """
        # Reset the threshold till the number of features equals two.
        # Note that the attribute can be set directly instead of repeatedly
        # fitting the metatransformer.
        while n_features > 2:
            sfm.threshold += 0.1
            X_transform = sfm.transform(X)
            n_features = X_transform.shape[1]
        """

    # Univariate feature selection
    # Univariate feature selection works by selecting the best features based on univariate statistical tests. 
    # It can be seen as a preprocessing step to an estimator. 
    # Scikit-learn exposes feature selection routines as objects that implement the transform method:
    #   - SelectKBest removes all but the k highest scoring features
    #   - SelectPercentile removes all but a user-specified highest scoring percentage of features
    #       common univariate statistical tests for each feature: false positive rate SelectFpr, false discovery rate SelectFdr, or family wise error SelectFwe.
    #   - GenericUnivariateSelect allows to perform univariate feature selection with a configurable strategy. This allows to select the best univariate selection strategy with hyper-parameter search estimator.

    if feature_selection == 'slct_percentile':
        selector = SelectPercentile(f_classif, percentile=10)
        selector.fit(features, labels)
        # The percentile not affect. 
        # Just select in order the top features by number or threshold

        # Keep best 8 values?
        scores = selector.scores_
        features_index_sorted = np.argsort(-scores)
        # scores = selector.scores_

        # scores = -np.log10(selector.pvalues_)
        # scores /= scores.max()

        features_selected = features[:, features_index_sorted[0:best_features]]

    print("Selected only " + str(features_selected.shape) + " features ")

    return features_selected, features_index_sorted

Python sklearn.feature_selection.SelectPercentile() Examples