Python sklearn.feature_selection.f_classif() Examples
The following are 30 code examples of sklearn.feature_selection.f_classif(), collected from open-source projects. Each example notes its source file, originating project, and license.
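Before the project examples, a minimal orientation sketch (using the bundled iris dataset; not taken from any of the projects below) shows what f_classif itself returns: one ANOVA F-statistic and one p-value per feature.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import f_classif

X, y = load_iris(return_X_y=True)
F, pvalues = f_classif(X, y)      # one F-statistic and one p-value per feature
print(F.shape, pvalues.shape)     # (4,) (4,)
print(np.argsort(F)[::-1])        # features ranked by class separability

A larger F (smaller p) means the feature's class means differ more relative to the within-class variance, which is what SelectKBest and SelectPercentile rank on in the examples below.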
Example #1
Source File: GetMLPara.py From dr_droid with Apache License 2.0
def find_best_feature_selections(X, y):
    # Select the best features using different techniques.
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)
    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)

    print(X_new.shape)
    #selection_parameters_for_classfier(X_new, y)
    #print(y.shape)

    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)
    #X, y = _dataset_sample()

################################ PARAMETER Selected ################################
# TODO: some problem happens when using the parameter max_leaf_nodes in DTree and RandomForest
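One practical difference between the two score functions compared above: chi2 requires non-negative feature values, while f_classif accepts any real-valued data. A minimal sketch (toy data invented here, not from dr_droid):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif

rng = np.random.RandomState(0)
X_signed = rng.randn(20, 5)            # contains negative values
y = rng.randint(0, 2, size=20)

SelectKBest(f_classif, k=2).fit(X_signed, y)   # works on signed data
try:
    SelectKBest(chi2, k=2).fit(X_signed, y)    # raises: chi2 requires X >= 0
except ValueError as err:
    print(err)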
Example #2
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('ANOVA: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = SelectKBest(f_classif, k=self.GetSelectedFeatureNumber())
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    f_value, p_value = f_classif(data, label)
    return feature_index.tolist(), f_value, p_value
Example #3
Source File: test_dft.py From pyts with BSD 3-Clause "New" or "Revised" License
def _compute_expected_results(X, y=None, n_coefs=None, drop_sum=False,
                              anova=False, norm_mean=False, norm_std=False):
    """Compute the expected results."""
    # NOTE: n_samples is a module-level constant defined elsewhere in the test file.
    X = np.asarray(X)
    if norm_mean:
        X -= X.mean(axis=1)[:, None]
    if norm_std:
        X /= X.std(axis=1)[:, None]
    X_fft = np.fft.rfft(X)
    X_fft = np.vstack([np.real(X_fft), np.imag(X_fft)])
    X_fft = X_fft.reshape(n_samples, -1, order='F')
    if drop_sum:
        X_fft = X_fft[:, 2:-1]
    else:
        X_fft = np.hstack([X_fft[:, :1], X_fft[:, 2:-1]])
    if n_coefs is None:
        return X_fft
    else:
        if anova:
            _, p = f_classif(X_fft, y)
            support = np.argsort(p)[:n_coefs]
            return X_fft[:, support]
        else:
            return X_fft[:, :n_coefs]
Example #4
Source File: test_base.py From Mastering-Elasticsearch-7.0 with MIT License
def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert selector is not new_selector
    assert_equal(selector.get_params(), new_selector.get_params())

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert selector is not new_selector
Example #5
Source File: test_base.py From twitter-stock-recommendation with MIT License
def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
    assert_equal(selector.get_params(), new_selector.get_params())

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
Example #6
Source File: dft.py From pyts with BSD 3-Clause "New" or "Revised" License
def _anova(self, X_fft, y, n_coefs, n_timestamps):
    if n_coefs < X_fft.shape[1]:
        non_constant = np.where(
            ~np.isclose(X_fft.var(axis=0), np.zeros_like(X_fft.shape[1]))
        )[0]
        if non_constant.size == 0:
            raise ValueError("All the Fourier coefficients are constant. "
                             "Your input data is weirdly homogeneous.")
        elif non_constant.size < n_coefs:
            warn("The number of non constant Fourier coefficients ({0}) "
                 "is lower than the number of coefficients to keep ({1}). "
                 "The number of coefficients to keep is truncated to {2}"
                 ".".format(non_constant.size, n_coefs, non_constant.size))
            support = non_constant
        else:
            _, p = f_classif(X_fft[:, non_constant], y)
            support = non_constant[np.argsort(p)[:n_coefs]]
    else:
        support = np.arange(n_coefs)
    return support
Example #7
Source File: f_score.py From scikit-feature with GNU General Public License v2.0
def f_score(X, y):
    """
    This function implements the anova f_value feature selection
    (existing method for classification in scikit-learn), where

        f_score = sum((ni/(c-1))*(mean_i - mean)^2) / ((1/(n - c))*sum((ni-1)*std_i^2))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        f-score for each feature
    """
    F, pval = f_classif(X, y)
    return F
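The docstring formula can be verified numerically against f_classif. A minimal sketch (the toy data and the manual_f helper are invented for this check; they are not part of scikit-feature):

import numpy as np
from sklearn.feature_selection import f_classif

rng = np.random.RandomState(0)
X = rng.randn(30, 3)
y = np.repeat([0, 1, 2], 10)

def manual_f(x, y):
    # One-way ANOVA F-statistic for a single feature, per the docstring formula.
    classes = np.unique(y)
    n, c = x.size, classes.size
    grand_mean = x.mean()
    ssb = sum(x[y == k].size * (x[y == k].mean() - grand_mean) ** 2 for k in classes)
    ssw = sum((x[y == k].size - 1) * x[y == k].std(ddof=1) ** 2 for k in classes)
    return (ssb / (c - 1)) / (ssw / (n - c))

F, _ = f_classif(X, y)
manual = np.array([manual_f(X[:, j], y) for j in range(X.shape[1])])
assert np.allclose(F, manual)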
Example #8
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0
def test_export_pipeline():
    """Assert that export_pipeline() generates a compiled source file as expected given a fixed pipeline."""
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)),
        SelectPercentile(score_func=f_classif, percentile=20)
    ),
    KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
Example #9
Source File: dominance.py From dominance-analysis with MIT License
def get_top_k(self):
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    # remove intercept from top_k
    if (self.objective):
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except:
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
Example #10
Source File: scores.py From SecuML with GNU General Public License v2.0
def compute_scoring_func(self, func):
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        if isinstance(features, spmatrix):
            variance = mean_variance_axis(features, axis=0)[1]
        else:
            variance = features.var(axis=0)
        return variance, None

    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
        self.multiclass)
    if func == 'f_classif':
        return f_classif(features, annotations)
    elif func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            discrete_indexes = True
        else:
            features_types = self.instances.features.info.types
            discrete_indexes = [i for i, t in enumerate(features_types)
                                if t == FeatureType.binary]
            if not discrete_indexes:
                discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    elif func == 'chi2':
        return chi2(features, annotations)
    else:
        assert(False)
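The three scoring functions dispatched above can be compared directly on a small dataset. A minimal sketch (iris data; SecuML's container classes are not needed for the comparison):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

X, y = load_iris(return_X_y=True)

F, p_f = f_classif(X, y)        # ANOVA F-test: (scores, p-values)
c2, p_c2 = chi2(X, y)           # chi-squared: requires non-negative X
mi = mutual_info_classif(X, y, random_state=0)   # mutual information: scores only

print(np.argsort(F)[::-1])      # rankings may differ across the three criteria
print(np.argsort(c2)[::-1])
print(np.argsort(mi)[::-1])

This mirrors why compute_scoring_func returns a (scores, p_values) pair for f_classif and chi2 but pairs mutual_info_classif with None: mutual information has no associated p-value.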
Example #11
Source File: GetMLPara.py From dr_droid with Apache License 2.0
def my_get_fp_fn_CV(X_original, y):
    # generate classifiers
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    # decision tree
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=4,
                                   min_samples_split=2, random_state=None,
                                   splitter='best')
    # naive
    #nbbern = BernoulliNB()
    # random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)
    # svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)

    # reduce the size
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original, y)
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    # NOTE: skb is already fitted above; transform(X_original) would avoid the redundant refit.
    X = skb.fit_transform(X_original, y)

    print("KNN")
    my_get_fp_fn_inter(knn, X, y)
    print("DTree")
    my_get_fp_fn_inter(dtree, X, y)
    print("rforest")
    my_get_fp_fn_inter(rforest, X, y)
    #print("naive bayes")
    #my_get_fp_fn_inter(nbbern, X, y)
    print("SVMrbf")
    my_get_fp_fn_inter(svmrbf, X, y)
Example #12
Source File: GetMLPara.py From dr_droid with Apache License 2.0
def precision_recall_curve_draw(X_o, y):
    X = SelectKBest(f_classif, k=80).fit_transform(X_o, y)
    print(X.shape)
    print(y.shape)

    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=1,
                                   min_samples_split=2, random_state=None,
                                   splitter='best')
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)

    p_svmrbf, r_svmrbf, auc_svmrbf = get_my_pecision_recall(svmrbf, X, y)
    p_knn, r_knn, auc_knn = get_my_pecision_recall(knn, X, y)
    p_dtree, r_dtree, auc_dtree = get_my_pecision_recall(dtree, X, y)
    p_rforest, r_rforest, auc_rforest = get_my_pecision_recall(rforest, X, y)

    plt.clf()
    plt.plot(r_svmrbf, p_svmrbf, 'y.--', label='SVM auc=%0.3f' % auc_svmrbf)
    plt.plot(r_knn, p_knn, 'r^--', label='KNN auc=%0.3f' % auc_knn)
    plt.plot(r_dtree, p_dtree, 'b>--', label='Decision Tree auc=%0.3f' % auc_dtree)
    plt.plot(r_rforest, p_rforest, 'go--', label='Random Forest auc=%0.3f' % auc_rforest)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('recall rate')
    plt.ylabel('precision rate')
    plt.title('precision-recall curve')
    plt.legend(loc="lower right")
    plt.show()

    del X
    del y

############################## Examples to show the difference of features representation ##############################
Example #13
Source File: utilanalisis.py From pghumor with Apache License 2.0
def f_score_feature_selection(features, clases, nombres_features_ordenadas):
    # Prints feature importances ranked by ANOVA F-score.
    print("Realizando f-score feature selection")  # "Performing f-score feature selection"
    f_score = feature_selection.f_classif(features, clases)
    imprimir_importancias(f_score, "f-score", nombres_features_ordenadas)
Example #14
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_f_classif(self):
    diabetes = datasets.load_diabetes()
    df = pdml.ModelFrame(diabetes)

    result = df.feature_selection.f_classif()
    expected = fs.f_classif(diabetes.data, diabetes.target)

    self.assertEqual(len(result), 2)
    tm.assert_numpy_array_equal(result[0], expected[0])
    tm.assert_numpy_array_equal(result[1], expected[1])
Example #15
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0
def PlotPerfPercentFeatures(X, y, est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova) varying the percentile of
    features selected (F-test).
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See also (similar, but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb
    '''
    transform = SelectPercentile(f_classif)
    clf = Pipeline([('anova', transform), ('est', est)])

    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1, 2, 3, 5, 7, 10, 13, 15, 20, 25, 33, 50, 65, 75, 90, 99)
    # percentiles = (1, 5, 10, 25, 50, 75, 90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,
                                      cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3),
                                      n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
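Note that StratifiedShuffleSplit(y, n_iter=7, ...) above is the pre-0.18 cross-validation API, which was removed in scikit-learn 0.20. A hedged sketch of the same ANOVA-percentile loop against the current model_selection API (assuming a recent scikit-learn; iris used as stand-in data):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
clf = Pipeline([('anova', SelectPercentile(f_classif)), ('est', LinearSVC())])
cv = StratifiedShuffleSplit(n_splits=7, test_size=0.3, random_state=0)

for percentile in (10, 25, 50, 75, 100):
    clf.set_params(anova__percentile=percentile)
    scores = cross_val_score(clf, X, y, cv=cv, n_jobs=-1)
    print(percentile, scores.mean(), scores.std())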
Example #16
Source File: preprocessing.py From sk-dist with Apache License 2.0
def __init__(self, selector="fpr", score_func=feature_selection.f_classif,
             threshold=0.05):
    self.selector = selector
    self.score_func = score_func
    self.threshold = threshold
Example #17
Source File: scores.py From SecuML with GNU General Public License v2.0
def _set_scoring_func(self):
    self.scoring_func = [('variance', False)]
    if self.annotated_instances.num_instances() > 0:
        self.scoring_func.append(('f_classif', True))
        self.scoring_func.append(('mutual_info_classif', False))
        if self.instances.features.all_positives():
            self.scoring_func.append(('chi2', True))
Example #18
Source File: relation.py From visualize_ML with MIT License
def evaluate_anova(x, y):
    F_value, pvalue = f_classif(x, y)
    return F_value, pvalue

# In descriptive statistics, a box plot or boxplot is a convenient way of graphically
# depicting groups of numerical data through their quartiles. Box plots may also have
# lines extending vertically from the boxes (whiskers) indicating variability outside
# the upper and lower quartiles, hence the terms box-and-whisker plot and
# box-and-whisker diagram.
# Quartile: In descriptive statistics, the quartiles of a ranked set of data values are
# the three points that divide the data set into four equal groups, each group
# comprising a quarter of the data.
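Since the module comments above describe box plots only in prose, a minimal matplotlib sketch (toy data invented here, not from visualize_ML) shows the quartile-based depiction they refer to:

import matplotlib.pyplot as plt
import numpy as np

rng = np.random.RandomState(0)
groups = [rng.normal(loc, 1.0, size=100) for loc in (0, 1, 3)]

plt.boxplot(groups)   # boxes span the quartiles; whiskers show spread outside them
plt.xticks([1, 2, 3], ['class 0', 'class 1', 'class 2'])
plt.ylabel('feature value')
plt.show()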
Example #19
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #20
Source File: test_base.py From twitter-stock-recommendation with MIT License
def test_clone_2():
    # Tests that clone doesn't copy everything.
    # We first create an estimator, give it an own attribute, and
    # make a copy of its original state. Then we check that the copy doesn't
    # have the specific attribute we manually added to the initial estimator.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    selector.own_attribute = "test"
    new_selector = clone(selector)
    assert_false(hasattr(new_selector, "own_attribute"))
Example #21
Source File: test_randomized_l1.py From twitter-stock-recommendation with MIT License
def test_randomized_logistic():
    # Check randomized sparse logistic regression
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    X_orig = X.copy()
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(X, X_orig)   # fit does not modify X
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[1., 0.5],
                                       random_state=42, scaling=scaling,
                                       n_resampling=50, tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]])
    assert_raises(ValueError, clf.fit, X, y)
Example #22
Source File: test_randomized_l1.py From twitter-stock-recommendation with MIT License
def test_randomized_logistic_sparse():
    # Check randomized sparse logistic regression on sparse data
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores_sp = clf.fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp)
Example #23
Source File: feature_selection.py From dataiku-contrib with Apache License 2.0
def univariate_feature_selection(mode, predictors, target):
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, 100)
    if mode == 'f_classif':
        fselect = SelectPercentile(f_classif, 100)
    if mode == 'chi2':
        fselect = SelectPercentile(chi2, 100)
    fselect.fit_transform(predictors, target)
    return fselect.pvalues_
Example #24
Source File: models.py From aletheia with MIT License
def fit(self, X, y):
    self.selector = SelectKBest(f_classif, k=self.max_features)
    self.selector.fit(X, y)

    X_train = self.selector.transform(X)
    y_train = y

    param_list = []
    idx = list(range(len(y_train)))  # list() so random.shuffle also works on Python 3
    for i in range(self.n_estimators):
        random.shuffle(idx)
        param_list.append((X_train[idx[:self.max_samples]],
                           y_train[idx[:self.max_samples]]))

    pool = ThreadPool(cpu_count())
    self.clf_list = pool.map(self._prepare_classifier, param_list)
    pool.close()
    pool.join()

    # Disabled stacking experiment, kept as in the original source:
    """
    X2 = []
    for clf in self.clf_list:
        P = clf.predict_proba(X_train)
        if len(X2) == 0:
            X2 = P[:, 0]
        else:
            X2 = numpy.vstack((X2, P[:, 0]))
    X2 = numpy.swapaxes(X2, 0, 1)
    print "X2:", X2.shape

    from sklearn.ensemble import RandomForestClassifier
    self.clf2 = RandomForestClassifier(n_estimators=100)
    self.clf2.fit(X2, y_train)
    """
Example #25
Source File: test_sfa.py From pyts with BSD 3-Clause "New" or "Revised" License
def _compute_expected_results(X, y=None, n_coefs=None, n_bins=4,
                              strategy='quantile', drop_sum=False, anova=False,
                              norm_mean=False, norm_std=False, alphabet=None):
    """Compute the expected results."""
    # NOTE: n_samples is a module-level constant defined elsewhere in the test file.
    X = np.asarray(X)
    if norm_mean:
        X -= X.mean(axis=1)[:, None]
    if norm_std:
        X /= X.std(axis=1)[:, None]
    X_fft = np.fft.rfft(X)
    X_fft = np.vstack([np.real(X_fft), np.imag(X_fft)])
    X_fft = X_fft.reshape(n_samples, -1, order='F')
    if drop_sum:
        X_fft = X_fft[:, 2:-1]
    else:
        X_fft = np.hstack([X_fft[:, :1], X_fft[:, 2:-1]])
    if n_coefs is not None:
        if anova:
            _, p = f_classif(X_fft, y)
            support = np.argsort(p)[:n_coefs]
            X_fft = X_fft[:, support]
        else:
            X_fft = X_fft[:, :n_coefs]
    mcb = MultipleCoefficientBinning(n_bins=n_bins, strategy=strategy,
                                     alphabet=alphabet)
    arr_desired = mcb.fit_transform(X_fft)
    return arr_desired
Example #26
Source File: FieldSelector.py From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
    self.handle_options(options)
    out_params = convert_params(
        options.get('params', {}),
        floats=['param'],
        strs=['type', 'mode'],
        aliases={'type': 'score_func'},
    )

    if 'score_func' not in out_params:
        out_params['score_func'] = f_classif
    else:
        if out_params['score_func'].lower() == 'categorical':
            out_params['score_func'] = f_classif
        elif out_params['score_func'].lower() in ['numerical', 'numeric']:
            out_params['score_func'] = f_regression
        else:
            raise RuntimeError('type can either be categorical or numeric.')

    if 'mode' in out_params:
        if out_params['mode'] not in ('k_best', 'fpr', 'fdr', 'fwe', 'percentile'):
            raise RuntimeError('mode can only be one of the following: fdr, fpr, fwe, k_best, and percentile')
        if out_params['mode'] in ['fpr', 'fdr', 'fwe']:
            if 'param' in out_params:
                if not 0 < out_params['param'] < 1:
                    msg = 'Invalid param value for mode {}: param must be between 0 and 1.'.format(out_params['mode'])
                    raise ValueError(msg)

    # k_best and percentile require integer param
    if 'param' in out_params and out_params.get('mode') not in ['fdr', 'fpr', 'fwe']:
        original_value = out_params['param']
        out_params['param'] = int(out_params['param'])
        if out_params['param'] != original_value:
            msg = 'param value {} is not an integer; mode={} requires an integer.'
            msg = msg.format(original_value, out_params.get('mode', 'percentile'))
            raise ValueError(msg)

    self.estimator = GenericUnivariateSelect(**out_params)
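For reference, GenericUnivariateSelect, which the constructor above configures, accepts exactly the mode/param pairs being validated. A minimal sketch of direct usage (iris data; this is plain scikit-learn, not Splunking-Crime code):

from sklearn.datasets import load_iris
from sklearn.feature_selection import GenericUnivariateSelect, f_classif

X, y = load_iris(return_X_y=True)

# mode='k_best' takes an integer param; 'fpr'/'fdr'/'fwe' take an alpha in (0, 1)
selector = GenericUnivariateSelect(score_func=f_classif, mode='k_best', param=2)
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)           # (150, 2)
print(selector.get_support())    # boolean mask over the original features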
Example #27
Source File: FieldSelector.py From Splunking-Crime with GNU Affero General Public License v3.0
def decode(cls, obj):
    from sklearn.feature_selection import f_classif, f_regression, GenericUnivariateSelect

    new_obj = GenericUnivariateSelect.__new__(GenericUnivariateSelect)
    new_obj.__dict__ = obj['dict']

    if new_obj.score_func == 'f_classif':
        new_obj.score_func = f_classif
    elif new_obj.score_func == 'f_regression':
        new_obj.score_func = f_regression
    else:
        raise ValueError('Unsupported GenericUnivariateSelect.score_func "%s"' % new_obj.score_func)

    return new_obj
Example #28
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0
def test_pipeline_score_save():
    """Assert that the TPOTClassifier can generate a scored pipeline export correctly."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert_equal(expected_code,
                 export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset,
                                 pipeline_score=0.929813743))
Example #29
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0
def test_operator_export():
    """Assert that a TPOT operator can export properly with a callable function as a parameter."""
    assert list(TPOTSelectPercentile.arg_types) == TPOTSelectPercentile_args
    export_string = TPOTSelectPercentile.export(5)
    assert export_string == "SelectPercentile(score_func=f_classif, percentile=5)"
Example #30
Source File: test_pipeline.py From Mastering-Elasticsearch-7.0 with MIT License
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)