Python sklearn.feature_selection.f_classif() Examples
The following are 30 code examples of sklearn.feature_selection.f_classif(), collected from open-source projects. Each example notes its source file, originating project, and license.
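Before the project examples, a minimal orientation sketch (using the bundled iris dataset; not taken from any of the projects below) shows what f_classif itself returns: one ANOVA F-statistic and one p-value per feature.

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import f_classif

X, y = load_iris(return_X_y=True)
F, pvalues = f_classif(X, y)      # one F-statistic and one p-value per feature
print(F.shape, pvalues.shape)     # (4,) (4,)
print(np.argsort(F)[::-1])        # features ranked by class separability

A larger F (smaller p) means the feature's class means differ more relative to the within-class variance, which is what SelectKBest and SelectPercentile rank on in the examples below.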
Example #1
Source File: GetMLPara.py From dr_droid with Apache License 2.0
def find_best_feature_selections(X, y):
    # Select the best features using different techniques.
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)
    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)

    print(X_new.shape)
    #selection_parameters_for_classfier(X_new, y)
    #print(y.shape)

    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)
    #X, y = _dataset_sample()

################################ PARAMETER Selected ################################
# TODO: some problem happens when using the parameter max_leaf_nodes in DTree and RandomForest
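One practical difference between the two score functions compared above: chi2 requires non-negative feature values, while f_classif accepts any real-valued data. A minimal sketch (toy data invented here, not from dr_droid):

import numpy as np
from sklearn.feature_selection import SelectKBest, chi2, f_classif

rng = np.random.RandomState(0)
X_signed = rng.randn(20, 5)            # contains negative values
y = rng.randint(0, 2, size=20)

SelectKBest(f_classif, k=2).fit(X_signed, y)   # works on signed data
try:
    SelectKBest(chi2, k=2).fit(X_signed, y)    # raises: chi2 requires X >= 0
except ValueError as err:
    print(err)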
Example #2
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('ANOVA: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = SelectKBest(f_classif, k=self.GetSelectedFeatureNumber())
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    f_value, p_value = f_classif(data, label)
    return feature_index.tolist(), f_value, p_value
Example #3
Source File: test_dft.py From pyts with BSD 3-Clause "New" or "Revised" License
def _compute_expected_results(X, y=None, n_coefs=None, drop_sum=False,
                              anova=False, norm_mean=False, norm_std=False):
    """Compute the expected results."""
    # NOTE: n_samples is a module-level constant defined elsewhere in the test file.
    X = np.asarray(X)
    if norm_mean:
        X -= X.mean(axis=1)[:, None]
    if norm_std:
        X /= X.std(axis=1)[:, None]
    X_fft = np.fft.rfft(X)
    X_fft = np.vstack([np.real(X_fft), np.imag(X_fft)])
    X_fft = X_fft.reshape(n_samples, -1, order='F')
    if drop_sum:
        X_fft = X_fft[:, 2:-1]
    else:
        X_fft = np.hstack([X_fft[:, :1], X_fft[:, 2:-1]])
    if n_coefs is None:
        return X_fft
    else:
        if anova:
            _, p = f_classif(X_fft, y)
            support = np.argsort(p)[:n_coefs]
            return X_fft[:, support]
        else:
            return X_fft[:, :n_coefs]
Example #4
Source File: test_base.py From Mastering-Elasticsearch-7.0 with MIT License
def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert selector is not new_selector
    assert_equal(selector.get_params(), new_selector.get_params())

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert selector is not new_selector
Example #5
Source File: test_base.py From twitter-stock-recommendation with MIT License
def test_clone():
    # Tests that clone creates a correct deep copy.
    # We create an estimator, make a copy of its original state
    # (which, in this case, is the current state of the estimator),
    # and check that the obtained copy is a correct deep copy.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
    assert_equal(selector.get_params(), new_selector.get_params())

    selector = SelectFpr(f_classif, alpha=np.zeros((10, 2)))
    new_selector = clone(selector)
    assert_true(selector is not new_selector)
Example #6
Source File: dft.py From pyts with BSD 3-Clause "New" or "Revised" License
def _anova(self, X_fft, y, n_coefs, n_timestamps):
    if n_coefs < X_fft.shape[1]:
        non_constant = np.where(
            ~np.isclose(X_fft.var(axis=0), np.zeros_like(X_fft.shape[1]))
        )[0]
        if non_constant.size == 0:
            raise ValueError("All the Fourier coefficients are constant. "
                             "Your input data is weirdly homogeneous.")
        elif non_constant.size < n_coefs:
            warn("The number of non constant Fourier coefficients ({0}) "
                 "is lower than the number of coefficients to keep ({1}). "
                 "The number of coefficients to keep is truncated to {2}"
                 ".".format(non_constant.size, n_coefs, non_constant.size))
            support = non_constant
        else:
            _, p = f_classif(X_fft[:, non_constant], y)
            support = non_constant[np.argsort(p)[:n_coefs]]
    else:
        support = np.arange(n_coefs)
    return support
Example #7
Source File: f_score.py From scikit-feature with GNU General Public License v2.0
def f_score(X, y):
    """
    This function implements the anova f_value feature selection
    (existing method for classification in scikit-learn), where

        f_score = sum((ni/(c-1))*(mean_i - mean)^2) / ((1/(n - c))*sum((ni-1)*std_i^2))

    Input
    -----
    X: {numpy array}, shape (n_samples, n_features)
        input data
    y: {numpy array}, shape (n_samples,)
        input class labels

    Output
    ------
    F: {numpy array}, shape (n_features,)
        f-score for each feature
    """
    F, pval = f_classif(X, y)
    return F
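The docstring formula can be verified numerically against f_classif. A minimal sketch (the toy data and the manual_f helper are invented for this check; they are not part of scikit-feature):

import numpy as np
from sklearn.feature_selection import f_classif

rng = np.random.RandomState(0)
X = rng.randn(30, 3)
y = np.repeat([0, 1, 2], 10)

def manual_f(x, y):
    # One-way ANOVA F-statistic for a single feature, per the docstring formula.
    classes = np.unique(y)
    n, c = x.size, classes.size
    grand_mean = x.mean()
    ssb = sum(x[y == k].size * (x[y == k].mean() - grand_mean) ** 2 for k in classes)
    ssw = sum((x[y == k].size - 1) * x[y == k].std(ddof=1) ** 2 for k in classes)
    return (ssb / (c - 1)) / (ssw / (n - c))

F, _ = f_classif(X, y)
manual = np.array([manual_f(X[:, j], y) for j in range(X.shape[1])])
assert np.allclose(F, manual)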
Example #8
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0
def test_export_pipeline():
    """Assert that export_pipeline() generates a compiled source file as expected given a fixed pipeline."""
    pipeline_string = (
        'KNeighborsClassifier(CombineDFs('
        'DecisionTreeClassifier(input_matrix, DecisionTreeClassifier__criterion=gini, '
        'DecisionTreeClassifier__max_depth=8,DecisionTreeClassifier__min_samples_leaf=5,'
        'DecisionTreeClassifier__min_samples_split=5),SelectPercentile(input_matrix, SelectPercentile__percentile=20))'
        'KNeighborsClassifier__n_neighbors=10, '
        'KNeighborsClassifier__p=1,KNeighborsClassifier__weights=uniform'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.neighbors import KNeighborsClassifier
from sklearn.pipeline import make_pipeline, make_union
from sklearn.tree import DecisionTreeClassifier
from tpot.builtins import StackingEstimator

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

exported_pipeline = make_pipeline(
    make_union(
        StackingEstimator(estimator=DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)),
        SelectPercentile(score_func=f_classif, percentile=20)
    ),
    KNeighborsClassifier(n_neighbors=10, p=1, weights="uniform")
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert expected_code == export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset)
Example #9
Source File: dominance.py From dominance-analysis with MIT License
def get_top_k(self):
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    # remove intercept from top_k
    if (self.objective):
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except:
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
Example #10
Source File: scores.py From SecuML with GNU General Public License v2.0
def compute_scoring_func(self, func):
    if func == 'variance':
        features = self.instances.features.get_values()
        annotations = self.instances.annotations.get_labels()
        if isinstance(features, spmatrix):
            variance = mean_variance_axis(features, axis=0)[1]
        else:
            variance = features.var(axis=0)
        return variance, None

    features = self.annotated_instances.features.get_values()
    annotations = self.annotated_instances.annotations.get_supervision(
        self.multiclass)
    if func == 'f_classif':
        return f_classif(features, annotations)
    elif func == 'mutual_info_classif':
        if isinstance(features, spmatrix):
            discrete_indexes = True
        else:
            features_types = self.instances.features.info.types
            discrete_indexes = [i for i, t in enumerate(features_types)
                                if t == FeatureType.binary]
            if not discrete_indexes:
                discrete_indexes = False
        return (mutual_info_classif(features, annotations,
                                    discrete_features=discrete_indexes),
                None)
    elif func == 'chi2':
        return chi2(features, annotations)
    else:
        assert(False)
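The three scoring functions dispatched above can be compared directly on a small dataset. A minimal sketch (iris data; SecuML's container classes are not needed for the comparison):

import numpy as np
from sklearn.datasets import load_iris
from sklearn.feature_selection import chi2, f_classif, mutual_info_classif

X, y = load_iris(return_X_y=True)

F, p_f = f_classif(X, y)        # ANOVA F-test: (scores, p-values)
c2, p_c2 = chi2(X, y)           # chi-squared: requires non-negative X
mi = mutual_info_classif(X, y, random_state=0)   # mutual information: scores only

print(np.argsort(F)[::-1])      # rankings may differ across the three criteria
print(np.argsort(c2)[::-1])
print(np.argsort(mi)[::-1])

This mirrors why compute_scoring_func returns a (scores, p_values) pair for f_classif and chi2 but pairs mutual_info_classif with None: mutual information has no associated p-value.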
Example #11
Source File: GetMLPara.py From dr_droid with Apache License 2.0
def my_get_fp_fn_CV(X_original, y):
    # generate classifiers
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    # decision tree
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=4,
                                   min_samples_split=2, random_state=None,
                                   splitter='best')
    # naive
    #nbbern = BernoulliNB()
    # random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)
    # svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)

    # reduce the size
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original, y)
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    # NOTE: skb is already fitted above; transform(X_original) would avoid the redundant refit.
    X = skb.fit_transform(X_original, y)

    print("KNN")
    my_get_fp_fn_inter(knn, X, y)
    print("DTree")
    my_get_fp_fn_inter(dtree, X, y)
    print("rforest")
    my_get_fp_fn_inter(rforest, X, y)
    #print("naive bayes")
    #my_get_fp_fn_inter(nbbern, X, y)
    print("SVMrbf")
    my_get_fp_fn_inter(svmrbf, X, y)
Example #12
Source File: GetMLPara.py From dr_droid with Apache License 2.0
def precision_recall_curve_draw(X_o, y):
    X = SelectKBest(f_classif, k=80).fit_transform(X_o, y)
    print(X.shape)
    print(y.shape)

    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=1,
                                   min_samples_split=2, random_state=None,
                                   splitter='best')
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)

    p_svmrbf, r_svmrbf, auc_svmrbf = get_my_pecision_recall(svmrbf, X, y)
    p_knn, r_knn, auc_knn = get_my_pecision_recall(knn, X, y)
    p_dtree, r_dtree, auc_dtree = get_my_pecision_recall(dtree, X, y)
    p_rforest, r_rforest, auc_rforest = get_my_pecision_recall(rforest, X, y)

    plt.clf()
    plt.plot(r_svmrbf, p_svmrbf, 'y.--', label='SVM auc=%0.3f' % auc_svmrbf)
    plt.plot(r_knn, p_knn, 'r^--', label='KNN auc=%0.3f' % auc_knn)
    plt.plot(r_dtree, p_dtree, 'b>--', label='Decision Tree auc=%0.3f' % auc_dtree)
    plt.plot(r_rforest, p_rforest, 'go--', label='Random Forest auc=%0.3f' % auc_rforest)
    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('recall rate')
    plt.ylabel('precision rate')
    plt.title('precision-recall curve')
    plt.legend(loc="lower right")
    plt.show()

    del X
    del y

############################## Examples to show the difference of features representation ##############################
Example #13
Source File: utilanalisis.py From pghumor with Apache License 2.0
def f_score_feature_selection(features, clases, nombres_features_ordenadas):
    # Prints feature importances ranked by ANOVA F-score.
    print("Realizando f-score feature selection")  # "Performing f-score feature selection"
    f_score = feature_selection.f_classif(features, clases)
    imprimir_importancias(f_score, "f-score", nombres_features_ordenadas)
Example #14
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_f_classif(self):
    diabetes = datasets.load_diabetes()
    df = pdml.ModelFrame(diabetes)

    result = df.feature_selection.f_classif()
    expected = fs.f_classif(diabetes.data, diabetes.target)

    self.assertEqual(len(result), 2)
    tm.assert_numpy_array_equal(result[0], expected[0])
    tm.assert_numpy_array_equal(result[1], expected[1])
Example #15
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0
def PlotPerfPercentFeatures(X, y, est=LinearSVC()):
    '''
    Performance of a classifier (default: SVM-Anova) varying the percentile of
    features selected (F-test).
    http://scikit-learn.org/stable/auto_examples/svm/plot_svm_anova.html#example-svm-plot-svm-anova-py

    See also (similar, but with model selection from among classifiers):
    http://nbviewer.ipython.org/github/bugra/pydata-nyc-2014/blob/master/6.%20Scikit%20Learn%20-%20Model%20Selection.ipynb
    '''
    transform = SelectPercentile(f_classif)
    clf = Pipeline([('anova', transform), ('est', est)])

    ###############################################################################
    # Plot the cross-validation score as a function of percentile of features
    score_means = list()
    score_stds = list()
    percentiles = (1, 2, 3, 5, 7, 10, 13, 15, 20, 25, 33, 50, 65, 75, 90, 99)
    # percentiles = (1, 5, 10, 25, 50, 75, 90)

    for percentile in percentiles:
        # print(percentile)
        clf.set_params(anova__percentile=percentile)
        this_scores = cross_val_score(clf, X, y,
                                      cv=StratifiedShuffleSplit(y, n_iter=7, test_size=0.3),
                                      n_jobs=-1)
        score_means.append(this_scores.mean())
        score_stds.append(this_scores.std())

    print("Outputting Graph:")
    plt.errorbar(percentiles, score_means, np.array(score_stds))
    plt.title('Predictor Performance, varying percent of features used')
    plt.xlabel('Percentile')
    plt.ylabel('Prediction Performance')
    plt.axis('tight')
    plt.show()
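Note that StratifiedShuffleSplit(y, n_iter=7, ...) above is the pre-0.18 cross-validation API, which was removed in scikit-learn 0.20. A hedged sketch of the same ANOVA-percentile loop against the current model_selection API (assuming a recent scikit-learn; iris used as stand-in data):

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import StratifiedShuffleSplit, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
clf = Pipeline([('anova', SelectPercentile(f_classif)), ('est', LinearSVC())])
cv = StratifiedShuffleSplit(n_splits=7, test_size=0.3, random_state=0)

for percentile in (10, 25, 50, 75, 100):
    clf.set_params(anova__percentile=percentile)
    scores = cross_val_score(clf, X, y, cv=cv, n_jobs=-1)
    print(percentile, scores.mean(), scores.std())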
Example #16
Source File: preprocessing.py From sk-dist with Apache License 2.0
def __init__(self, selector="fpr", score_func=feature_selection.f_classif,
             threshold=0.05):
    self.selector = selector
    self.score_func = score_func
    self.threshold = threshold
Example #17
Source File: scores.py From SecuML with GNU General Public License v2.0
def _set_scoring_func(self):
    self.scoring_func = [('variance', False)]
    if self.annotated_instances.num_instances() > 0:
        self.scoring_func.append(('f_classif', True))
        self.scoring_func.append(('mutual_info_classif', False))
        if self.instances.features.all_positives():
            self.scoring_func.append(('chi2', True))
Example #18
Source File: relation.py From visualize_ML with MIT License
def evaluate_anova(x, y):
    F_value, pvalue = f_classif(x, y)
    return F_value, pvalue

# In descriptive statistics, a box plot or boxplot is a convenient way of graphically
# depicting groups of numerical data through their quartiles. Box plots may also have
# lines extending vertically from the boxes (whiskers) indicating variability outside
# the upper and lower quartiles, hence the terms box-and-whisker plot and
# box-and-whisker diagram.
# Quartile: In descriptive statistics, the quartiles of a ranked set of data values are
# the three points that divide the data set into four equal groups, each group
# comprising a quarter of the data.
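Since the module comments above describe box plots only in prose, a minimal matplotlib sketch (toy data invented here, not from visualize_ML) shows the quartile-based depiction they refer to:

import matplotlib.pyplot as plt
import numpy as np

rng = np.random.RandomState(0)
groups = [rng.normal(loc, 1.0, size=100) for loc in (0, 1, 3)]

plt.boxplot(groups)   # boxes span the quartiles; whiskers show spread outside them
plt.xticks([1, 2, 3], ['class 0', 'class 1', 'class 2'])
plt.ylabel('feature value')
plt.show()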
Example #19
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #20
Source File: test_base.py From twitter-stock-recommendation with MIT License
def test_clone_2():
    # Tests that clone doesn't copy everything.
    # We first create an estimator, give it an own attribute, and
    # make a copy of its original state. Then we check that the copy doesn't
    # have the specific attribute we manually added to the initial estimator.
    from sklearn.feature_selection import SelectFpr, f_classif

    selector = SelectFpr(f_classif, alpha=0.1)
    selector.own_attribute = "test"
    new_selector = clone(selector)
    assert_false(hasattr(new_selector, "own_attribute"))
Example #21
Source File: test_randomized_l1.py From twitter-stock-recommendation with MIT License
def test_randomized_logistic():
    # Check randomized sparse logistic regression
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    X_orig = X.copy()
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(X, X_orig)   # fit does not modify X
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[1., 0.5],
                                       random_state=42, scaling=scaling,
                                       n_resampling=50, tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    assert_array_equal(np.argsort(F), np.argsort(feature_scores))

    clf = RandomizedLogisticRegression(verbose=False, C=[[1., 0.5]])
    assert_raises(ValueError, clf.fit, X, y)
Example #22
Source File: test_randomized_l1.py From twitter-stock-recommendation with MIT License
def test_randomized_logistic_sparse():
    # Check randomized sparse logistic regression on sparse data
    iris = load_iris()
    X = iris.data[:, [0, 2]]
    y = iris.target
    X = X[y != 2]
    y = y[y != 2]

    # center here because sparse matrices are usually not centered
    # labels should not be centered
    X, _, _, _, _ = _preprocess_data(X, y, True, True)

    X_sp = sparse.csr_matrix(X)

    F, _ = f_classif(X, y)

    scaling = 0.3
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores = clf.fit(X, y).scores_
    clf = RandomizedLogisticRegression(verbose=False, C=1., random_state=42,
                                       scaling=scaling, n_resampling=50,
                                       tol=1e-3)
    feature_scores_sp = clf.fit(X_sp, y).scores_
    assert_array_equal(feature_scores, feature_scores_sp)
Example #23
Source File: feature_selection.py From dataiku-contrib with Apache License 2.0
def univariate_feature_selection(mode, predictors, target):
    if mode == 'f_regression':
        fselect = SelectPercentile(f_regression, 100)
    if mode == 'f_classif':
        fselect = SelectPercentile(f_classif, 100)
    if mode == 'chi2':
        fselect = SelectPercentile(chi2, 100)
    fselect.fit_transform(predictors, target)
    return fselect.pvalues_
Example #24
Source File: models.py From aletheia with MIT License
def fit(self, X, y):
    self.selector = SelectKBest(f_classif, k=self.max_features)
    self.selector.fit(X, y)

    X_train = self.selector.transform(X)
    y_train = y

    param_list = []
    idx = list(range(len(y_train)))  # list() so random.shuffle also works on Python 3
    for i in range(self.n_estimators):
        random.shuffle(idx)
        param_list.append((X_train[idx[:self.max_samples]],
                           y_train[idx[:self.max_samples]]))

    pool = ThreadPool(cpu_count())
    self.clf_list = pool.map(self._prepare_classifier, param_list)
    pool.close()
    pool.join()

    # Disabled stacking experiment, kept as in the original source:
    """
    X2 = []
    for clf in self.clf_list:
        P = clf.predict_proba(X_train)
        if len(X2) == 0:
            X2 = P[:, 0]
        else:
            X2 = numpy.vstack((X2, P[:, 0]))
    X2 = numpy.swapaxes(X2, 0, 1)
    print "X2:", X2.shape

    from sklearn.ensemble import RandomForestClassifier
    self.clf2 = RandomForestClassifier(n_estimators=100)
    self.clf2.fit(X2, y_train)
    """
Example #25
Source File: test_sfa.py From pyts with BSD 3-Clause "New" or "Revised" License
def _compute_expected_results(X, y=None, n_coefs=None, n_bins=4,
                              strategy='quantile', drop_sum=False, anova=False,
                              norm_mean=False, norm_std=False, alphabet=None):
    """Compute the expected results."""
    # NOTE: n_samples is a module-level constant defined elsewhere in the test file.
    X = np.asarray(X)
    if norm_mean:
        X -= X.mean(axis=1)[:, None]
    if norm_std:
        X /= X.std(axis=1)[:, None]
    X_fft = np.fft.rfft(X)
    X_fft = np.vstack([np.real(X_fft), np.imag(X_fft)])
    X_fft = X_fft.reshape(n_samples, -1, order='F')
    if drop_sum:
        X_fft = X_fft[:, 2:-1]
    else:
        X_fft = np.hstack([X_fft[:, :1], X_fft[:, 2:-1]])
    if n_coefs is not None:
        if anova:
            _, p = f_classif(X_fft, y)
            support = np.argsort(p)[:n_coefs]
            X_fft = X_fft[:, support]
        else:
            X_fft = X_fft[:, :n_coefs]
    mcb = MultipleCoefficientBinning(n_bins=n_bins, strategy=strategy,
                                     alphabet=alphabet)
    arr_desired = mcb.fit_transform(X_fft)
    return arr_desired
Example #26
Source File: FieldSelector.py From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
    self.handle_options(options)
    out_params = convert_params(
        options.get('params', {}),
        floats=['param'],
        strs=['type', 'mode'],
        aliases={'type': 'score_func'},
    )

    if 'score_func' not in out_params:
        out_params['score_func'] = f_classif
    else:
        if out_params['score_func'].lower() == 'categorical':
            out_params['score_func'] = f_classif
        elif out_params['score_func'].lower() in ['numerical', 'numeric']:
            out_params['score_func'] = f_regression
        else:
            raise RuntimeError('type can either be categorical or numeric.')

    if 'mode' in out_params:
        if out_params['mode'] not in ('k_best', 'fpr', 'fdr', 'fwe', 'percentile'):
            raise RuntimeError('mode can only be one of the following: fdr, fpr, fwe, k_best, and percentile')
        if out_params['mode'] in ['fpr', 'fdr', 'fwe']:
            if 'param' in out_params:
                if not 0 < out_params['param'] < 1:
                    msg = 'Invalid param value for mode {}: param must be between 0 and 1.'.format(out_params['mode'])
                    raise ValueError(msg)

    # k_best and percentile require integer param
    if 'param' in out_params and out_params.get('mode') not in ['fdr', 'fpr', 'fwe']:
        original_value = out_params['param']
        out_params['param'] = int(out_params['param'])
        if out_params['param'] != original_value:
            msg = 'param value {} is not an integer; mode={} requires an integer.'
            msg = msg.format(original_value, out_params.get('mode', 'percentile'))
            raise ValueError(msg)

    self.estimator = GenericUnivariateSelect(**out_params)
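For reference, GenericUnivariateSelect, which the constructor above configures, accepts exactly the mode/param pairs being validated. A minimal sketch of direct usage (iris data; this is plain scikit-learn, not Splunking-Crime code):

from sklearn.datasets import load_iris
from sklearn.feature_selection import GenericUnivariateSelect, f_classif

X, y = load_iris(return_X_y=True)

# mode='k_best' takes an integer param; 'fpr'/'fdr'/'fwe' take an alpha in (0, 1)
selector = GenericUnivariateSelect(score_func=f_classif, mode='k_best', param=2)
X_reduced = selector.fit_transform(X, y)
print(X_reduced.shape)           # (150, 2)
print(selector.get_support())    # boolean mask over the original features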
Example #27
Source File: FieldSelector.py From Splunking-Crime with GNU Affero General Public License v3.0
def decode(cls, obj):
    from sklearn.feature_selection import f_classif, f_regression, GenericUnivariateSelect

    new_obj = GenericUnivariateSelect.__new__(GenericUnivariateSelect)
    new_obj.__dict__ = obj['dict']

    if new_obj.score_func == 'f_classif':
        new_obj.score_func = f_classif
    elif new_obj.score_func == 'f_regression':
        new_obj.score_func = f_regression
    else:
        raise ValueError('Unsupported GenericUnivariateSelect.score_func "%s"' % new_obj.score_func)

    return new_obj
Example #28
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0
def test_pipeline_score_save():
    """Assert that the TPOTClassifier can generate a scored pipeline export correctly."""
    tpot_obj = TPOTClassifier()
    tpot_obj._fit_init()
    tpot_obj._pbar = tqdm(total=1, disable=True)
    pipeline_string = (
        'DecisionTreeClassifier(SelectPercentile(input_matrix, SelectPercentile__percentile=20),'
        'DecisionTreeClassifier__criterion=gini, DecisionTreeClassifier__max_depth=8,'
        'DecisionTreeClassifier__min_samples_leaf=5, DecisionTreeClassifier__min_samples_split=5)'
    )
    pipeline = creator.Individual.from_string(pipeline_string, tpot_obj._pset)
    expected_code = """import numpy as np
import pandas as pd
from sklearn.feature_selection import SelectPercentile, f_classif
from sklearn.model_selection import train_test_split
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# NOTE: Make sure that the outcome column is labeled 'target' in the data file
tpot_data = pd.read_csv('PATH/TO/DATA/FILE', sep='COLUMN_SEPARATOR', dtype=np.float64)
features = tpot_data.drop('target', axis=1)
training_features, testing_features, training_target, testing_target = \\
            train_test_split(features, tpot_data['target'], random_state=None)

# Average CV score on the training set was: 0.929813743
exported_pipeline = make_pipeline(
    SelectPercentile(score_func=f_classif, percentile=20),
    DecisionTreeClassifier(criterion="gini", max_depth=8, min_samples_leaf=5, min_samples_split=5)
)

exported_pipeline.fit(training_features, training_target)
results = exported_pipeline.predict(testing_features)
"""
    assert_equal(expected_code,
                 export_pipeline(pipeline, tpot_obj.operators, tpot_obj._pset,
                                 pipeline_score=0.929813743))
Example #29
Source File: export_tests.py From tpot with GNU Lesser General Public License v3.0
def test_operator_export():
    """Assert that a TPOT operator can export properly with a callable function as a parameter."""
    assert list(TPOTSelectPercentile.arg_types) == TPOTSelectPercentile_args
    export_string = TPOTSelectPercentile.export(5)
    assert export_string == "SelectPercentile(score_func=f_classif, percentile=5)"
Example #30
Source File: test_pipeline.py From Mastering-Elasticsearch-7.0 with MIT License
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)