Python sklearn.feature_selection.SelectKBest() Examples
The following are 30 code examples of sklearn.feature_selection.SelectKBest(), taken from open-source projects; each example lists its original project and source file.
You may also want to check out all available functions and classes of the module sklearn.feature_selection.
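Before diving into the project code, here is a minimal, self-contained sketch of the typical SelectKBest workflow. The dataset and the choice of k below are illustrative only and do not come from any of the projects listed:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_iris(return_X_y=True)

# Keep the 2 features with the highest ANOVA F-scores.
selector = SelectKBest(score_func=f_classif, k=2)
X_reduced = selector.fit_transform(X, y)

print(X_reduced.shape)         # (150, 2)
print(selector.get_support())  # boolean mask of the kept columns
print(selector.scores_)        # per-feature scores computed during fit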
Example #1
Source File: DataAnalysis.py From Predicting-Health-Insurance-Cost with BSD 3-Clause "New" or "Revised" License | 8 votes |
def featuresFromFeatureSelection(X, Y, columnNames):
    for f in columnNames:
        print(f)
    X_new_withfitTransform = SelectKBest(chi2, k=34).fit(X, Y)
    colors = getColorNames()
    counter = 0
    scores = X_new_withfitTransform.scores_
    scores_scaled = np.divide(scores, 1000)
    for score in scores_scaled:
        #if(score > 10):
        #print('Feature {:>34}'.format(columnNames[counter]))
        print('{:>34} '.format(score))
        '''Plot a graph'''
        plt.bar(counter, score, color=colors[counter])
        counter += 1
    plt.ylabel('Scores(1k)')
    plt.title('Scores calculated by Chi-Square Test')
    plt.legend(columnNames, bbox_to_anchor=(0., 0.8, 1., .102), loc=3,
               ncol=5, mode="expand", borderaxespad=0.)
    plt.show()
    #print(feature_selection.chi2(X,Y))
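The chi-square values plotted in Example #1 come from the fitted selector's scores_ attribute; the matching p-values are available in pvalues_. The following is a small illustrative sketch of ranking features by score, using the iris dataset and its feature names as stand-ins for the example's X, Y and columnNames:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

X, Y = load_iris(return_X_y=True)
columnNames = ['sepal length', 'sepal width', 'petal length', 'petal width']

fitted = SelectKBest(chi2, k=2).fit(X, Y)
ranking = sorted(zip(columnNames, fitted.scores_, fitted.pvalues_),
                 key=lambda t: t[1], reverse=True)
for name, score, p in ranking:
    print('{:>15}  chi2={:8.2f}  p={:.3g}'.format(name, score, p))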
Example #2
Source File: tester.py From Text-Classification-Benchmark with MIT License | 7 votes |
def feature_select(corpus, labels, k=1000):
    """
    select top k features through chi-square test
    """
    bin_cv = CountVectorizer(binary=True)
    le = LabelEncoder()
    X = bin_cv.fit_transform(corpus)
    y = le.fit_transform(labels).reshape(-1, 1)

    k = min(X.shape[1], k)
    skb = SelectKBest(chi2, k=k)
    skb.fit(X, y)

    feature_ids = skb.get_support(indices=True)
    feature_names = bin_cv.get_feature_names()
    vocab = {}
    for new_fid, old_fid in enumerate(feature_ids):
        feature_name = feature_names[old_fid]
        vocab[feature_name] = new_fid

    # we only care about the final extracted feature vocabulary
    return vocab
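The vocabulary returned by Example #2 can be fed back into a fresh CountVectorizer so that later documents are vectorised with only the selected terms. The snippet below is a hypothetical follow-up, assuming feature_select and the imports from its source file (CountVectorizer, LabelEncoder, SelectKBest, chi2) are in scope; note also that newer scikit-learn releases replace get_feature_names with get_feature_names_out:

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus and labels, for illustration only.
vocab = feature_select(["good movie", "bad movie", "great film"],
                       ["pos", "neg", "pos"], k=3)
reduced_cv = CountVectorizer(binary=True, vocabulary=vocab)
X_reduced = reduced_cv.transform(["a great movie"])
print(X_reduced.toarray())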
Example #3
Source File: train.py From skorch with BSD 3-Clause "New" or "Revised" License | 7 votes |
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.
    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model
Example #4
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 7 votes |
def ReducedFeaturesDF(X, y):
    '''
    Returns a dataframe with only a subset of features/columns retained
    '''
    from sklearn.feature_selection import RFE
    est = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto')
    # selectK = SelectKBest(score_func = f_classif, k=45)
    selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
    selectK = selectRFE

    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = feature_names[selectK_mask]
    print("reduced RFE features:")
    print(K_featnames)
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    # Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df

# ReducedFeaturesDF(X,y)
# z=pd.DataFrame(data=X_SGD,index=y)
# z.to_csv('REDUCED_Feat.csv')
Example #5
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 6 votes |
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('ANOVA: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = SelectKBest(f_classif, k=self.GetSelectedFeatureNumber())
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    f_value, p_value = f_classif(data, label)

    return feature_index.tolist(), f_value, p_value
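After fitting, SelectKBest stores the score function's output on the estimator, so the separate f_classif call at the end of Example #5 recomputes values that are already available. A minimal sketch of that shortcut, using iris as a toy stand-in for the data container's array and label:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

data, label = load_iris(return_X_y=True)
fs = SelectKBest(f_classif, k=2).fit(data, label)

feature_index = fs.get_support(True)
f_value, p_value = fs.scores_, fs.pvalues_   # same arrays f_classif(data, label) would return
print(feature_index.tolist(), f_value, p_value)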
Example #6
Source File: dominance.py From dominance-analysis with MIT License | 6 votes |
def get_top_k(self):
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    # remove intercept from top_k
    if(self.objective):
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except:
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
Example #7
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 6 votes |
def find_best_feature_selections(X, y):
    # select the best features using different techniques
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)

    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)

    print(X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print(y.shape)

    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)

#X,y = _dataset_sample()

################################PARAMETER Selected################################
#TODO some problem happens when using the parameter max_leaf_nodes in Dtree and RandomForest
Example #8
Source File: test_core_pipeline.py From lale with Apache License 2.0 | 6 votes |
def test_export_to_sklearn_pipeline3(self):
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression, SVC
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    lale_pipeline = ((PCA() >> SelectKBest(k=2)) &
                     (Nystroem(random_state=42) >> SelectKBest(k=3)) &
                     (SelectKBest(k=3))) >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    self.assertIsInstance(sklearn_pipeline.named_steps['featureunion'], FeatureUnion)
    self.assertIsInstance(sklearn_pipeline.named_steps['selectkbest'], SelectKBest)
    from sklearn.linear_model import LogisticRegression
    self.assertIsInstance(sklearn_pipeline.named_steps['logisticregression'], LogisticRegression)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
Example #9
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.feature_selection.GenericUnivariateSelect, fs.GenericUnivariateSelect)
    self.assertIs(df.feature_selection.SelectPercentile, fs.SelectPercentile)
    self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
    self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
    self.assertIs(df.feature_selection.SelectFromModel, fs.SelectFromModel)
    self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
    self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
    self.assertIs(df.feature_selection.RFE, fs.RFE)
    self.assertIs(df.feature_selection.RFECV, fs.RFECV)
    self.assertIs(df.feature_selection.VarianceThreshold, fs.VarianceThreshold)
Example #10
Source File: test_base.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_pipeline(self):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.pipeline import Pipeline

    diabetes = datasets.load_diabetes()
    models = ['OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM']

    for model in models:
        klass = getattr(sm, model)

        selector = SelectKBest(f_regression, k=5)
        estimator = Pipeline([('selector', selector),
                              ('reg', base.StatsModelsRegressor(klass))])

        estimator.fit(diabetes.data, diabetes.target)
        result = estimator.predict(diabetes.data)

        data = SelectKBest(f_regression, k=5).fit_transform(diabetes.data, diabetes.target)
        expected = klass(diabetes.target, data).fit().predict(data)
        self.assert_numpy_array_almost_equal(result, expected)
Example #11
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 6 votes |
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Gets best features using chosen method
    (K-best, RFE, RFECV, 'L1' (RandomizedLogisticRegression), 'Tree' (ExtraTreesClassifier), mrmr),
    then prints top K features' names (from featNames).
    If reduceMatrix = True, then also returns X reduced to the K best features.
    Available methods' names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression',
    'K-best', 'ExtraTreesClassifier'.
    Note that effectively, any scikit-learn method could be used, if correctly imported.
    '''
    #est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test,
    more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file,
    "REDUCED_Feat.csv"
    '''
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names = lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:', X.shape)
    print("K_featnames: %s" % (K_featnames))
    if reduceMatrix == True:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

#WORKS! But unreadable with too many features!
Example #12
Source File: Model_trainer.py From ProFET with GNU General Public License v3.0 | 6 votes |
def featureFitting(filename, X, y, featureNames, optimalFlag, kbest=20, alpha=0.05, model=None):
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test,
    more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file,
    "REDUCED_Feat.csv"
    Returns new features matrix, FD scaler, and K-select scaler
    '''
    a = alpha
    FD = SelectFdr(alpha=a)
    X = FD.fit_transform(X, y)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print("K_featnames: %s" % (K_featnames))
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df, FD, selectK
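Note that Examples #11 and #12 construct SelectKBest(k=kbest) without naming a score function; in that case scikit-learn falls back to the default, f_classif (ANOVA F-test for classification). The line below is an equivalent, explicit spelling of that call, shown as a sketch rather than as ProFET code:

from sklearn.feature_selection import SelectKBest, f_classif

kbest = 30
selectK = SelectKBest(score_func=f_classif, k=kbest)  # same behaviour as SelectKBest(k=kbest)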
Example #13
Source File: model.py From student-performance-prediction with MIT License | 5 votes |
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)

    confuse(y_test, clf.predict(X_test))
    print()
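Because SelectKBest sits inside the Pipeline in Example #13, cross_val_score refits the selector on each training fold, so the held-out fold never influences which features are kept. A possible extension of this pattern is to tune k together with the classifier; the sketch below is illustrative and uses iris and an ad-hoc parameter grid, not the project's data:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
clf = Pipeline([
    ('reduce_dim', SelectKBest(chi2, k=2)),
    ('train', LinearSVC(C=100)),
])
# Tune the number of kept features together with the classifier.
search = GridSearchCV(clf, {'reduce_dim__k': [1, 2, 3], 'train__C': [1, 10, 100]}, cv=5)
search.fit(X, y)
print(search.best_params_)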
Example #14
Source File: utils.py From IoT-device-type-identification with MIT License | 5 votes |
def perform_feature_selection(X_train, y_train, k_val): """ This method is used in order to perform a feature selection by selecting the best k_val features from X_train. It does so according to the chi2 criterion. The method prints the chosen features and creates a new instance of X_train with only these features and returns it """ print("**********FEATURE SELECTION**********") # Create and fit selector selector = SelectKBest(chi2, k=k_val) selector.fit(X_train, y_train) # Get idxs of columns to keep idxs_selected = selector.get_support(indices=True) print(idxs_selected) x_new = SelectKBest(chi2, k=k_val).fit_transform(X_train, y_train) return x_new
Example #15
Source File: models.py From IoT-device-type-identification with MIT License | 5 votes |
def perform_feature_selection(X_train, y_train, k_val): """ This method is used in order to perform a feature selection by selecting the best k_val features from X_train. It does so according to the chi2 criterion. The method prints the chosen features and creates a new instance of X_train with only these features and returns it """ print("**********FEATURE SELECTION**********") # Create and fit selector selector = SelectKBest(chi2, k=k_val) selector.fit(X_train, y_train) #Get idxs of columns to keep idxs_selected = selector.get_support(indices=True) print(idxs_selected) X_new = SelectKBest(chi2, k=k_val).fit_transform(X_train, y_train) return X_new
Example #16
Source File: test_bagging.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert_true(isinstance(estimator[0].steps[-1][1].random_state, int))
Example #17
Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
Example #18
Source File: test_chi2.py From twitter-stock-recommendation with MIT License | 5 votes |
def mkchi2(k):
    """Make k-best chi2 selector"""
    return SelectKBest(chi2, k=k)
Example #19
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #20
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 5 votes |
def precision_recall_curve_draw(X_o, y):

    X = SelectKBest(f_classif, k=80).fit_transform(X_o, y)

    print(X.shape)
    print(y.shape)

    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)

    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')

    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=1,
                                   min_samples_split=2, random_state=None, splitter='best')

    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)

    p_svmrbf, r_svmrbf, auc_svmrbf = get_my_pecision_recall(svmrbf, X, y)
    p_knn, r_knn, auc_knn = get_my_pecision_recall(knn, X, y)
    p_dtree, r_dtree, auc_dtree = get_my_pecision_recall(dtree, X, y)
    p_rforest, r_rforest, auc_rforest = get_my_pecision_recall(rforest, X, y)

    plt.clf()

    plt.plot(r_svmrbf, p_svmrbf, 'y.--', label='SVM auc=%0.3f' % auc_svmrbf)
    plt.plot(r_knn, p_knn, 'r^--', label='KNN auc=%0.3f' % auc_knn)
    plt.plot(r_dtree, p_dtree, 'b>--', label='Decision Tree auc=%0.3f' % auc_dtree)
    plt.plot(r_rforest, p_rforest, 'go--', label='Random Forest auc=%0.3f' % auc_rforest)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('recall rate')
    plt.ylabel('precision rate')
    plt.title('precision-recall curve')
    plt.legend(loc="lower right")
    plt.show()

    del X
    del y

############################################### Examples to show the difference of features representation ##################################
Example #21
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 5 votes |
def my_get_fp_fn_CV(X_original, y):

    # generate classifiers
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')

    # decision tree
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=4,
                                   min_samples_split=2, random_state=None, splitter='best')

    # naive
    #nbbern = BernoulliNB()

    # random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)

    # svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)

    # reduce the size
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original, y)
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    X = skb.fit_transform(X_original, y)

    print("KNN")
    my_get_fp_fn_inter(knn, X, y)
    print("DTree")
    my_get_fp_fn_inter(dtree, X, y)
    print("rforest")
    my_get_fp_fn_inter(rforest, X, y)
    #print("naive bayes")
    #my_get_fp_fn_inter(nbbern, X, y)
    print("SVMrbf")
    my_get_fp_fn_inter(svmrbf, X, y)
Example #22
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
Example #23
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
Example #24
Source File: kerasExperiments.py From emailinsight with MIT License | 5 votes |
def select_best_features(dataset, train_labels, num_best, verbose=True):
    (X_train, Y_train), (X_test, Y_test) = dataset
    if verbose:
        print('\nSelecting %d best features\n' % num_best)
    selector = SelectKBest(chi2, k=num_best)
    X_train = selector.fit_transform(X_train, train_labels)
    X_test = selector.transform(X_test)
    return ((X_train, Y_train), (X_test, Y_test)), selector.scores_
Example #25
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0 | 5 votes |
def _select_K_best(self, k):
    if self._problem == FeatureSelector.CLASSIFICATION:
        score = f_classif
    else:
        score = f_regression
    self._selector = SelectKBest(score, k)
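Example #25 switches between f_classif and f_regression based on the problem type. chi2, used in several of the other examples, is a further option for classification, but it only accepts non-negative features (e.g. counts or TF-IDF weights). The helper below is a made-up illustration of such a dispatch, not part of the CDSS code:

from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression

def make_selector(problem, k, nonnegative_features=False):
    """Pick a score function for SelectKBest based on the task (illustrative helper)."""
    if problem == 'regression':
        score_func = f_regression
    elif nonnegative_features:
        score_func = chi2          # chi2 requires features >= 0
    else:
        score_func = f_classif
    return SelectKBest(score_func, k=k)

selector = make_selector('classification', k=10, nonnegative_features=True)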
Example #26
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 5 votes |
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('KW: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = SelectKBest(self.KruskalWallisAnalysis, k=self.GetSelectedFeatureNumber())
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    self._f_value, self._p_value = self.KruskalWallisAnalysis(data, label)

    return feature_index.tolist()
Example #27
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0 | 5 votes |
def test_fit(self):
    selector = SelectKBest(score_func=f_regression, k=1)
    selector_proxy = SelectorProxy(selector)
    self.assertFalse(hasattr(selector_proxy, "support_mask_"))
    selector_proxy.fit(numpy.array([[0, 0], [1.0, 2.0]]),
                       numpy.array([0.5, 1.0]))
    self.assertEqual([0, 1], selector._get_support_mask().tolist())
    self.assertEqual([0, 1], selector_proxy.support_mask_.tolist())
Example #28
Source File: test_core_pipeline.py From lale with Apache License 2.0 | 5 votes |
def test_import_from_sklearn_pipeline_nested_pipeline1(self):
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.feature_selection import SelectKBest
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline
    union = FeatureUnion([
        ("selectkbest_pca", make_pipeline(
            SelectKBest(k=3),
            FeatureUnion([
                ('pca', PCA(n_components=1)),
                ('nested_pipeline', make_pipeline(SelectKBest(k=2), Nystroem()))]))),
        ("nys", Nystroem(n_components=2, random_state=42))])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 8)
    # These assertions assume topological sort, which may not be unique. So the assertions are brittle.
    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    from lale.lib.sklearn.select_k_best import SelectKBestImpl
    self.assertEqual(lale_pipeline.edges()[0][0]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[0][1]._impl_class(), PCAImpl)
    self.assertEqual(lale_pipeline.edges()[1][0]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[1][1]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[2][0]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[2][1]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[3][0]._impl_class(), PCAImpl)
    self.assertEqual(lale_pipeline.edges()[3][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[4][0]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[4][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[5][0]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[5][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[6][0]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[6][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[7][0]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[7][1]._impl_class(), KNeighborsClassifierImpl)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Example #29
Source File: test_pipeline.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
Example #30
Source File: svm_classifier.py From nlp-journey with Apache License 2.0 | 5 votes |
def __select_features(data_set):
    dataset = [clean_en_text(data) for data in data_set[0]]
    tf_idf_model = TfidfVectorizer(ngram_range=(1, 1),
                                   binary=True,
                                   sublinear_tf=True)
    tf_vectors = tf_idf_model.fit_transform(dataset)

    # select the top portion of the terms to use as features
    k = int(tf_vectors.shape[1] / 6)
    chi_model = SelectKBest(chi2, k=k)
    chi_features = chi_model.fit_transform(tf_vectors, data_set[1])
    print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
    print('chi:\t\t' + str(chi_features.shape[1]))

    return chi_features, tf_idf_model, chi_model
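To see which terms survive the chi-square filter in Example #30, the selector's support mask can be mapped back through the TF-IDF vocabulary. The sketch below is self-contained and uses a toy corpus in place of the project's data; it assumes a recent scikit-learn, where the vectorizer exposes get_feature_names_out (older releases use get_feature_names instead):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Toy stand-ins for the cleaned corpus and its labels.
corpus = ["good movie", "bad movie", "great film", "boring film"]
labels = [1, 0, 1, 0]

tf_idf_model = TfidfVectorizer(ngram_range=(1, 1), binary=True, sublinear_tf=True)
tf_vectors = tf_idf_model.fit_transform(corpus)
chi_model = SelectKBest(chi2, k=2).fit(tf_vectors, labels)

# Map the support mask back to the vocabulary to see which terms were kept.
feature_names = np.asarray(tf_idf_model.get_feature_names_out())
print(feature_names[chi_model.get_support()])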