Python sklearn.feature_selection.SelectKBest() Examples
The following are 30 code examples of sklearn.feature_selection.SelectKBest(), taken from open-source projects; each example lists its original project and source file.
You may also want to check out all available functions and classes of the module sklearn.feature_selection.
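Before diving into the project code, here is a minimal, self-contained sketch of the typical SelectKBest workflow. The dataset and the choice of k below are illustrative only and do not come from any of the projects listed:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

X, y = load_iris(return_X_y=True)

# Keep the 2 features with the highest ANOVA F-scores.
selector = SelectKBest(score_func=f_classif, k=2)
X_reduced = selector.fit_transform(X, y)

print(X_reduced.shape)         # (150, 2)
print(selector.get_support())  # boolean mask of the kept columns
print(selector.scores_)        # per-feature scores computed during fit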
Example #1
Source File: DataAnalysis.py From Predicting-Health-Insurance-Cost with BSD 3-Clause "New" or "Revised" License | 8 votes |
def featuresFromFeatureSelection(X, Y, columnNames):
    for f in columnNames:
        print(f)
    X_new_withfitTransform = SelectKBest(chi2, k=34).fit(X, Y)
    colors = getColorNames()
    counter = 0
    scores = X_new_withfitTransform.scores_
    scores_scaled = np.divide(scores, 1000)
    for score in scores_scaled:
        #if(score > 10):
        #print('Feature {:>34}'.format(columnNames[counter]))
        print('{:>34} '.format(score))
        '''Plot a graph'''
        plt.bar(counter, score, color=colors[counter])
        counter += 1
    plt.ylabel('Scores(1k)')
    plt.title('Scores calculated by Chi-Square Test')
    plt.legend(columnNames, bbox_to_anchor=(0., 0.8, 1., .102), loc=3,
               ncol=5, mode="expand", borderaxespad=0.)
    plt.show()
    #print(feature_selection.chi2(X,Y))
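The chi-square values plotted in Example #1 come from the fitted selector's scores_ attribute; the matching p-values are available in pvalues_. The following is a small illustrative sketch of ranking features by score, using the iris dataset and its feature names as stand-ins for the example's X, Y and columnNames:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2

X, Y = load_iris(return_X_y=True)
columnNames = ['sepal length', 'sepal width', 'petal length', 'petal width']

fitted = SelectKBest(chi2, k=2).fit(X, Y)
ranking = sorted(zip(columnNames, fitted.scores_, fitted.pvalues_),
                 key=lambda t: t[1], reverse=True)
for name, score, p in ranking:
    print('{:>15}  chi2={:8.2f}  p={:.3g}'.format(name, score, p))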
Example #2
Source File: tester.py From Text-Classification-Benchmark with MIT License | 7 votes |
def feature_select(corpus, labels, k=1000):
    """
    select top k features through chi-square test
    """
    bin_cv = CountVectorizer(binary=True)
    le = LabelEncoder()
    X = bin_cv.fit_transform(corpus)
    y = le.fit_transform(labels).reshape(-1, 1)

    k = min(X.shape[1], k)
    skb = SelectKBest(chi2, k=k)
    skb.fit(X, y)

    feature_ids = skb.get_support(indices=True)
    feature_names = bin_cv.get_feature_names()
    vocab = {}
    for new_fid, old_fid in enumerate(feature_ids):
        feature_name = feature_names[old_fid]
        vocab[feature_name] = new_fid

    # we only care about the final extracted feature vocabulary
    return vocab
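The vocabulary returned by Example #2 can be fed back into a fresh CountVectorizer so that later documents are vectorised with only the selected terms. The snippet below is a hypothetical follow-up, assuming feature_select and the imports from its source file (CountVectorizer, LabelEncoder, SelectKBest, chi2) are in scope; note also that newer scikit-learn releases replace get_feature_names with get_feature_names_out:

from sklearn.feature_extraction.text import CountVectorizer

# Toy corpus and labels, for illustration only.
vocab = feature_select(["good movie", "bad movie", "great film"],
                       ["pos", "neg", "pos"], k=3)
reduced_cv = CountVectorizer(binary=True, vocabulary=vocab)
X_reduced = reduced_cv.transform(["a great movie"])
print(X_reduced.toarray())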
Example #3
Source File: train.py From skorch with BSD 3-Clause "New" or "Revised" License | 7 votes |
def get_model(with_pipeline=False):
    """Get a multi-layer perceptron model.

    Optionally, put it in a pipeline that scales the data.
    """
    model = NeuralNetClassifier(MLPClassifier)
    if with_pipeline:
        model = Pipeline([
            ('scale', FeatureUnion([
                ('minmax', MinMaxScaler()),
                ('normalize', Normalizer()),
            ])),
            ('select', SelectKBest(k=N_FEATURES)),  # keep input size constant
            ('net', model),
        ])
    return model
Example #4
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 7 votes |
def ReducedFeaturesDF(X, y):
    '''
    Returns a dataframe with only a subset of features/columns retained
    '''
    from sklearn.feature_selection import RFE
    est = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto')
    # selectK = SelectKBest(score_func = f_classif, k=45)
    selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
    selectK = selectRFE

    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = feature_names[selectK_mask]
    print("reduced RFE features:")
    print(K_featnames)
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    # Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df

# ReducedFeaturesDF(X,y)
# z=pd.DataFrame(data=X_SGD,index=y)
# z.to_csv('REDUCED_Feat.csv')
Example #5
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 6 votes |
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('ANOVA: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = SelectKBest(f_classif, k=self.GetSelectedFeatureNumber())
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    f_value, p_value = f_classif(data, label)

    return feature_index.tolist(), f_value, p_value
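After fitting, SelectKBest stores the score function's output on the estimator, so the separate f_classif call at the end of Example #5 recomputes values that are already available. A minimal sketch of that shortcut, using iris as a toy stand-in for the data container's array and label:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, f_classif

data, label = load_iris(return_X_y=True)
fs = SelectKBest(f_classif, k=2).fit(data, label)

feature_index = fs.get_support(True)
f_value, p_value = fs.scores_, fs.pvalues_   # same arrays f_classif(data, label) would return
print(feature_index.tolist(), f_value, p_value)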
Example #6
Source File: dominance.py From dominance-analysis with MIT License | 6 votes |
def get_top_k(self):
    columns = list(self.data.columns.values)
    columns.remove(self.target)
    # remove intercept from top_k
    if(self.objective):
        top_k_vars = SelectKBest(f_regression, k=self.top_k)
        top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    else:
        columns.remove('intercept')
        try:
            top_k_vars = SelectKBest(chi2, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
        except:
            top_k_vars = SelectKBest(f_classif, k=self.top_k)
            top_k_vars.fit_transform(self.data[columns], self.data[self.target])
    return [columns[i] for i in top_k_vars.get_support(indices=True)]
Example #7
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 6 votes |
def find_best_feature_selections(X, y):
    # select the best features using different techniques
    X_new = SelectKBest(chi2, k=80).fit_transform(X, y)
    X_new1 = SelectPercentile(chi2, percentile=20).fit_transform(X, y)

    X_new2 = SelectKBest(f_classif, k=80).fit_transform(X, y)  # this one has the best performance
    X_new22 = SelectPercentile(f_classif, percentile=20).fit_transform(X, y)
    X_new3 = SelectKBest(f_classif, k=70).fit_transform(X, y)
    X_new4 = SelectKBest(f_classif, k=60).fit_transform(X, y)

    print(X_new.shape)
    #selection_parameters_for_classfier(X_new,y)
    #print(y.shape)

    train_and_test(X_new, y)
    train_and_test(X_new1, y)
    train_and_test(X_new2, y)
    train_and_test(X_new22, y)
    train_and_test(X_new3, y)
    train_and_test(X_new4, y)

#X,y = _dataset_sample()

################################PARAMETER Selected################################
#TODO some problem happens when using the parameter max_leaf_nodes in Dtree and RandomForest
Example #8
Source File: test_core_pipeline.py From lale with Apache License 2.0 | 6 votes |
def test_export_to_sklearn_pipeline3(self):
    from lale.lib.lale import ConcatFeatures
    from lale.lib.sklearn import PCA
    from lale.lib.sklearn import KNeighborsClassifier, LogisticRegression, SVC
    from sklearn.feature_selection import SelectKBest
    from lale.lib.sklearn import Nystroem
    from sklearn.pipeline import FeatureUnion

    lale_pipeline = ((PCA() >> SelectKBest(k=2)) &
                     (Nystroem(random_state=42) >> SelectKBest(k=3)) &
                     (SelectKBest(k=3))) >> ConcatFeatures() >> SelectKBest(k=2) >> LogisticRegression()
    trained_lale_pipeline = lale_pipeline.fit(self.X_train, self.y_train)
    sklearn_pipeline = trained_lale_pipeline.export_to_sklearn_pipeline()
    self.assertIsInstance(sklearn_pipeline.named_steps['featureunion'], FeatureUnion)
    self.assertIsInstance(sklearn_pipeline.named_steps['selectkbest'], SelectKBest)
    from sklearn.linear_model import LogisticRegression
    self.assertIsInstance(sklearn_pipeline.named_steps['logisticregression'], LogisticRegression)
    self.assert_equal_predictions(sklearn_pipeline, trained_lale_pipeline)
Example #9
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.feature_selection.GenericUnivariateSelect, fs.GenericUnivariateSelect)
    self.assertIs(df.feature_selection.SelectPercentile, fs.SelectPercentile)
    self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
    self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
    self.assertIs(df.feature_selection.SelectFromModel, fs.SelectFromModel)
    self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
    self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
    self.assertIs(df.feature_selection.RFE, fs.RFE)
    self.assertIs(df.feature_selection.RFECV, fs.RFECV)
    self.assertIs(df.feature_selection.VarianceThreshold, fs.VarianceThreshold)
Example #10
Source File: test_base.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_pipeline(self):
    from sklearn.feature_selection import SelectKBest
    from sklearn.feature_selection import f_regression
    from sklearn.pipeline import Pipeline

    diabetes = datasets.load_diabetes()
    models = ['OLS', 'GLS', 'WLS', 'GLSAR', 'QuantReg', 'GLM', 'RLM']

    for model in models:
        klass = getattr(sm, model)

        selector = SelectKBest(f_regression, k=5)
        estimator = Pipeline([('selector', selector),
                              ('reg', base.StatsModelsRegressor(klass))])

        estimator.fit(diabetes.data, diabetes.target)
        result = estimator.predict(diabetes.data)

        data = SelectKBest(f_regression, k=5).fit_transform(diabetes.data, diabetes.target)
        expected = klass(diabetes.target, data).fit().predict(data)
        self.assert_numpy_array_almost_equal(result, expected)
Example #11
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 6 votes |
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Gets best features using chosen method
    (K-best, RFE, RFECV, 'L1' (RandomizedLogisticRegression), 'Tree' (ExtraTreesClassifier), mrmr),
    then prints top K features' names (from featNames).
    If reduceMatrix = True, then also returns X reduced to the K best features.
    Available methods' names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression',
    'K-best', 'ExtraTreesClassifier'.
    Note that effectively, any scikit-learn method could be used, if correctly imported.
    '''
    #est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test,
    more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file,
    "REDUCED_Feat.csv"
    '''
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names = lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:', X.shape)
    print("K_featnames: %s" % (K_featnames))
    if reduceMatrix == True:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

#WORKS! But unreadable with too many features!
Example #12
Source File: Model_trainer.py From ProFET with GNU General Public License v3.0 | 6 votes |
def featureFitting(filename, X, y, featureNames, optimalFlag, kbest=20, alpha=0.05, model=None):
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test,
    more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output file,
    "REDUCED_Feat.csv"
    Returns new features matrix, FD scaler, and K-select scaler
    '''
    a = alpha
    FD = SelectFdr(alpha=a)
    X = FD.fit_transform(X, y)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print("K_featnames: %s" % (K_featnames))
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df, FD, selectK
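Note that Examples #11 and #12 construct SelectKBest(k=kbest) without naming a score function; in that case scikit-learn falls back to the default, f_classif (ANOVA F-test for classification). The line below is an equivalent, explicit spelling of that call, shown as a sketch rather than as ProFET code:

from sklearn.feature_selection import SelectKBest, f_classif

kbest = 30
selectK = SelectKBest(score_func=f_classif, k=kbest)  # same behaviour as SelectKBest(k=kbest)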
Example #13
Source File: model.py From student-performance-prediction with MIT License | 5 votes |
def train_and_score(X, y):
    X_train, X_test, y_train, y_test = split_data(X, y)

    clf = Pipeline([
        ('reduce_dim', SelectKBest(chi2, k=2)),
        ('train', LinearSVC(C=100))
    ])

    scores = cross_val_score(clf, X_train, y_train, cv=5, n_jobs=2)
    print("Mean Model Accuracy:", np.array(scores).mean())

    clf.fit(X_train, y_train)

    confuse(y_test, clf.predict(X_test))
    print()
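Because SelectKBest sits inside the Pipeline in Example #13, cross_val_score refits the selector on each training fold, so the held-out fold never influences which features are kept. A possible extension of this pattern is to tune k together with the classifier; the sketch below is illustrative and uses iris and an ad-hoc parameter grid, not the project's data:

from sklearn.datasets import load_iris
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import LinearSVC

X, y = load_iris(return_X_y=True)
clf = Pipeline([
    ('reduce_dim', SelectKBest(chi2, k=2)),
    ('train', LinearSVC(C=100)),
])
# Tune the number of kept features together with the classifier.
search = GridSearchCV(clf, {'reduce_dim__k': [1, 2, 3], 'train__C': [1, 10, 100]}, cv=5)
search.fit(X, y)
print(search.best_params_)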
Example #14
Source File: utils.py From IoT-device-type-identification with MIT License | 5 votes |
def perform_feature_selection(X_train, y_train, k_val): """ This method is used in order to perform a feature selection by selecting the best k_val features from X_train. It does so according to the chi2 criterion. The method prints the chosen features and creates a new instance of X_train with only these features and returns it """ print("**********FEATURE SELECTION**********") # Create and fit selector selector = SelectKBest(chi2, k=k_val) selector.fit(X_train, y_train) # Get idxs of columns to keep idxs_selected = selector.get_support(indices=True) print(idxs_selected) x_new = SelectKBest(chi2, k=k_val).fit_transform(X_train, y_train) return x_new
Example #15
Source File: models.py From IoT-device-type-identification with MIT License | 5 votes |
def perform_feature_selection(X_train, y_train, k_val): """ This method is used in order to perform a feature selection by selecting the best k_val features from X_train. It does so according to the chi2 criterion. The method prints the chosen features and creates a new instance of X_train with only these features and returns it """ print("**********FEATURE SELECTION**********") # Create and fit selector selector = SelectKBest(chi2, k=k_val) selector.fit(X_train, y_train) #Get idxs of columns to keep idxs_selected = selector.get_support(indices=True) print(idxs_selected) X_new = SelectKBest(chi2, k=k_val).fit_transform(X_train, y_train) return X_new
Example #16
Source File: test_bagging.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_bagging_with_pipeline():
    estimator = BaggingClassifier(make_pipeline(SelectKBest(k=1),
                                                DecisionTreeClassifier()),
                                  max_features=2)
    estimator.fit(iris.data, iris.target)
    assert_true(isinstance(estimator[0].steps[-1][1].random_state, int))
Example #17
Source File: test_dict_vectorizer.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_feature_selection():
    # make two feature dicts with two useful features and a bunch of useless
    # ones, in terms of chi2
    d1 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=1, useful2=20)
    d2 = dict([("useless%d" % i, 10) for i in range(20)],
              useful1=20, useful2=1)

    for indices in (True, False):
        v = DictVectorizer().fit([d1, d2])
        X = v.transform([d1, d2])
        sel = SelectKBest(chi2, k=2).fit(X, [0, 1])

        v.restrict(sel.get_support(indices=indices), indices=indices)
        assert_equal(v.get_feature_names(), ["useful1", "useful2"])
Example #18
Source File: test_chi2.py From twitter-stock-recommendation with MIT License | 5 votes |
def mkchi2(k):
    """Make k-best chi2 selector"""
    return SelectKBest(chi2, k=k)
Example #19
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_pipeline_methods_anova():
    # Test the various methods of the pipeline (anova).
    iris = load_iris()
    X = iris.data
    y = iris.target
    # Test with Anova + LogisticRegression
    clf = LogisticRegression()
    filter1 = SelectKBest(f_classif, k=2)
    pipe = Pipeline([('anova', filter1), ('logistic', clf)])
    pipe.fit(X, y)
    pipe.predict(X)
    pipe.predict_proba(X)
    pipe.predict_log_proba(X)
    pipe.score(X, y)
Example #20
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 5 votes |
def precision_recall_curve_draw(X_o, y):

    X = SelectKBest(f_classif, k=80).fit_transform(X_o, y)

    print(X.shape)
    print(y.shape)

    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)

    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')

    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=1,
                                   min_samples_split=2, random_state=None, splitter='best')

    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)

    p_svmrbf, r_svmrbf, auc_svmrbf = get_my_pecision_recall(svmrbf, X, y)
    p_knn, r_knn, auc_knn = get_my_pecision_recall(knn, X, y)
    p_dtree, r_dtree, auc_dtree = get_my_pecision_recall(dtree, X, y)
    p_rforest, r_rforest, auc_rforest = get_my_pecision_recall(rforest, X, y)

    plt.clf()

    plt.plot(r_svmrbf, p_svmrbf, 'y.--', label='SVM auc=%0.3f' % auc_svmrbf)
    plt.plot(r_knn, p_knn, 'r^--', label='KNN auc=%0.3f' % auc_knn)
    plt.plot(r_dtree, p_dtree, 'b>--', label='Decision Tree auc=%0.3f' % auc_dtree)
    plt.plot(r_rforest, p_rforest, 'go--', label='Random Forest auc=%0.3f' % auc_rforest)

    plt.xlim([0.0, 1.0])
    plt.ylim([0.0, 1.0])
    plt.xlabel('recall rate')
    plt.ylabel('precision rate')
    plt.title('precision-recall curve')
    plt.legend(loc="lower right")
    plt.show()

    del X
    del y

############################################### Examples to show the difference of features representation ##################################
Example #21
Source File: GetMLPara.py From dr_droid with Apache License 2.0 | 5 votes |
def my_get_fp_fn_CV(X_original, y):

    # generate classifiers
    knn = KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                               n_neighbors=5, p=2, weights='uniform')

    # decision tree
    dtree = DecisionTreeClassifier(criterion='gini', min_samples_leaf=4,
                                   min_samples_split=2, random_state=None, splitter='best')

    # naive
    #nbbern = BernoulliNB()

    # random forest
    rforest = RandomForestClassifier(bootstrap=True, criterion='gini', max_depth=None,
                                     max_features='auto', min_samples_leaf=1,
                                     min_samples_split=2, n_estimators=10, n_jobs=1,
                                     oob_score=False, random_state=3)

    # svm
    svmrbf = svm.SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0, degree=3,
                     kernel='rbf', max_iter=-1, probability=True, random_state=None,
                     shrinking=True, tol=0.001, verbose=False)

    # reduce the size
    #X = SelectKBest(f_classif, k=80).fit_transform(X_original, y)
    skb = SelectKBest(f_classif, k=80).fit(X_original, y)
    X = skb.fit_transform(X_original, y)

    print("KNN")
    my_get_fp_fn_inter(knn, X, y)
    print("DTree")
    my_get_fp_fn_inter(dtree, X, y)
    print("rforest")
    my_get_fp_fn_inter(rforest, X, y)
    #print("naive bayes")
    #my_get_fp_fn_inter(nbbern, X, y)
    print("SVMrbf")
    my_get_fp_fn_inter(svmrbf, X, y)
Example #22
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_feature_union_weights():
    # test feature union with transformer weights
    iris = load_iris()
    X = iris.data
    y = iris.target
    pca = PCA(n_components=2, svd_solver='randomized', random_state=0)
    select = SelectKBest(k=1)
    # test using fit followed by transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    fs.fit(X, y)
    X_transformed = fs.transform(X)
    # test using fit_transform
    fs = FeatureUnion([("pca", pca), ("select", select)],
                      transformer_weights={"pca": 10})
    X_fit_transformed = fs.fit_transform(X, y)
    # test it works with transformers missing fit_transform
    fs = FeatureUnion([("mock", Transf()), ("pca", pca), ("select", select)],
                      transformer_weights={"mock": 10})
    X_fit_transformed_wo_method = fs.fit_transform(X, y)
    # check against expected result

    # We use a different pca object to control the random_state stream
    assert_array_almost_equal(X_transformed[:, :-1], 10 * pca.fit_transform(X))
    assert_array_equal(X_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_array_almost_equal(X_fit_transformed[:, :-1],
                              10 * pca.fit_transform(X))
    assert_array_equal(X_fit_transformed[:, -1],
                       select.fit_transform(X, y).ravel())
    assert_equal(X_fit_transformed_wo_method.shape, (X.shape[0], 7))
Example #23
Source File: test_pipeline.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
Example #24
Source File: kerasExperiments.py From emailinsight with MIT License | 5 votes |
def select_best_features(dataset, train_labels, num_best, verbose=True):
    (X_train, Y_train), (X_test, Y_test) = dataset
    if verbose:
        print('\nSelecting %d best features\n' % num_best)
    selector = SelectKBest(chi2, k=num_best)
    X_train = selector.fit_transform(X_train, train_labels)
    X_test = selector.transform(X_test)
    return ((X_train, Y_train), (X_test, Y_test)), selector.scores_
Example #25
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0 | 5 votes |
def _select_K_best(self, k):
    if self._problem == FeatureSelector.CLASSIFICATION:
        score = f_classif
    else:
        score = f_regression
    self._selector = SelectKBest(score, k)
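Example #25 switches between f_classif and f_regression based on the problem type. chi2, used in several of the other examples, is a further option for classification, but it only accepts non-negative features (e.g. counts or TF-IDF weights). The helper below is a made-up illustration of such a dispatch, not part of the CDSS code:

from sklearn.feature_selection import SelectKBest, chi2, f_classif, f_regression

def make_selector(problem, k, nonnegative_features=False):
    """Pick a score function for SelectKBest based on the task (illustrative helper)."""
    if problem == 'regression':
        score_func = f_regression
    elif nonnegative_features:
        score_func = chi2          # chi2 requires features >= 0
    else:
        score_func = f_classif
    return SelectKBest(score_func, k=k)

selector = make_selector('classification', k=10, nonnegative_features=True)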
Example #26
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 5 votes |
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('KW: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = SelectKBest(self.KruskalWallisAnalysis, k=self.GetSelectedFeatureNumber())
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    self._f_value, self._p_value = self.KruskalWallisAnalysis(data, label)

    return feature_index.tolist()
Example #27
Source File: __init__.py From sklearn2pmml with GNU Affero General Public License v3.0 | 5 votes |
def test_fit(self):
    selector = SelectKBest(score_func=f_regression, k=1)
    selector_proxy = SelectorProxy(selector)
    self.assertFalse(hasattr(selector_proxy, "support_mask_"))
    selector_proxy.fit(numpy.array([[0, 0], [1.0, 2.0]]),
                       numpy.array([0.5, 1.0]))
    self.assertEqual([0, 1], selector._get_support_mask().tolist())
    self.assertEqual([0, 1], selector_proxy.support_mask_.tolist())
Example #28
Source File: test_core_pipeline.py From lale with Apache License 2.0 | 5 votes |
def test_import_from_sklearn_pipeline_nested_pipeline1(self):
    from sklearn.pipeline import FeatureUnion, make_pipeline
    from sklearn.decomposition import PCA
    from sklearn.kernel_approximation import Nystroem
    from sklearn.feature_selection import SelectKBest
    from sklearn.neighbors import KNeighborsClassifier
    from sklearn.pipeline import make_pipeline
    union = FeatureUnion([
        ("selectkbest_pca", make_pipeline(
            SelectKBest(k=3),
            FeatureUnion([
                ('pca', PCA(n_components=1)),
                ('nested_pipeline', make_pipeline(SelectKBest(k=2), Nystroem()))]))),
        ("nys", Nystroem(n_components=2, random_state=42))])
    sklearn_pipeline = make_pipeline(union, KNeighborsClassifier())
    lale_pipeline = import_from_sklearn_pipeline(sklearn_pipeline)
    self.assertEqual(len(lale_pipeline.edges()), 8)
    # These assertions assume topological sort, which may not be unique. So the assertions are brittle.
    from lale.lib.sklearn.pca import PCAImpl
    from lale.lib.sklearn.nystroem import NystroemImpl
    from lale.lib.lale.concat_features import ConcatFeaturesImpl
    from lale.lib.sklearn.k_neighbors_classifier import KNeighborsClassifierImpl
    from lale.lib.sklearn.select_k_best import SelectKBestImpl
    self.assertEqual(lale_pipeline.edges()[0][0]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[0][1]._impl_class(), PCAImpl)
    self.assertEqual(lale_pipeline.edges()[1][0]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[1][1]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[2][0]._impl_class(), SelectKBestImpl)
    self.assertEqual(lale_pipeline.edges()[2][1]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[3][0]._impl_class(), PCAImpl)
    self.assertEqual(lale_pipeline.edges()[3][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[4][0]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[4][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[5][0]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[5][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[6][0]._impl_class(), NystroemImpl)
    self.assertEqual(lale_pipeline.edges()[6][1]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[7][0]._impl_class(), ConcatFeaturesImpl)
    self.assertEqual(lale_pipeline.edges()[7][1]._impl_class(), KNeighborsClassifierImpl)
    self.assert_equal_predictions(sklearn_pipeline, lale_pipeline)
Example #29
Source File: test_pipeline.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_classes_property():
    iris = load_iris()
    X = iris.data
    y = iris.target

    reg = make_pipeline(SelectKBest(k=1), LinearRegression())
    reg.fit(X, y)
    assert_raises(AttributeError, getattr, reg, "classes_")

    clf = make_pipeline(SelectKBest(k=1),
                        LogisticRegression(random_state=0))
    assert_raises(AttributeError, getattr, clf, "classes_")
    clf.fit(X, y)
    assert_array_equal(clf.classes_, np.unique(y))
Example #30
Source File: svm_classifier.py From nlp-journey with Apache License 2.0 | 5 votes |
def __select_features(data_set):
    dataset = [clean_en_text(data) for data in data_set[0]]
    tf_idf_model = TfidfVectorizer(ngram_range=(1, 1),
                                   binary=True,
                                   sublinear_tf=True)
    tf_vectors = tf_idf_model.fit_transform(dataset)

    # select the top portion of the terms to use as features
    k = int(tf_vectors.shape[1] / 6)
    chi_model = SelectKBest(chi2, k=k)
    chi_features = chi_model.fit_transform(tf_vectors, data_set[1])
    print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
    print('chi:\t\t' + str(chi_features.shape[1]))

    return chi_features, tf_idf_model, chi_model
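To see which terms survive the chi-square filter in Example #30, the selector's support mask can be mapped back through the TF-IDF vocabulary. The sketch below is self-contained and uses a toy corpus in place of the project's data; it assumes a recent scikit-learn, where the vectorizer exposes get_feature_names_out (older releases use get_feature_names instead):

import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import SelectKBest, chi2

# Toy stand-ins for the cleaned corpus and its labels.
corpus = ["good movie", "bad movie", "great film", "boring film"]
labels = [1, 0, 1, 0]

tf_idf_model = TfidfVectorizer(ngram_range=(1, 1), binary=True, sublinear_tf=True)
tf_vectors = tf_idf_model.fit_transform(corpus)
chi_model = SelectKBest(chi2, k=2).fit(tf_vectors, labels)

# Map the support mask back to the vocabulary to see which terms were kept.
feature_names = np.asarray(tf_idf_model.get_feature_names_out())
print(feature_names[chi_model.get_support()])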