Python sklearn.feature_selection.RFECV Examples
The following are 14 code examples of sklearn.feature_selection.RFECV(). You may also want to check out all available functions/classes of the module sklearn.feature_selection.
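As a quick orientation before the project examples, here is a minimal, self-contained sketch of the RFECV API on synthetic data. The estimator, data, and parameter choices are illustrative assumptions, not taken from any of the projects below.

# Minimal RFECV sketch on synthetic data (illustrative, not project code).
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFECV
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold

# Synthetic data: 20 features, only 5 informative.
X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)

selector = RFECV(
    estimator=LogisticRegression(max_iter=1000),  # needs coef_ or feature_importances_
    step=1,                  # number (or fraction) of features removed per iteration
    cv=StratifiedKFold(5),   # CV used to score each candidate feature count
    scoring="accuracy",
)
selector.fit(X, y)
print("Optimal number of features:", selector.n_features_)
print("Selected feature mask:", selector.support_)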
Example #1
Source File: utils_feature_selection.py From auto_ml with MIT License
def get_feature_selection_model_from_name(type_of_estimator, model_name):
    model_map = {
        'classifier': {
            'SelectFromModel': SelectFromModel(RandomForestClassifier(n_jobs=-1, max_depth=10, n_estimators=15), threshold='20*mean'),
            'RFECV': RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        },
        'regressor': {
            'SelectFromModel': SelectFromModel(RandomForestRegressor(n_jobs=-1, max_depth=10, n_estimators=15), threshold='0.7*mean'),
            'RFECV': RFECV(estimator=RandomForestRegressor(n_jobs=-1), step=0.1),
            'GenericUnivariateSelect': GenericUnivariateSelect(),
            'KeepAll': 'KeepAll'
        }
    }
    return model_map[type_of_estimator][model_name]
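A hypothetical lookup illustrating the map above; the call site is an assumption for illustration, not auto_ml code.

# Hypothetical usage of the map above (assumed, not from auto_ml itself).
selector = get_feature_selection_model_from_name('classifier', 'RFECV')
# selector is RFECV(estimator=RandomForestClassifier(n_jobs=-1), step=0.1),
# usable like any scikit-learn transformer via fit/transform.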
Example #2
Source File: FSRegression.py From CausalDiscoveryToolbox with MIT License
def predict_features(self, df_features, df_target, idx=0, **kwargs):
    """For one variable, predict its neighbouring nodes.

    Args:
        df_features (pandas.DataFrame):
        df_target (pandas.Series):
        idx (int): (optional) for printing purposes
        kwargs (dict): additional options for algorithms

    Returns:
        list: scores of each feature relative to the target
    """
    estimator = SVR(kernel='linear')
    selector = RFECV(estimator, step=1)
    selector = selector.fit(df_features.values, np.ravel(df_target.values))
    return selector.grid_scores_
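Note that RFECV.grid_scores_ was deprecated in scikit-learn 1.0 and removed in 1.2; on recent versions the per-step mean CV scores are exposed through cv_results_. A minimal sketch of the migration for the final line above:

# On scikit-learn >= 1.2, grid_scores_ no longer exists; the mean CV score
# at each elimination step is in cv_results_ instead, so the last line of
# predict_features would become:
#     return selector.cv_results_["mean_test_score"]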
Example #3
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.feature_selection.GenericUnivariateSelect, fs.GenericUnivariateSelect)
    self.assertIs(df.feature_selection.SelectPercentile, fs.SelectPercentile)
    self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
    self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
    self.assertIs(df.feature_selection.SelectFromModel, fs.SelectFromModel)
    self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
    self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
    self.assertIs(df.feature_selection.RFE, fs.RFE)
    self.assertIs(df.feature_selection.RFECV, fs.RFECV)
    self.assertIs(df.feature_selection.VarianceThreshold, fs.VarianceThreshold)
Example #4
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Gets best features using chosen method (K-best, RFE, RFECV, 'L1'
    (RandomizedLogisticRegression), 'Tree' (ExtraTreesClassifier), mrmr),
    then prints the top K features' names (from featNames).
    If reduceMatrix = True, then also returns X reduced to the K best features.
    Available methods' names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression',
    'K-best', 'ExtraTreesClassifier'. Note that effectively any scikit-learn
    method could be used, if correctly imported.
    '''
    # est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by
    t-test; more advanced options can be implemented).
    Save the data/matrix with the resulting/kept features to a new output
    file, "REDUCED_Feat.csv"
    '''
    # NOTE: despite the `method` argument, this implementation only runs
    # SelectKBest; the other advertised methods are not dispatched here.
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels
    # change the names as ints back to strings
    class_names = lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)
    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:', X.shape)
    print("K_featnames: %s" % (K_featnames))
    if reduceMatrix == True:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

# WORKS! But unreadable with too many features!
Example #5
Source File: QuincyLearn.py From quincy with GNU General Public License v3.0
def __select_features(self, X, y, feature_names):
    logging.info("Automagically extracting features with recursive feature elimination based on RandomForest")
    model = RandomForestClassifier(n_jobs=-1)
    rfe = RFECV(model, cv=QuincyConfig.CV, scoring=QuincyConfig.METRIC)
    fit = rfe.fit(X, y)
    logging.info("Number of selected features: %d" % fit.n_features_)
    discarded, selected = self.__get_discarded_and_selected_features(feature_names, fit)
    X = self.__drop_discarded_features(X, discarded)
    feature_selection_results = self.__get_feature_selection_results(X, discarded, feature_names, fit, model, selected, y)
    self._featureSelectionResults = feature_selection_results
    return X
Example #6
Source File: RFECV.py From simba with GNU Lesser General Public License v3.0
def perf_RFCVE(projectPath, RFCVE_CVs, RFCVE_step_size, clf, data_train, target_train):
    selector = RFECV(estimator=clf, step=RFCVE_step_size, cv=RFCVE_CVs, scoring='f1', verbose=1)
    selector = selector.fit(data_train, target_train)
    print(selector.support_)
Example #7
Source File: nested_cv.py From Nested-Cross-Validation with MIT License
def _fit_recursive_feature_elimination(self, X_train_outer, y_train_outer, X_test_outer):
    rfe = RFECV(estimator=self.model, min_features_to_select=self.rfe_n_features,
                cv=self.inner_cv, n_jobs=self.n_jobs)
    rfe.fit(X_train_outer, y_train_outer)
    log.info('Best number of features was: {0}'.format(rfe.n_features_))
    # Assign selected features to data
    return rfe.transform(X_train_outer), rfe.transform(X_test_outer)
Example #8
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0
def compute_ranks(self):
    if self._algorithm == FeatureSelector.SELECT_K_BEST:
        scores = self._selector.scores_
        sorted_scores = sorted(scores, reverse=True)
        ranks = [sorted_scores.index(i) + 1 for i in scores]
    elif self._algorithm == FeatureSelector.SELECT_PERCENTILE:
        scores = self._selector.scores_
        sorted_scores = sorted(scores, reverse=True)
        ranks = [sorted_scores.index(i) + 1 for i in scores]
    elif self._algorithm == FeatureSelector.RECURSIVE_ELIMINATION:
        n_selected = self._selector.n_features_
        support = self._selector.support_
        ranking = self._selector.ranking_
        # RFE and RFECV do not provide feature scores. Instead, they
        # provide a list of features which have been selected (support)
        # and an ascending list indicating when each other feature was
        # eliminated. Use these two to construct feature ranks, though
        # acknowledge that RFE and RFECV do not actually distinguish between
        # the weights of selected features.
        ranks = [0] * len(support)
        selected_count = 0
        for i in range(len(ranking)):
            if support[i]:
                # All selected features in ranking receive rank 1, so need
                # to iterate through list and add incrementing values so
                # that features ranked 1, 1, 1, become 1, 2, 3.
                ranks[i] = ranking[i] + selected_count
                selected_count += 1
            else:
                # Even if there are 5 selected features, the 6th feature
                # in ranking is given rank 2, so add (n_selected - 1).
                ranks[i] = ranking[i] + (n_selected - 1)
    return ranks
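A tiny worked example of the RECURSIVE_ELIMINATION branch, with made-up values (not from CDSS):

# Made-up inputs: two selected features (rank 1 in `ranking`) and two
# eliminated ones.
support = [True, False, True, False]
ranking = [1, 2, 1, 3]
n_selected = 2
# The loop above maps these to the distinct ranks [1, 3, 2, 4]:
# the tied rank-1 entries become 1 and 2 (each gains selected_count),
# and each eliminated feature is shifted up by (n_selected - 1) = 1.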
Example #9
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0
def _eliminate_recursively(self, k=None):
    if self._problem == FeatureSelector.CLASSIFICATION:
        estimator = RandomForestClassifier(random_state=self._random_state)
    else:
        estimator = LassoCV(random_state=self._random_state)

    # If k is not specified, then use RFECV to automatically decide on
    # optimal number of features. If specified, then use RFE.
    if k is None:
        self._selector = RFECV(estimator)
    else:
        self._selector = RFE(estimator, n_features_to_select=k, step=0.05)
Example #10
Source File: wine.py From UCI-Data-Analysis with Apache License 2.0
def recursiveFeatureSelectorCV(classifier_model, train_data, train_labels, test_data, number_of_features):
    # NOTE: RFECV's second positional argument is `step`, so
    # number_of_features here sets how many features are eliminated per
    # iteration, not the size of the final feature set (RFECV chooses
    # that via cross-validation).
    rfe = RFECV(classifier_model, number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data

# Iterating over all feature preprocessors and classifiers in turn
Example #11
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0
def plotRFECV(X, y, stepSize=0.05, scoring='f1'):
    '''
    Plot recursive feature elimination example with automatic tuning of the
    number of features selected with cross-validation.
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear")
    svc = SVC(kernel="linear", class_weight='auto', cache_size=1400)

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2), scoring=scoring)
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
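This example targets a pre-0.20 scikit-learn: the sklearn.cross_validation module was removed in 0.20, StratifiedKFold no longer takes y in its constructor, and class_weight='auto' became 'balanced'. A sketch of the equivalent setup on a modern version, with parameter values mirroring the function above:

# Modern equivalent of the deprecated pieces above (a sketch, not ProFET code).
from sklearn.feature_selection import RFECV
from sklearn.model_selection import StratifiedKFold
from sklearn.svm import SVC

svc = SVC(kernel="linear", class_weight="balanced", cache_size=1400)
# StratifiedKFold now takes n_splits and receives y at split time via RFECV's fit.
rfecv = RFECV(estimator=svc, step=0.05, cv=StratifiedKFold(n_splits=2), scoring="f1")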
Example #12
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0
def plot_RFE(X, y):
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    from sklearn.datasets import make_classification
    from sklearn.metrics import zero_one_loss
    import pylab as pl
    import matplotlib.pylab as pl

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear", class_weight="auto", cache_size=1200, shrinking=True)
    svc = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto', multi_class='ovr')
    # SGD = SGDClassifier(penalty='elasticnet', class_weight='auto', n_jobs=-1, n_iter=10, l1_ratio=0.15)
    ## rfecv = RFECV(estimator=svc, step=0.1, cv=StratifiedKFold(y, 5), scoring='roc_auc')
    rfecv = RFECV(estimator=svc, step=0.2, cv=StratifiedKFold(y, 2), scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)
    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (nb of misclassifications)")
    pl.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    pl.show()
    print('RFE Opt.shapes features CV score:')
    CV_multi_stats(X_RFE, y, svc)
    return (X_RFE, rfecv)
Example #13
Source File: feature_selection_insight.py From karura with Apache License 2.0
def adopt(self, dfe, interpreted=None):
    models = []
    # about scoring, please see the following document:
    # http://scikit-learn.org/stable/modules/model_evaluation.html#common-cases-predefined-values
    scoring = "accuracy"

    # todo: for now, text and datetime columns are ignored
    for t in (FType.text, FType.datetime):
        columns = dfe.get_columns(t, include_target=False)
        dfe.df.drop(columns, inplace=True, axis=1)
        dfe.sync()

    if dfe.get_target_ftype() == FType.categorical:
        # models = [RandomForestClassifier(), SVC(kernel="linear")]
        models = [RandomForestClassifier()]
        if self.is_binary_classification(dfe):
            scoring = "f1"
        else:
            # see the reference on the f1 score:
            # http://scikit-learn.org/stable/modules/generated/sklearn.metrics.f1_score.html#sklearn.metrics.f1_score
            # if prediction never assigns some label, macro averaging is too
            # harsh an evaluation, so use micro
            scoring = "f1_micro"
    elif dfe.get_target_ftype() == FType.numerical:
        # About the models used to select features, please refer to
        # http://scikit-learn.org/stable/modules/feature_selection.html
        models = [Lasso(alpha=.1), RandomForestRegressor()]
        scoring = "r2"
    else:
        raise Exception("Target type is None or un-predictable type.")

    features = dfe.get_features()
    target = dfe.get_target()
    best_rfecv = None
    feature_masks = []
    for m in models:
        rfecv = RFECV(estimator=m, step=1, cv=self.cv_count, scoring=scoring, n_jobs=self.n_jobs)
        rfecv.fit(features, target)
        feature_masks.append(rfecv.support_)

    selected_mask = []
    if len(feature_masks) < 2:
        selected_mask = feature_masks[0]
    else:
        # keep only the features that every model selected
        selected_mask = np.logical_and(*feature_masks)

    eliminates = features.columns[np.logical_not(selected_mask)]
    dfe.df.drop(eliminates, inplace=True, axis=1)
    dfe.sync()

    selected = features.columns[selected_mask].tolist()
    ss = self.a2t(selected)
    self.description = {
        "ja": "項目{}は予測に有効な項目です。これらを利用し、モデルを構築します。".format(ss),
        "en": "Columns {} are useful to predict. I'll use these to make model.".format(ss)
    }
    return True
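A tiny illustration of the mask intersection used above, with made-up masks:

# Two hypothetical support_ masks from two models; logical_and keeps
# only the features that both models selected.
import numpy as np

m1 = np.array([True, True, False])
m2 = np.array([True, False, False])
print(np.logical_and(m1, m2))  # [ True False False]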
Example #14
Source File: genericmodelclass.py From easyML with BSD 3-Clause "New" or "Revised" License
def recursive_feature_elimination_cv(self, step=1, inplace=False):
    """A method to implement recursive feature elimination on the model
    with cross-validation (CV). At each step, features are ranked as per
    the algorithm used and the lowest-ranked features are removed, as
    specified by the step argument. At each step, the CV score is
    determined using the scoring metric specified in the model. The set
    of features with the highest cross-validation score is then chosen.

    Parameters
    __________
    step : int or float, default=1
        If int, then step corresponds to the number of features to remove
        at each iteration. If float and within (0.0, 1.0), then step
        corresponds to the percentage (rounded down) of features to remove
        at each iteration. If float and greater than one, then the
        integral part will be considered as an integer input.

    inplace : bool, default=False
        If True, the predictors of the class are modified to those
        selected by the RFECV procedure.

    Returns
    _______
    selected : pandas series
        A series object containing the selected features as index and
        their rank in selection as values
    """
    rfecv = RFECV(
        self.alg, step=step, cv=self.cv_folds,
        scoring=self.scoring_metric, n_jobs=-1
    )
    rfecv.fit(
        self.datablock.train[self.predictors],
        self.datablock.train[self.datablock.target]
    )

    if step > 1:
        min_nfeat = len(self.predictors) - step * (len(rfecv.grid_scores_) - 1)
        plt.xlabel("Number of features selected")
        plt.ylabel("Cross validation score")
        plt.plot(
            range(min_nfeat, len(self.predictors) + 1, step),
            rfecv.grid_scores_
        )
        plt.show(block=False)

    ranks = pd.Series(rfecv.ranking_, index=self.predictors)
    selected = ranks.loc[rfecv.support_]

    if inplace:
        self.set_predictors(selected.index.tolist())
    return ranks
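A hypothetical call, assuming an easyML model wrapper instance named model (the wrapper setup is not shown in this example):

# Hypothetical usage (assumed; not from easyML's docs): drop 5% of the
# remaining features per iteration and adopt the selected predictors.
ranks = model.recursive_feature_elimination_cv(step=0.05, inplace=True)
print(ranks.sort_values().head())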