Python sklearn.feature_selection.RFE Examples
The following are 22 code examples of sklearn.feature_selection.RFE(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_selection, or try the search function.
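Before the project-specific examples, here is a minimal, self-contained sketch of the call pattern most of them share. The synthetic data, the LinearSVC estimator, and the choice of five features are illustrative assumptions, not taken from any of the projects below.

# Minimal RFE sketch on synthetic data (illustrative values only).
from sklearn.datasets import make_classification
from sklearn.feature_selection import RFE
from sklearn.svm import LinearSVC

# 20 features, of which only a handful are informative.
X, y = make_classification(n_samples=200, n_features=20, n_informative=5, random_state=0)

# RFE wraps an estimator that exposes coef_ or feature_importances_ and
# recursively drops the weakest features until n_features_to_select remain.
selector = RFE(estimator=LinearSVC(dual=False), n_features_to_select=5, step=1)
selector.fit(X, y)

print(selector.support_)              # boolean mask of the selected features
print(selector.ranking_)              # rank 1 = selected; higher ranks were eliminated earlier
print(selector.transform(X).shape)    # (200, 5)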
Example #1
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 7 votes |
def ReducedFeaturesDF(X, y):
    '''
    Returns a dataframe with only a subset of features/columns retained
    '''
    from sklearn.feature_selection import RFE
    est = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto')
    # selectK = SelectKBest(score_func = f_classif, k=45)
    selectRFE = RFE(estimator=est, n_features_to_select=22, step=0.15)
    selectK = selectRFE

    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = feature_names[selectK_mask]
    print("reduced RFE features:")
    print(K_featnames)
    Reduced_df = pd.read_csv(filename, index_col=0)
    Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
    # Reduced_df.to_csv('REDUCED_Feat.csv')
    return Reduced_df

# ReducedFeaturesDF(X,y)
# z=pd.DataFrame(data=X_SGD,index=y)
# z.to_csv('REDUCED_Feat.csv')
Example #2
Source File: train.py From 3D-Human-Body-Shape with MIT License | 6 votes |
def rfe_multiprocess(i, dets, deform, body_num, x, measure, k_features):
    sys.stdout.write('>> calc rfe map NO.%d\n' % (i))
    y = np.array(dets).reshape(body_num, 1)
    model = LinearRegression()

    # recursive feature elimination
    rfe = RFE(model, k_features)
    rfe.fit(x, y.ravel())
    # mask.append(rfe.support_)
    flag = np.array(rfe.support_).reshape(utils.M_NUM, 1)
    flag = flag.repeat(body_num, axis=1)

    # calculate linear mapping mat
    S = np.array(deform)
    S.shape = (S.size, 1)
    m = np.array(measure[flag])
    m.shape = (k_features, body_num)
    M = build_equation(m, 9)
    MtM = M.transpose().dot(M)
    MtS = M.transpose().dot(S)
    ans = np.array(scipy.sparse.linalg.spsolve(MtM, MtS))
    ans.shape = (9, k_features)
    return [ans, rfe.support_]
Example #3
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 6 votes |
def GetSelectedFeatureIndex(self, data_container):
    data = data_container.GetArray()
    data /= np.linalg.norm(data, ord=2, axis=0)
    label = data_container.GetLabel()

    if data.shape[1] < self.GetSelectedFeatureNumber():
        print('RFE: The number of features {:d} in data container is smaller than the required number {:d}'.format(
            data.shape[1], self.GetSelectedFeatureNumber()))
        self.SetSelectedFeatureNumber(data.shape[1])

    fs = RFE(self.__classifier, self.GetSelectedFeatureNumber(), step=0.05)
    fs.fit(data, label)
    feature_index = fs.get_support(True)
    self._rank = fs.ranking_

    return feature_index.tolist()
Example #4
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 6 votes |
def GetKFeatures(filename, method='RFE', kbest=30, alpha=0.01, reduceMatrix=True):
    '''
    Gets best features using chosen method
    (K-best, RFE, RFECV, 'L1' (RandomizedLogisticRegression), 'Tree' (ExtraTreesClassifier), mrmr),
    then prints top K features' names (from featNames).
    If reduceMatrix = True, then also returns X reduced to the K best features.

    Available methods' names are: 'RFE', 'RFECV', 'RandomizedLogisticRegression',
    'K-best', 'ExtraTreesClassifier'. Note that effectively, any scikit-learn method
    could be used, if correctly imported.
    '''
    # est = method()
    '''
    Gets the K-best features (filtered by FDR, then select best ranked by t-test;
    more advanced options can be implemented).
    Saves the data/matrix with the resulting/kept features to a new output file, "REDUCED_Feat.csv"
    '''
    features, labels, lb_encoder, featureNames = load_data(filename)
    X, y = features, labels

    # change the names as ints back to strings
    class_names = lb_encoder.inverse_transform(y)
    print("Data and labels imported. PreFilter Feature matrix shape:")
    print(X.shape)

    selectK = SelectKBest(k=kbest)
    selectK.fit(X, y)
    selectK_mask = selectK.get_support()
    K_featnames = featureNames[selectK_mask]
    print('X After K filter:', X.shape)
    print("K_featnames: %s" % (K_featnames))
    if reduceMatrix == True:
        Reduced_df = pd.read_csv(filename, index_col=0)
        Reduced_df = Reduced_df[Reduced_df.columns[selectK_mask]]
        Reduced_df.to_csv('REDUCED_Feat.csv')
        print('Saved to REDUCED_Feat.csv')
        return Reduced_df

# WORKS! But unreadable with too many features!
Example #5
Source File: test_feature_selection.py From pandas-ml with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.feature_selection.GenericUnivariateSelect,
                  fs.GenericUnivariateSelect)
    self.assertIs(df.feature_selection.SelectPercentile,
                  fs.SelectPercentile)
    self.assertIs(df.feature_selection.SelectKBest, fs.SelectKBest)
    self.assertIs(df.feature_selection.SelectFpr, fs.SelectFpr)
    self.assertIs(df.feature_selection.SelectFromModel, fs.SelectFromModel)
    self.assertIs(df.feature_selection.SelectFdr, fs.SelectFdr)
    self.assertIs(df.feature_selection.SelectFwe, fs.SelectFwe)
    self.assertIs(df.feature_selection.RFE, fs.RFE)
    self.assertIs(df.feature_selection.RFECV, fs.RFECV)
    self.assertIs(df.feature_selection.VarianceThreshold,
                  fs.VarianceThreshold)
Example #6
Source File: feature_selection.py From default-credit-card-prediction with MIT License | 6 votes |
def rfe_selection(X, y, n_features):
    """
    Performs the Recursive Feature Elimination method and selects the top ranking features

    Keyword arguments:
    X -- The feature vectors
    y -- The target vector
    n_features -- n best ranked features
    """
    if verbose:
        print '\nPerforming Feature Selection based on the Recursive Feature Elimination method ...'

    clf = RandomForestClassifierWithCoef(n_estimators=10, n_jobs=-1)
    fs = RFE(clf, n_features, step=1)
    fs = fs.fit(X, y)
    ranks = fs.ranking_

    feature_indexes = []
    for i in xrange(len(ranks)):
        if ranks[i] == 1:
            feature_indexes += [i]

    return X[:, feature_indexes[0:n_features]], feature_indexes[0:n_features]  # return selected features and original index features
Example #7
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0 | 5 votes |
def compute_ranks(self):
    if self._algorithm == FeatureSelector.SELECT_K_BEST:
        scores = self._selector.scores_
        sorted_scores = sorted(scores, reverse=True)
        ranks = [sorted_scores.index(i) + 1 for i in scores]
    elif self._algorithm == FeatureSelector.SELECT_PERCENTILE:
        scores = self._selector.scores_
        sorted_scores = sorted(scores, reverse=True)
        ranks = [sorted_scores.index(i) + 1 for i in scores]
    elif self._algorithm == FeatureSelector.RECURSIVE_ELIMINATION:
        n_selected = self._selector.n_features_
        support = self._selector.support_
        ranking = self._selector.ranking_
        # RFE and RFECV do not provide feature scores. Instead, they
        # provide a list of features which have been selected (support)
        # and an ascending list indicating when each other feature was
        # eliminated. Use these two to construct feature ranks, though
        # acknowledge that RFE and RFECV do not actually distinguish between
        # the weights of selected features.
        ranks = [0] * len(support)
        selected_count = 0
        for i in range(len(ranking)):
            if support[i]:
                # All selected features in ranking receive rank 1, so need
                # to iterate through list and add incrementing values so
                # that features ranked 1, 1, 1 become 1, 2, 3.
                ranks[i] = ranking[i] + selected_count
                selected_count += 1
            else:
                # Even if there are 5 selected features, the 6th feature
                # in ranking is given rank 2, so add (n_selected - 1).
                ranks[i] = ranking[i] + (n_selected - 1)

    return ranks
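As a worked illustration of that rank reconstruction, with hypothetical support and ranking values rather than anything taken from the CDSS project:

# Hypothetical RFE outputs: 3 of 5 features selected (not from CDSS).
support = [True, True, False, True, False]
ranking = [1, 1, 3, 1, 2]          # RFE gives every selected feature rank 1
n_selected = 3

ranks = [0] * len(support)
selected_count = 0
for i in range(len(ranking)):
    if support[i]:
        ranks[i] = ranking[i] + selected_count    # 1, 1, 1 becomes 1, 2, 3
        selected_count += 1
    else:
        ranks[i] = ranking[i] + (n_selected - 1)  # eliminated features follow the selected ones

print(ranks)  # [1, 2, 5, 3, 4]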
Example #8
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 5 votes |
def plot_RFE(X, y):
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV
    from sklearn.datasets import make_classification
    from sklearn.metrics import zero_one_loss
    import pylab as pl
    import matplotlib.pylab as pl

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear", class_weight="auto", cache_size=1200, shrinking=True)
    svc = LinearSVC(penalty='l1', loss='l2', dual=False, class_weight='auto', multi_class='ovr')
    # SGD = SGDClassifier(penalty='elasticnet', class_weight='auto', n_jobs=-1, n_iter=10, l1_ratio=0.15)
    ## rfecv = RFECV(estimator=svc, step=0.1, cv=StratifiedKFold(y, 5), scoring='roc_auc')
    rfecv = RFECV(estimator=svc, step=0.2, cv=StratifiedKFold(y, 2), scoring='f1')
    X_RFE = rfecv.fit_transform(X, y)
    print("Optimal number of features in X_RFE : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    pl.figure()
    pl.xlabel("Number of features selected")
    pl.ylabel("Cross validation score (nb of misclassifications)")
    pl.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    pl.show()
    print('RFE Opt.shapes features CV score:')
    CV_multi_stats(X_RFE, y, svc)
    return (X_RFE, rfecv)
Example #9
Source File: PipeTasks.py From ProFET with GNU General Public License v3.0 | 5 votes |
def plotRFECV(X, y, stepSize=0.05, scoring='f1'):
    '''
    Plot recursive feature elimination example with automatic tuning of the
    number of features selected with cross-validation.
    http://scikit-learn.org/stable/auto_examples/plot_rfe_with_cross_validation.html#example-plot-rfe-with-cross-validation-py
    '''
    from sklearn.svm import SVC
    from sklearn.cross_validation import StratifiedKFold
    from sklearn.feature_selection import RFECV

    # Create the RFE object and compute a cross-validated score.
    # svc = SVC(kernel="linear")
    svc = SVC(kernel="linear", class_weight='auto', cache_size=1400)

    # The "accuracy" scoring is proportional to the number of correct
    # classifications
    rfecv = RFECV(estimator=svc, step=stepSize, cv=StratifiedKFold(y, 2), scoring=scoring)
    rfecv.fit(X, y)
    print("Optimal number of features : %d" % rfecv.n_features_)

    # Plot number of features VS. cross-validation scores
    import matplotlib.pyplot as plt
    plt.figure()
    plt.xlabel("Number of features selected")
    plt.ylabel("Cross validation score (nb of correct classifications)")
    plt.plot(range(1, len(rfecv.grid_scores_) + 1), rfecv.grid_scores_)
    plt.show()
    return rfecv
Example #10
Source File: wine.py From UCI-Data-Analysis with Apache License 2.0 | 5 votes |
def recursiveFeatureSelector(classifier_model, train_data, train_labels, test_data, number_of_features):
    rfe = RFE(classifier_model, number_of_features)
    transformed_train_data = rfe.fit_transform(train_data, train_labels)
    transformed_test_data = rfe.transform(test_data)
    return transformed_train_data, transformed_test_data

# Defines the recursive feature selector for choosing the best feature using Cross Validation
Example #11
Source File: FeatureSelector.py From CDSS with GNU General Public License v3.0 | 5 votes |
def _eliminate_recursively(self, k=None):
    if self._problem == FeatureSelector.CLASSIFICATION:
        estimator = RandomForestClassifier(random_state=self._random_state)
    else:
        estimator = LassoCV(random_state=self._random_state)

    # If k is not specified, then use RFECV to automatically decide on
    # optimal number of features. If specified, then use RFE.
    if k is None:
        self._selector = RFECV(estimator)
    else:
        self._selector = RFE(estimator, n_features_to_select=k, step=0.05)
Example #12
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 5 votes |
def __init__(self, selected_feature_number=1, classifier=SVC(kernel='linear')):
    super(FeatureSelectByRFE, self).__init__(name='RFE',
                                             selected_feature_number=selected_feature_number)
    self.__classifier = classifier
    self._rank = None
    self._selected_features = []
Example #13
Source File: classifiers.py From Sarcasm-Detection with MIT License | 5 votes |
def feature_selection(x_train, y_train, x_test, y_test):
    print("Feature selection with LinearSVC")
    model = LinearSVC(C=0.1, penalty='l2')
    rfe = RFE(model, 5)
    best_features_model = rfe.fit(x_train, y_train)
    y_hat = best_features_model.predict(x_test)
    utils.print_statistics(y_test, y_hat)
Example #14
Source File: rfe.py From lale with Apache License 2.0 | 5 votes |
def __init__(self, estimator, n_features_to_select=None, step=1, verbose=0):
    self._hyperparams = {
        'estimator': estimator,
        'n_features_to_select': n_features_to_select,
        'step': step,
        'verbose': verbose}
    self._wrapped_model = SKLModel(**self._hyperparams)
Example #15
Source File: ABIDEParser.py From population-gcn with GNU General Public License v3.0 | 5 votes |
def feature_selection(matrix, labels, train_ind, fnum):
    """
        matrix    : feature matrix (num_subjects x num_features)
        labels    : ground truth labels (num_subjects x 1)
        train_ind : indices of the training samples
        fnum      : size of the feature vector after feature selection

    return:
        x_data    : feature matrix of lower dimension (num_subjects x fnum)
    """
    estimator = RidgeClassifier()
    selector = RFE(estimator, fnum, step=100, verbose=1)

    featureX = matrix[train_ind, :]
    featureY = labels[train_ind]
    selector = selector.fit(featureX, featureY.ravel())
    x_data = selector.transform(matrix)

    print("Number of labeled samples %d" % len(train_ind))
    print("Number of features selected %d" % x_data.shape[1])

    return x_data

# Make sure each site is represented in the training set when selecting a subset of the training set
Example #16
Source File: modeling.py From kddcup2015 with GNU General Public License v2.0 | 5 votes |
def lr_with_fs():
    """
    Submission: lr_with_fs_0620_02.csv
    E_val: <missing>
    E_in: 0.856252488379
    E_out: 0.8552577388980213
    """
    from sklearn.linear_model import LogisticRegressionCV
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    clf = LogisticRegressionCV(cv=10, scoring='roc_auc', n_jobs=-1)
    clf.fit(X_new, y)

    print(auc_score(clf, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('lr', clf)]), 'lr_with_fs_0620_02')
Example #17
Source File: model_recommendation.py From DIVE-backend with GNU General Public License v3.0 | 5 votes |
def recursive_feature_elimination(df, dependent_variable, independent_variables, interaction_terms=[], model_limit=5):
    considered_independent_variables_per_model, patsy_models = \
        construct_models(df, dependent_variable, independent_variables, interaction_terms,
                         table_layout=MCT.ALL_VARIABLES.value)
    y, X = dmatrices(patsy_models[0], df, return_type='dataframe')

    estimator = SVR(kernel='linear')
    selector = RFE(estimator, 5, step=1)
    selector = selector.fit(X, y)

    logger.info(selector.support_)
    logger.info(selector.ranking_)
    return
Example #18
Source File: model_recommendation.py From DIVE-backend with GNU General Public License v3.0 | 5 votes |
def get_initial_regression_model_recommendation(project_id, dataset_id, dependent_variable_id=None,
                                                recommendation_type=MRT.LASSO.value,
                                                table_layout=MCT.LEAVE_ONE_OUT.value,
                                                data_size_cutoff=current_app.config['ANALYSIS_DATA_SIZE_CUTOFF'],
                                                categorical_value_limit=current_app.config['ANALYSIS_CATEGORICAL_VALUE_LIMIT']):
    df = get_data(project_id=project_id, dataset_id=dataset_id)
    if len(df) > data_size_cutoff:
        df = df.sample(data_size_cutoff)

    field_properties = db_access.get_field_properties(project_id, dataset_id)
    quantitative_field_properties = [fp for fp in field_properties if fp['general_type'] == 'q']

    dependent_variable = next((f for f in field_properties if f['id'] == dependent_variable_id), None) \
        if dependent_variable_id \
        else np.random.choice(quantitative_field_properties, size=1)[0]

    independent_variables = []
    for fp in field_properties:
        if (fp['name'] != dependent_variable['name']):
            if (fp['general_type'] == 'c' and (fp['is_unique'] or len(fp['unique_values']) > categorical_value_limit)):
                continue
            independent_variables.append(fp)

    recommendationTypeToFunction = {
        MRT.FORWARD_R2.value: forward_r2,
        MRT.LASSO.value: lasso,
        MRT.RFE.value: recursive_feature_elimination,
        MRT.FORWARD_F.value: f_regression
    }

    result = recommendationTypeToFunction[recommendation_type](df, dependent_variable, independent_variables)

    return {
        'recommended': True,
        'table_layout': table_layout,
        'recommendation_type': recommendation_type,
        'dependent_variable_id': dependent_variable['id'],
        'independent_variables_ids': [x['id'] for x in result],
    }
Example #19
Source File: FeatureSelector.py From FAE with GNU General Public License v3.0 | 5 votes |
def GetDescription(self):
    text = "Before building the model, we used recursive feature elimination (RFE) to select features. " \
           "The goal of RFE is to select features based on a classifier by recursively considering " \
           "smaller sets of features."
    return text
Example #20
Source File: c10.py From abu with GNU General Public License v3.0 | 4 votes |
def sample_1033_2():
    """
    10.3.3 Ranking feature importance and grading feature support
    :return:
    """
    global g_with_date_week_noise
    g_with_date_week_noise = True

    train_x, train_y_regress, train_y_classification, pig_three_feature, \
        test_x, test_y_regress, test_y_classification, kl_another_word_feature_test = sample_1031_1()

    # noinspection PyShadowingNames
    def importances_coef_pd(estimator):
        """
        Feature importance
        """
        if hasattr(estimator, 'feature_importances_'):
            # Estimators with feature_importances_ are sorted via sort_values
            return pd.DataFrame(
                {'feature': list(pig_three_feature.columns[1:]),
                 'importance': estimator.feature_importances_}).sort_values('importance')
        elif hasattr(estimator, 'coef_'):
            # Estimators with coef_ are sorted by coef
            return pd.DataFrame(
                {"columns": list(pig_three_feature.columns)[1:],
                 "coef": list(estimator.coef_.T)}).sort_values('coef')
        else:
            print('estimator not hasattr feature_importances_ or coef_!')

    # Use a random forest classifier
    from sklearn.ensemble import RandomForestClassifier
    estimator = RandomForestClassifier(n_estimators=100)
    # Fit the model on the training data
    estimator.fit(train_x, train_y_classification)
    # Judge the importance of the trained model's features, ordered from least to most important, as shown in Table 10-4
    print('importances_coef_pd(estimator):\n', importances_coef_pd(estimator))

    from sklearn.feature_selection import RFE

    # noinspection PyShadowingNames
    def feature_selection(estimator, x, y):
        """
        Feature support grading
        """
        selector = RFE(estimator)
        selector.fit(x, y)
        print('RFE selection')
        print(pd.DataFrame(
            {'support': selector.support_, 'ranking': selector.ranking_},
            index=pig_three_feature.columns[1:]))

    print('feature_selection(estimator, train_x, train_y_classification):\n',
          feature_selection(estimator, train_x, train_y_classification))
Example #21
Source File: modeling.py From kddcup2015 with GNU General Public License v2.0 | 4 votes |
def sgd():
    """
    Submission: sgd_0620_03.csv
    E_val: 0.863628
    E_in: 0.854373
    E_out:
    """
    from sklearn.linear_model import SGDClassifier
    from sklearn.preprocessing import StandardScaler
    from sklearn.pipeline import Pipeline
    from sklearn.grid_search import GridSearchCV
    from sklearn.cross_validation import StratifiedKFold

    X = util.fetch(util.cache_path('train_X_before_2014-08-01_22-00-47'))
    y = util.fetch(util.cache_path('train_y_before_2014-08-01_22-00-47'))

    raw_scaler = StandardScaler()
    raw_scaler.fit(X)
    X_scaled = raw_scaler.transform(X)

    rfe = util.fetch(util.cache_path('feature_selection.RFE.21'))
    X_pruned = rfe.transform(X_scaled)

    new_scaler = StandardScaler()
    new_scaler.fit(X_pruned)
    X_new = new_scaler.transform(X_pruned)

    sgd = SGDClassifier(n_iter=50, n_jobs=-1)

    params = {
        'loss': ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron',
                 'squared_loss', 'huber', 'epsilon_insensitive',
                 'squared_epsilon_insensitive']
    }

    grid = GridSearchCV(sgd, param_grid=params, cv=StratifiedKFold(y, 5),
                        scoring='roc_auc', n_jobs=-1)
    grid.fit(X_new, y)

    logger.debug('Best score (E_val): %f', grid.best_score_)

    sgd = grid.best_estimator_

    logger.debug('E_in: %f', auc_score(sgd, X_new, y))

    to_submission(Pipeline([('scale_raw', raw_scaler),
                            ('rfe', rfe),
                            ('scale_new', new_scaler),
                            ('sgd', sgd)]), 'sgd_0620_03')
Example #22
Source File: genericmodelclass.py From easyML with BSD 3-Clause "New" or "Revised" License | 4 votes |
def recursive_feature_elimination(self, nfeat=None, step=1, inplace=False):
    """A method to implement recursive feature elimination on the model.
    Note that CV is not performed in this function. The method will
    continue to eliminate some features (specified by the step parameter)
    at each step until the specified number of features is reached.

    Parameters
    __________
    nfeat : int or None, default=None
        The number of top features to select. If None, half of the features
        are selected.

    step : int or float, default=1
        If int, then step corresponds to the number of features to remove
        at each iteration. If float and within (0.0, 1.0), then step
        corresponds to the percentage (rounded down) of features to remove
        at each iteration. If float and greater than one, then the integral
        part will be considered as an integer input.

    inplace : bool, default=False
        If True, the predictors of the class are modified to those
        selected by the RFE procedure.

    Returns
    _______
    selected : A series object containing the selected features as
        index and their rank in selection as values
    """
    rfe = RFE(self.alg, n_features_to_select=nfeat, step=step)
    rfe.fit(
        self.datablock.train[self.predictors],
        self.datablock.train[self.datablock.target]
    )
    ranks = pd.Series(rfe.ranking_, index=self.predictors)
    selected = ranks.loc[rfe.support_]
    if inplace:
        self.set_predictors(selected.index.tolist())
    return selected
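The step semantics in the docstring mirror scikit-learn's own RFE parameter: an integer removes that many features per iteration, while a float in (0.0, 1.0) removes int(step * n_features) features per iteration. A minimal sketch of the two forms, assuming synthetic regression data and a linear SVR (neither comes from the easyML project):

# Illustrative comparison of integer vs. fractional step (assumed setup, not from easyML).
from sklearn.datasets import make_regression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR

X, y = make_regression(n_samples=120, n_features=40, n_informative=8, random_state=0)

# step=5 removes exactly five features per elimination round.
rfe_int = RFE(SVR(kernel='linear'), n_features_to_select=10, step=5).fit(X, y)

# step=0.25 removes int(0.25 * 40) = 10 features per round.
rfe_frac = RFE(SVR(kernel='linear'), n_features_to_select=10, step=0.25).fit(X, y)

print(rfe_int.n_features_, rfe_frac.n_features_)  # both stop at 10 selected features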