Python Examples of sklearn.ensemble.RandomForestClassifier

Source File: mmbot.py From MaliciousMacroBot with MIT License

8 votes

def mmb_evaluate_model(self):
        """
        Returns scores from cross validation evaluation on the malicious / benign classifier

        Stores the feature matrix and the label vector on ``self.clf_X`` and
        ``self.clf_y``, sets 20% of the rows aside (they are not used for
        scoring) and runs 5-fold cross validation on the remaining 80%.

        :return: dict with keys 'accuracy', 'f1', 'precision' and 'recall';
                 each value is the result of one ``cross_val_score`` call.
                 'f1' is scored with the 'f1_macro' scorer.
        """
        predictive_features = self.features['predictive_features']
        self.clf_X = self.modeldata[predictive_features].values
        self.clf_y = np.array(self.modeldata['label'])

        # Only the training portion is scored; the held-out rows are discarded.
        X_train, _, y_train, _ = train_test_split(self.clf_X, self.clf_y, test_size=0.2, random_state=0)
        # Keep the first column of the binarized labels as a flat label vector.
        lb = LabelBinarizer()
        y_train = np.array([number[0] for number in lb.fit_transform(y_train)])
        # cross_val_score clones and fits the estimator for every fold, so the
        # estimator is deliberately not fitted here first.
        eval_cls = RandomForestClassifier(n_estimators=100, max_features=.2)

        recall = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='recall')
        precision = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='precision')
        accuracy = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='accuracy')
        f1_score = cross_val_score(eval_cls, X_train, y_train, cv=5, scoring='f1_macro')

        return {'accuracy': accuracy, 'f1': f1_score, 'precision': precision, 'recall': recall}

Source File: forest.py From cgpm with Apache License 2.0

6 votes

def __init__(self, outputs, inputs, k=None, hypers=None, params=None,
            distargs=None, rng=None):
        """
        Set up outputs/inputs, sufficient statistics and the forest classifier.

        outputs: sequence holding exactly one output variable, which must not
            also appear in ``inputs``.
        inputs: non-empty sequence of input variables.
        k: number of output categories; read from ``distargs['k']`` when None.
        hypers: accepted but not used by this constructor.
        params: optional dict; 'alpha' (default .1) and 'forest' (a ready-made
            classifier) are read from it.
        distargs: required despite its None default, because
            ``distargs['inputs']['stattypes']`` is read unconditionally and
            must have one entry per input.
        rng: random state; ``gu.gen_rng()`` supplies one when None.
        """
        # Create the random state exactly once (it was previously assigned twice).
        self.rng = gu.gen_rng() if rng is None else rng
        self.outputs = outputs
        self.inputs = inputs
        assert len(self.outputs) == 1
        assert len(self.inputs) >= 1
        assert self.outputs[0] not in self.inputs
        assert len(distargs['inputs']['stattypes']) == len(self.inputs)
        self.stattypes = distargs['inputs']['stattypes']
        # Number of output categories and input dimension.
        # XXX WHATTA HACK. BayesDB passes in top-level kwargs, not in distargs.
        self.k = k if k is not None else int(distargs['k'])
        self.p = len(distargs['inputs']['stattypes'])
        # Sufficient statistics.
        self.N = 0
        self.data = Data(x=OrderedDict(), Y=OrderedDict())
        self.counts = [0] * self.k
        # Outlier and random forest parameters.
        if params is None: params = {}
        self.alpha = params.get('alpha', .1)
        self.regressor = params.get('forest', None)
        if self.regressor is None:
            self.regressor = RandomForestClassifier(random_state=self.rng)

Source File: classifier.py From stock-price-prediction with MIT License

6 votes

def buildModel(dataset, method, parameters):
    """
    Build final model for predicting real testing data

    Every column of ``dataset`` except the last is used as a feature and the
    'UpDown' column is the target.

    dataset: table exposing ``columns`` and column-based indexing.
    method: one of 'RNN', 'RF', 'KNN', 'SVM' or 'ADA'.
    parameters: only read for 'SVM', where parameters[0] is C and
        parameters[1] is gamma.

    Returns the result of ``performRNNlass`` for 'RNN', otherwise the result
    of ``clf.fit`` on the chosen classifier.

    Raises ValueError when ``method`` is not one of the supported names
    (previously this surfaced as an UnboundLocalError on ``clf``).
    """
    features = dataset.columns[0:-1]

    # The RNN path trains through its own helper and returns immediately.
    if method == 'RNN':
        return performRNNlass(dataset[features], dataset['UpDown'])

    if method == 'RF':
        clf = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    elif method == 'KNN':
        clf = neighbors.KNeighborsClassifier()
    elif method == 'SVM':
        c = parameters[0]
        g = parameters[1]
        clf = SVC(C=c, gamma=g)
    elif method == 'ADA':
        clf = AdaBoostClassifier()
    else:
        raise ValueError("Unknown method: %r" % (method,))

    return clf.fit(dataset[features], dataset['UpDown'])

Source File: 03_fit_predict_plot_midwest_survey.py From dirty_cat with BSD 3-Clause "New" or "Revised" License

6 votes

def make_pipeline(encoding_method):
    """Assemble the encode -> scale -> random-forest pipeline for one encoder.

    The clean columns are always encoded with the 'one-hot' entry of
    ``encoder_dict``; the dirty column is encoded with the entry registered
    under ``encoding_method``.
    """
    # static transformer for the other (clean) columns
    clean_step = ('one-hot-clean', encoder_dict['one-hot'], clean_columns)
    # transformer for the encoded (dirty) column
    dirty_step = (encoding_method + '-dirty', encoder_dict[encoding_method],
                  [dirty_column])
    # ColumnTransformer combines the features; unlisted columns are dropped
    column_encoder = ColumnTransformer(
        transformers=[clean_step, dirty_step],
        remainder='drop')
    steps = [
        ('union', column_encoder),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5)),
    ]
    return Pipeline(steps)


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over encoding methods, scoring the different pipeline predictions
# using a cross validation score:

Source File: vanilla_model.py From OpenChem with MIT License

6 votes

def __init__(self, model_type='classifier', feature_type='fingerprints',
                 n_estimators=100, n_ensemble=5):
        """
        Create an ensemble of ``n_ensemble`` random forests.

        model_type: 'classifier' builds RFC models, 'regressor' builds RFR
            models; any other value raises ValueError.
        feature_type: when 'descriptors', a descriptor Calculator and a
            per-model ``desc_mean`` list are also prepared.
        n_estimators: number of trees passed to every forest.
        n_ensemble: number of forests stored in ``self.model``.
        """
        super(RandomForestQSAR, self).__init__()
        self.n_estimators = n_estimators
        self.n_ensemble = n_ensemble
        self.model = []
        self.model_type = model_type
        # Pick the estimator class once, then build all ensemble members.
        if self.model_type == 'classifier':
            forest_cls = RFC
        elif self.model_type == 'regressor':
            forest_cls = RFR
        else:
            raise ValueError('invalid value for argument')
        self.model.extend(forest_cls(n_estimators=n_estimators)
                          for _ in range(n_ensemble))
        self.feature_type = feature_type
        if self.feature_type == 'descriptors':
            self.calc = Calculator(descriptors, ignore_3D=True)
            self.desc_mean = [0] * self.n_ensemble

Source File: adult_RF_Classify.py From Machine-Learning-for-Beginner-by-Python3 with MIT License

6 votes

def Train(data, treecount, tezh, yanzhgdata):
    """Fit a class-balanced random forest and report its error on two data sets.

    The last column of ``data`` and ``yanzhgdata`` is the label; all other
    columns are features.

    data: training array.
    treecount: passed to the forest as ``n_estimators``.
    tezh: passed to the forest as ``max_features``.
    yanzhgdata: validation array.

    Returns ``(train_mse, add_mse)``: the first element of ``fmse(...)`` for
    the training data and for the validation data. Both are also printed.
    """
    forest = RFC(n_estimators=treecount, max_features=tezh, class_weight='balanced')
    forest.fit(data[:, :-1], data[:, -1])

    # Error on the training data
    train_pred = forest.predict(data[:, :-1])
    train_mse = fmse(data[:, -1], train_pred)[0]

    # Error on the validation data
    valid_pred = forest.predict(yanzhgdata[:, :-1])
    add_mse = fmse(yanzhgdata[:, -1], valid_pred)[0]

    print(train_mse, add_mse)
    return train_mse, add_mse

# 最终确定组合的函数

Source File: test_train_pairwise_similarity_model.py From redshells with MIT License

6 votes

def test_run(self):
        """Running the task with model_name='RandomForestClassifier' dumps such a model."""
        # Two items with 2-d embeddings and three labelled item pairs.
        self.input_data['item2embedding'] = {'i0': [1, 2], 'i1': [3, 4]}
        pairs = {
            'item1': ['i0', 'i0', 'i1'],
            'item2': ['i0', 'i1', 'i1'],
            'similarity': [1, 0, 1],
        }
        self.input_data['similarity_data'] = pd.DataFrame(pairs)

        task = TrainPairwiseSimilarityModel(
            item2embedding_task=_DummyTask(),
            similarity_data_task=_DummyTask(),
            model_name='RandomForestClassifier',
            item0_column_name='item1',
            item1_column_name='item2',
            similarity_column_name='similarity')
        # Route the task's I/O through this test case's in-memory helpers.
        task.load = MagicMock(side_effect=self._load)
        task.dump = MagicMock(side_effect=self._dump)

        task.run()

        self.assertIsInstance(self.dump_data, RandomForestClassifier)

Source File: function.py From Karta with MIT License

6 votes

def trainFunctionTypeClassifier(self, scs):
        """Train the type classifier, according to all known code segments.

        Functions flagged by the fptr identifier as pointed functions are
        left out of the training set. The fitted classifier is stored in
        ``self._type_classifier``.

        Args:
            scs (list): list of all known (sark) code segments

        Note:
            Training must happen *after* the calibration phase
        """
        functions = []
        for sc in scs:
            functions.extend(
                func for func in sc.functions
                if not self._analyzer.fptr_identifier.isPointedFunction(func.start_ea))
        clf = RandomForestClassifier(n_estimators=100)
        # One sample and one code-type label per function start address.
        eas = [func.start_ea for func in functions]
        data_set = [self.extractFunctionTypeSample(ea) for ea in eas]
        data_results = [self._analyzer.codeType(ea) for ea in eas]
        # classify
        clf.fit(data_set, data_results)
        # store the results
        self._type_classifier = clf

Source File: Stock_Prediction_Model_Random_Forrest.py From StockRecommendSystem with MIT License

6 votes

def build_model(self, X_train, y_train):
        """
        Return a previously saved model, or construct a new random forest.

        When ``self.paras.load`` is set and ``self.load_training_model``
        returns a model, that model is returned unchanged. Otherwise the
        tree / feature / window ranges from ``self.paras`` are handed to
        ``self.best_window`` and a classifier is constructed from the
        returned optimum. The new classifier is not fitted by this method.
        """
        # Explicit comparison kept: the type of ``paras.load`` is not visible here.
        if self.paras.load == True:
            model = self.load_training_model(self.paras.window_len)
            # Identity test against None instead of ``!=``.
            if model is not None:
                return model

        print('build Random Forrest model...')

        # NOTE(review): ``index`` is not defined inside this method; it has to
        # come from an enclosing or module scope -- confirm.
        # range of number of trees : 5*(1 -> 10) = 5,10,...,50 trees
        t_min = self.paras.tree_min[index]
        t_max = self.paras.tree_max[index]
        # range of max of features : 1 -> 10 features
        f_min = self.paras.feature_min[index]
        f_max = self.paras.feature_max[index]
        # range of window : 1 -> 70 days
        w_min = self.paras.window_min
        w_max = self.paras.window_max

        # The optimal window (first returned value) is not used for construction.
        _, n_opt, m_opt = self.best_window(X_train, y_train, w_min, w_max, t_min, t_max, f_min, f_max)
        model = RandomForestClassifier(n_estimators=n_opt, max_features=m_opt, n_jobs=8, verbose=self.paras.verbose)
        return model

Source File: test_random_forest_classifier.py From monasca-analytics with Apache License 2.0

5 votes

def test_learn_structure(self):
        """learn_structure must return an ``ensemble.RandomForestClassifier``."""
        samples = self.get_testing_data()
        learned = self.rf_sml.learn_structure(samples)
        self.assertIsInstance(learned, ensemble.RandomForestClassifier)

Source File: Blending_Classify_adult.py From Machine-Learning-for-Beginner-by-Python3 with MIT License

5 votes

def RF_First(self, data, n_estimators=800, max_features='sqrt'):
        """First-layer random forest: fit on the training split and record results.

        ``data`` maps 'train', 'test' and 'predict' to arrays whose last
        column is the label. Predictions for the 'test' and 'predict' splits
        are appended to ``self.yanzhneg_pr`` and ``self.predi``, the matching
        true labels are stored in ``self.yanzhneg_real`` and ``self.preal``,
        and the three ``self.F1`` scores (train, test, predict) are stored in
        ``self.error_dict``. Prints a completion message and returns None.
        """
        model = RF(n_estimators=n_estimators, max_features=max_features)
        train_set = data['train']
        model.fit(train_set[:, :-1], train_set[:, -1])

        # Predictions for the training, validation and prediction splits
        fitted_train = model.predict(train_set[:, :-1])
        test_set = data['test']
        fitted_test = model.predict(test_set[:, :-1])
        predict_set = data['predict']
        fitted_predict = model.predict(predict_set[:, :-1])

        # Error of each split, in the order train / validation / prediction
        scores = [
            self.F1(fitted_train, train_set[:, -1]),
            self.F1(fitted_test, test_set[:, -1]),
            self.F1(fitted_predict, predict_set[:, -1]),
        ]

        # Collect the outputs that are combined later on
        self.yanzhneg_pr.append(fitted_test)
        self.yanzhneg_real = test_set[:, -1]
        self.predi.append(fitted_predict)
        self.preal = predict_set[:, -1]

        # Store the errors under the random-forest key
        self.error_dict['随机森林'] = scores
        print('1层中的随机森林运行完毕')
        return None

    # AdaBoost

Source File: classification_randomForest.py From practicalDataAnalysisCookbook with GNU General Public License v2.0

5 votes

def fitRandomForest(data):
    '''
        Build a random forest classifier

        data[0] holds the training features and data[1] the labels.
        Returns the result of fitting the forest on them.
    '''
    # create the classifier object; class_weight="balanced" replaces the
    # "auto" preset, which current scikit-learn releases no longer accept
    forest = en.RandomForestClassifier(n_jobs=-1,
        min_samples_split=100, n_estimators=10,
        class_weight="balanced")

    # fit the data
    return forest.fit(data[0], data[1])

# the file name of the dataset

Source File: mlmodel.py From speech-emotion-recognition with MIT License

5 votes

def __init__(self, **params):
        """Initialise the base model as 'Random Forest' with a 30-tree classifier."""
        # The display name is always forced, overriding any caller-supplied one.
        params.update(name='Random Forest')
        super(RF, self).__init__(**params)
        self.model = RandomForestClassifier(n_estimators=30)

Source File: test_pipe.py From skutil with BSD 3-Clause "New" or "Revised" License

5 votes

def test_pipeline_complex():
    """A pipeline chaining all selective transformers before a forest must fit."""
    kept_columns = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)']
    steps = [
        ('selector', FeatureRetainer(cols=kept_columns)),
        ('scaler', SelectiveScaler()),
        ('boxcox', BoxCoxTransformer()),
        ('pca', SelectivePCA()),
        ('svd', SelectiveTruncatedSVD()),
        ('model', RandomForestClassifier()),
    ]
    pipe = Pipeline(steps)

    pipe.fit(X, iris.target)

Source File: test_random_forest_classifier.py From monasca-analytics with Apache License 2.0

5 votes

def setUp(self):
        """Create the random forest SML under test with a fake id and config."""
        super(TestRandomForestClassifier, self).setUp()
        fake_config = {"module": "fake", "nb_samples": 1000}
        self.rf_sml = random_forest_classifier.RandomForestClassifier(
            "fakeid", fake_config)

Source File: random_forest_classifier.py From monasca-analytics with Apache License 2.0

5 votes

def _get_best_detector(self, train, label):
        """Fit a default-configured random forest on ``train``/``label`` and return it."""
        forest = ensemble.RandomForestClassifier()
        forest.fit(train, label)
        return forest

Source File: random_forest_classifier.py From monasca-analytics with Apache License 2.0

5 votes

def __init__(self, _id, _config):
        """Initialise the base class and keep ``_config['nb_samples']`` as an int."""
        super(RandomForestClassifier, self).__init__(_id, _config)
        sample_count = _config['nb_samples']
        self._nb_samples = int(sample_count)

Source File: models.py From aletheia with MIT License

5 votes

def fit(self, X, y):
        """
        Select features, then train one classifier per random subsample.

        A ``SelectKBest(f_classif)`` selector keeping ``self.max_features``
        features is fitted on ``X``/``y`` and stored in ``self.selector``.
        ``self.n_estimators`` subsamples of at most ``self.max_samples`` rows
        are then drawn by shuffling the row indices, and
        ``self._prepare_classifier`` is mapped over them in a thread pool.
        The resulting classifiers are stored in ``self.clf_list``.

        ``y`` is indexed with a list of row positions, so it is assumed to
        support that form of indexing -- TODO confirm against callers.
        """
        self.selector = SelectKBest(f_classif, k=self.max_features)
        self.selector.fit(X, y)

        X_train = self.selector.transform(X)
        y_train = y

        param_list = []
        # random.shuffle needs a mutable sequence; a bare range() is immutable
        # on Python 3, so materialise the indices as a list.
        idx = list(range(len(y_train)))
        for _ in range(self.n_estimators):
            random.shuffle(idx)
            subset = idx[:self.max_samples]
            param_list.append((X_train[subset], y_train[subset]))

        pool = ThreadPool(cpu_count())
        try:
            self.clf_list = pool.map(self._prepare_classifier, param_list)
        finally:
            # Always shut the pool down, even when a worker raised.
            pool.close()
            pool.join()

        # Disabled second-stage experiment (previously kept in a string literal):
        #
        # X2=[]
        # for clf in self.clf_list:
        #     P=clf.predict_proba(X_train)
        #     if len(X2)==0:
        #         X2=P[:, 0]
        #     else:
        #         X2=numpy.vstack((X2, P[:, 0]))
        # X2=numpy.swapaxes(X2, 0, 1)
        # print "X2:", X2.shape
        #
        # from sklearn.ensemble import RandomForestClassifier
        # self.clf2=RandomForestClassifier(n_estimators=100)
        # self.clf2.fit(X2, y_train)

Source File: pipline.py From MachineLearning with Apache License 2.0

5 votes

def get_rfc():
    """Return a new random forest classifier with this module's fixed settings."""
    hyper_params = dict(
        n_estimators=100,
        max_features=0.5,
        max_depth=None,
        max_leaf_nodes=270,
        min_impurity_decrease=0.0001,
        random_state=123,
        n_jobs=-1,
    )
    return RandomForestClassifier(**hyper_params)

Source File: classifier.py From stock-price-prediction with MIT License

5 votes

def performRFClass(X_train, y_train, X_test, y_test, parameters, savemodel):
    """
    Random Forest Binary Classification

    Fits a 1000-tree forest on the training data and returns the value of
    its ``score`` method on the test data. ``parameters`` and ``savemodel``
    are accepted but not used here.
    """
    forest = RandomForestClassifier(n_estimators=1000, n_jobs=-1)
    forest.fit(X_train, y_train)
    return forest.score(X_test, y_test)

Source File: pu_learning.py From LogClass with MIT License

5 votes

def instatiate_pu_adapter(params, **kwargs):
    """
        Returns a RF adapted to do PU Learning wrapped by the PUAdapterWrapper.

        Keyword arguments are forwarded to the random forest and override
        the defaults below; ``params`` is handed to the wrapper unchanged.
    """
    defaults = {
        'n_estimators': 10,
        'criterion': "entropy",
        'bootstrap': True,
        'n_jobs': -1,
    }
    # Caller-supplied keyword arguments take precedence over the defaults.
    forest_kwargs = dict(defaults, **kwargs)
    forest = RandomForestClassifier(**forest_kwargs)
    return PUAdapterWrapper(PUAdapter(forest), params)

Source File: test_pipe.py From skutil with BSD 3-Clause "New" or "Revised" License

5 votes

def test_pipeline_basic():
    """A select -> scale -> random-forest pipeline must fit on the iris data."""
    kept_columns = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)']
    steps = [
        ('selector', FeatureRetainer(cols=kept_columns)),
        ('scaler', SelectiveScaler()),
        ('model', RandomForestClassifier()),
    ]
    pipe = Pipeline(steps)

    pipe.fit(X, iris.target)

Source File: function.py From Karta with MIT License

5 votes

def calibrateFunctionTypeClassifier(self, scs):
        """Calibrate the type classifier, according to all known code segments.

        Two rounds are run over the same functions. The first ("Calibration")
        ranks the classifier offsets by feature importance and keeps the top
        ``self._feature_size`` of them; the second ("Testing") checks the
        accuracy against ``CALIBRATION_LOWER_BOUND``.

        Args:
            scs (list): list of all known (sark) code segments

        Return Value:
            True iff the calibration was successfully and is more accurate than the assigned lower bound
        """
        functions = []
        for sc in scs:
            functions.extend(
                func for func in sc.functions
                if not self._analyzer.fptr_identifier.isPointedFunction(func.start_ea))
        # 1st round - calibration, 2nd round - test
        for training_round, round_name in enumerate(("Calibration", "Testing")):
            clf = RandomForestClassifier(n_estimators=100)
            # Samples are rebuilt every round; presumably extraction depends on
            # the offsets narrowed down in the first round -- confirm.
            eas = [func.start_ea for func in functions]
            data_set = [self.extractFunctionTypeSample(ea) for ea in eas]
            data_results = [self._analyzer.codeType(ea) for ea in eas]
            # NOTE(review): test_size=0.7 holds out 70% of the samples for
            # testing, the reverse of the "70% train / 30% test" split the
            # original comment described -- confirm which one is intended.
            X_train, X_test, Y_train, Y_test = train_test_split(data_set, data_results, test_size=0.7, random_state=5)
            # classify
            clf.fit(X_train, Y_train)
            # test
            Y_pred = clf.predict(X_test)
            accuracy = metrics.accuracy_score(Y_test, Y_pred)
            self._analyzer.logger.info("%s: Function accuracy Type Accuracy: %.2f%%", round_name, accuracy * 100)
            if training_round == 0:
                # Keep only the most important offsets (first round only).
                ranked = sorted(zip(self._classifier_type_offsets, clf.feature_importances_),
                                key=lambda pair: pair[1], reverse=True)
                self._classifier_type_offsets = [pair[0] for pair in ranked[:self._feature_size]]
            elif accuracy < CALIBRATION_LOWER_BOUND:
                self._analyzer.logger.error("Function Prologue Type Accuracy is too low, can't continue: %.2f%% < %.2f%%", accuracy * 100, CALIBRATION_LOWER_BOUND * 100)
                return False
        # Both rounds completed without falling below the lower bound
        return True

Source File: annotation.py From scVI with MIT License

5 votes

def compute_accuracy_rf(
    data_train, labels_train, data_test, labels_test, param_grid=None, verbose=0
):
    """Grid-search a random forest and score it with ``compute_accuracy_classifier``.

    When ``param_grid`` is None, the search covers max_depth 3..9 and
    n_estimators in (10, 50, 100, 200). The search uses 3-fold cross
    validation.
    """
    grid = param_grid
    if grid is None:
        grid = {"max_depth": np.arange(3, 10), "n_estimators": [10, 50, 100, 200]}
    base_forest = RandomForestClassifier(max_depth=2, random_state=0)
    search = GridSearchCV(base_forest, grid, verbose=verbose, cv=3)
    return compute_accuracy_classifier(
        search, data_train, labels_train, data_test, labels_test
    )

Source File: advanced_supvervised_model_trainer.py From healthcareai-py with MIT License

5 votes

def random_forest_classifier(self,
                                 trees=200,
                                 scoring_metric='roc_auc',
                                 hyperparameter_grid=None,
                                 randomized_search=True,
                                 number_iteration_samples=5):
        """
        A light wrapper for Sklearn's random forest classifier that performs
        randomized search over an overridable default hyperparameter grid.

        Args:
            trees (int): number of trees to use if not performing a
                randomized grid search
            scoring_metric (str): Any sklearn scoring metric appropriate for
                classification
            hyperparameter_grid (dict): hyperparameters by name. When None, a
                default grid is built and ``number_iteration_samples`` is
                reset to 5.
            randomized_search (bool): True for randomized search (default)
            number_iteration_samples (int): Number of models to train during
                the randomized search for exploring the hyperparameter space.
                More may lead to a better model, but will take longer.

        Returns:
            TrainedSupervisedModel:
        """
        self.validate_classification('Random Forest Classifier')

        if hyperparameter_grid is None:
            # Default grid: max_features candidates come from the mtry helper.
            max_features = hcai_helpers.calculate_random_forest_mtry_hyperparameter(
                len(self.X_test.columns), self.model_type)
            hyperparameter_grid = {'n_estimators': [100, 200, 300], 'max_features': max_features}
            number_iteration_samples = 5

        algorithm = get_algorithm(
            RandomForestClassifier,
            scoring_metric,
            hyperparameter_grid,
            randomized_search,
            number_iteration_samples=number_iteration_samples,
            n_estimators=trees)

        return self._create_trained_supervised_model(algorithm)

Source File: transpile.py From go-ml-transpiler with Apache License 2.0

5 votes

def main(export_dir):
    """Train an XGBoost and a random forest classifier and transpile both.

    Each model is written to its own sub-directory of ``export_dir`` ("xgb"
    and "rfc"). The sub-directories are created with ``os.mkdir``.
    """

    def export(model, package):
        # Transpile `model` into <export_dir>/<package>, exporting "predict".
        target = os.path.join(export_dir, package)
        os.mkdir(target)
        transpiler = Transpiler(model)
        transpiler.transpile(package_name=package, method_name="predict", export_method=True)
        transpiler.write(target)

    # load dataset
    x, y = load_dataset(return_X_y=True)

    # train and transpile xgb
    xgbc = xgb.XGBClassifier(n_estimators=100, max_depth=7)
    xgbc.fit(x, y)
    export(xgbc, "xgb")
    print("xgb done.")

    # train and transpile rfc
    rfc = RFC(n_estimators=100, max_depth=7)
    rfc.fit(x, y)
    export(rfc, "rfc")
    print("rfc done.")

Source File: common_utils.py From interpret-text with MIT License

5 votes

def create_random_forest_tfidf():
    """Return a pipeline of a TF-IDF vectorizer followed by a 500-tree random forest.

    The vectorizer is created with ``lowercase=False`` and the forest with a
    fixed ``random_state`` of 777.
    """
    steps = [
        ("vectorizer", TfidfVectorizer(lowercase=False)),
        ("rf", RandomForestClassifier(n_estimators=500, random_state=777)),
    ]
    return Pipeline(steps)

Source File: malss.py From malss with MIT License

5 votes

def select_features(self):
        """
        Drop columns from ``self.data`` using a random forest.

        Warns and returns early when no data is available. A regressor is
        used for the 'regression' task and a classifier otherwise. If
        ``drop_col`` removed at least one column, the algorithm list is
        rebuilt and ``self.is_ready`` is set to True.
        """
        if self.data is None:
            warnings.warn("'drop_col' must be used after 'fit' has used.")
            return

        forest_cls = RandomForestRegressor if self.task == 'regression' else RandomForestClassifier
        rf = forest_cls(random_state=0, oob_score=True, n_estimators=50, n_jobs=self.n_jobs)

        columns_before = len(self.data.X.columns)
        self.data.drop_col(rf)
        columns_after = len(self.data.X.columns)
        if columns_after < columns_before:
            self.algorithms = self.__choose_algorithm()
            self.is_ready = True

Source File: rfpimp.py From malss with MIT License

5 votes

def importances_raw(rf, X_train, y_train, n_samples=5000):
    """Return raw permutation importances for a random forest.

    A classifier is scored with ``oob_classifier_accuracy`` and a regressor
    with ``oob_regression_r2_score``. Any other ``rf`` type yields None.
    """
    if isinstance(rf, RandomForestClassifier):
        metric = oob_classifier_accuracy
    elif isinstance(rf, RandomForestRegressor):
        metric = oob_regression_r2_score
    else:
        return None
    return permutation_importances_raw(rf, X_train, y_train, metric, n_samples)

Source File: rfpimp.py From malss with MIT License

5 votes

def oob_dropcol_importances(rf, X_train, y_train):
    """
    Compute drop-column feature importances for scikit-learn.

    Given a RandomForestClassifier or RandomForestRegressor in rf
    and training X and y data, return a data frame indexed by Feature
    with an Importance column, sorted in descending order of importance.

    A clone of rf is trained once to get the baseline out of bag (OOB)
    score and then once more per feature with that feature's column
    dropped. The importance of a feature is the baseline score minus the
    score obtained without it.

    return: A data frame with Feature as index and an Importance column

    SAMPLE CODE

    rf = RandomForestRegressor(n_estimators=100, n_jobs=-1, oob_score=True)
    X_train, y_train = ..., ...
    rf.fit(X_train, y_train)
    imp = oob_dropcol_importances(rf, X_train, y_train)
    """
    def oob_score_for(features):
        # Train a fresh clone with a fixed random_state and read its OOB score.
        model = clone(rf)
        model.random_state = 999
        model.fit(features, y_train)
        return model.oob_score_

    baseline = oob_score_for(X_train)
    drops = [baseline - oob_score_for(X_train.drop(col, axis=1))
             for col in X_train.columns]
    table = pd.DataFrame(data={'Feature': X_train.columns, 'Importance': np.array(drops)})
    return table.set_index('Feature').sort_values('Importance', ascending=False)

Python sklearn.ensemble.RandomForestClassifier() Examples