Python sklearn.grid_search.RandomizedSearchCV() Examples
The following are 12 code examples of sklearn.grid_search.RandomizedSearchCV(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.grid_search, or try the search function.
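Before the project examples, a minimal sketch of the basic call pattern may help (this snippet is ours, not taken from any of the projects below). Note that sklearn.grid_search was deprecated in scikit-learn 0.18 and removed in 0.20; in current releases the same class is available from sklearn.model_selection, so the import below falls back accordingly.

# Minimal usage sketch (illustrative only, not from the projects below).
from scipy.stats import randint
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier

try:
    from sklearn.grid_search import RandomizedSearchCV      # legacy module (pre-0.18)
except ImportError:
    from sklearn.model_selection import RandomizedSearchCV  # scikit-learn >= 0.18

iris = load_iris()
X, y = iris.data, iris.target

# Sample 10 parameter settings at random from these distributions.
param_dist = {'n_estimators': randint(10, 100), 'max_depth': randint(2, 8)}
search = RandomizedSearchCV(RandomForestClassifier(random_state=0),
                            param_distributions=param_dist,
                            n_iter=10, cv=3, random_state=0)
search.fit(X, y)
print(search.best_params_)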
Example #1
Source File: Model_Parameters_CV.py From ProFET with GNU General Public License v3.0 | 6 votes |
def GridParamSearch(param_dist, clf, X, y, n_iter_search=15):
    '''
    Searches for the best model parameters using randomized CV search;
    different parameters are searched depending on the model type.
    http://nbviewer.ipython.org/github/treycausey/thespread/blob/master/notebooks/basic_random_forest_wp_model.ipynb?create=1
    @param clf: estimator/predictor used.
    @param param_dist: grid of parameter ranges to tune for the predictor,
        using randomized CV search.
    '''
    print("Starting grid parameter search")
    random_search = RandomizedSearchCV(clf, param_distributions=param_dist,
                                       n_iter=n_iter_search, n_jobs=-1)
    start = time()
    # random_search.fit(features, target)
    random_search.fit(X, y)
    print("RandomizedSearchCV took %.2f seconds for %d candidate"
          " parameter settings." % ((time() - start), n_iter_search))
    report(random_search.grid_scores_)
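Note that report(random_search.grid_scores_) above relies on the legacy grid_scores_ attribute, which exists only in the deprecated sklearn.grid_search implementation. As a rough equivalent for the modern sklearn.model_selection version, a small helper over cv_results_ might look like the following sketch (the name report_cv_results is ours, not ProFET's):

import numpy as np

def report_cv_results(cv_results, n_top=3):
    """Print the n_top best parameter settings from a fitted search's cv_results_ dict."""
    for rank in range(1, n_top + 1):
        for i in np.flatnonzero(cv_results['rank_test_score'] == rank):
            print("Model rank: {0}".format(rank))
            print("Mean validation score: {0:.3f} (std: {1:.3f})".format(
                cv_results['mean_test_score'][i],
                cv_results['std_test_score'][i]))
            print("Parameters: {0}".format(cv_results['params'][i]))

After fitting, it would be called as report_cv_results(random_search.cv_results_).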
Example #2
Source File: test_search_2.py From spark-sklearn with Apache License 2.0 | 6 votes |
def test_example_randomized_search(self):
    # The classic example from the sklearn documentation
    iris = datasets.load_iris()
    parameters = {'kernel': ('linear', 'rbf'), 'C': range(1, 10)}
    svr = svm.SVC()
    clf = grid_search.RandomizedSearchCV(svr, parameters, random_state=4)
    clf.fit(iris.data, iris.target)
    clf2 = RandomizedSearchCV(self.sc, svr, parameters, random_state=4)
    clf2.fit(iris.data, iris.target)
    b1 = clf.estimator
    b2 = clf2.estimator
    self.assertEqual(b1.get_params(), b2.get_params())
Example #3
Source File: winfault.py From wt-fdd with GNU General Public License v3.0 | 5 votes |
def svm_class_and_score(
        X_train, y_train, X_test, y_test, labels,
        search_type=RandomizedSearchCV,
        parameter_space={
            'kernel': ['linear', 'rbf', 'poly'],
            'gamma': ['auto', 1e-3, 1e-4],
            'C': [0.01, .1, 1, 10, 100, 1000],
            'class_weight': [
                {0: 0.01}, {1: 1}, {1: 2}, {1: 10}, {1: 50}, 'balanced']},
        score='recall_weighted', iid=True, bagged=False, svm_results=True):
    """Build an SVM and return its scoring metrics"""
    print("# Tuning hyper-parameters for %s" % score)
    print()

    # Find the Hyperparameters
    clf = search_type(SVC(C=1), parameter_space, cv=10,
                      scoring=score, iid=iid)

    # Build the SVM
    clf.fit(X_train, y_train)
    print("Hyperparameters found:")
    print(clf.best_params_)

    # Make the predictions
    y_pred = clf.predict(X_test)

    print()
    print()
    print("Results for basic SVM")
    clf_scoring(y_test, y_pred, labels)

    if bagged is True:
        bgg = BaggingClassifier(base_estimator=clf)
        bgg.fit(X_train, y_train)
        y_pred = bgg.predict(X_test)
        print()
        print()
        print("Results for bagging:")
        clf_scoring(y_test, y_pred, labels)
        return clf, bgg
    else:
        return clf
Example #4
Source File: tunemodels.py From Supply-demand-forecasting with MIT License | 5 votes |
def runGridSearch(self, model):
    logging.debug("run grid search on model: {}".format(model.__class__.__name__))
    logging.debug("cross validation strategy: {}".format(model.holdout_split))
    logging.debug("used features: {}".format(model.usedFeatures))
    logging.debug("tuned parameters: {}".format(model.getTunedParamterOptions()))

    features, labels, cv = model.getFeaturesLabel()

    # do grid search
    if self.do_random_gridsearch:
        estimator = RandomizedSearchCV(
            model.clf, model.getTunedParamterOptions(), cv=cv,
            n_jobs=self.n_jobs,
            scoring=mean_absolute_percentage_error_scoring,
            verbose=500, n_iter=self.n_iter_randomsearch)
    else:
        estimator = GridSearchCV(
            model.clf, model.getTunedParamterOptions(), cv=cv,
            n_jobs=-self.n_jobs, fit_params=model.get_fit_params(),
            scoring=mean_absolute_percentage_error_scoring, verbose=500)

    estimator.fit(features, labels)
    model.clf = estimator.best_estimator_
    model.save_final_model = True
    model.save_model()

    # model.dispFeatureImportance()
    logging.debug('estimator parameters: {}'.format(estimator.get_params))
    logging.debug('Best parameters: {}'.format(estimator.best_params_))
    logging.debug('Best Scores: {}'.format(-estimator.best_score_))
    logging.debug('Score grid: {}'.format(estimator.grid_scores_))
    for i in estimator.grid_scores_:
        logging.debug('parameters: {}'.format(i.parameters))
        logging.debug('mean_validation_score: {}'.format(np.absolute(i.mean_validation_score)))
        logging.debug('cv_validation_scores: {}'.format(np.absolute(i.cv_validation_scores)))
    return
Example #5
Source File: classification.py From pyImSegm with BSD 3-Clause "New" or "Revised" License | 5 votes |
def create_classif_search(name_clf, clf_pipeline, nb_labels,
                          search_type='random', cross_val=10,
                          eval_metric='f1', nb_iter=250, nb_workers=5):
    """ create sklearn search depending on spec. random or grid

    :param int nb_labels: number of labels
    :param str search_type: hyper-params search type
    :param str eval_metric: evaluation metric
    :param int nb_iter: for random number of tries
    :param str name_clf: name of classif.
    :param obj clf_pipeline: object
    :param obj cross_val: obj specific CV for fix train-test
    :param int nb_workers: number jobs running in parallel
    :return:
    """
    score_weight = 'weighted' if nb_labels > 2 else 'binary'
    scoring = metrics.make_scorer(DICT_SCORING[eval_metric.lower()],
                                  average=score_weight)
    if search_type == 'grid':
        clf_parameters = create_clf_param_search_grid(name_clf)
        logging.info('init Grid search...')
        clf_search = GridSearchCV(
            clf_pipeline, clf_parameters, scoring=scoring, cv=cross_val,
            n_jobs=nb_workers, verbose=1, refit=True)
    else:
        clf_parameters = create_clf_param_search_distrib(name_clf)
        nb_iter = search_params_cut_down_max_nb_iter(clf_parameters, nb_iter)
        logging.info('init Randomized search...')
        clf_search = RandomizedSearchCV(
            clf_pipeline, clf_parameters, scoring=scoring, cv=cross_val,
            n_jobs=nb_workers, n_iter=nb_iter, verbose=1, refit=True)
    return clf_search
Example #6
Source File: test_sklearn.py From scikit-neuralnetwork with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_RandomGlobalParams(self):
    clf = RandomizedSearchCV(
        self.__estimator__(layers=[L("Sigmoid")], n_iter=1),
        param_distributions={'learning_rate': uniform(0.001, 0.01)},
        n_iter=2)
    clf.fit(self.a_in, self.a_out)
Example #7
Source File: test_sklearn.py From scikit-neuralnetwork with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_RandomLayerParams(self):
    clf = RandomizedSearchCV(
        self.__estimator__(layers=[L("Rectifier", units=12), L(self.__output__)], n_iter=1),
        param_distributions={'hidden0__units': randint(4, 12)},
        n_iter=2)
    clf.fit(self.a_in, self.a_out)
Example #8
Source File: test_sklearn.py From scikit-neuralnetwork with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_RandomMultipleJobs(self):
    clf = RandomizedSearchCV(
        self.__estimator__(layers=[L("Sigmoid", units=12), L(self.__output__)], n_iter=1),
        param_distributions={'hidden0__units': randint(4, 12)},
        n_iter=4, n_jobs=4)
    clf.fit(self.a_in, self.a_out)
Example #9
Source File: test_search_2.py From spark-sklearn with Apache License 2.0 | 5 votes |
def test_cv_linreg(self):
    pipeline = SKL_Pipeline([
        ('lasso', SKL_Lasso(max_iter=1))
    ])
    parameters = {
        'lasso__alpha': np.linspace(0.001, 0.01, 1000)
    }
    n_iter = 10
    grid_search = RandomizedSearchCV(self.sc, pipeline, parameters, n_iter=n_iter)
    X = scipy.sparse.vstack(map(lambda x: self.list2csr([x, x + 1.0]), range(0, 100)))
    y = np.array(list(range(0, 100))).reshape((100, 1))
    skl_gs = grid_search.fit(X, y)
    assert len(skl_gs.cv_results_['params']) == n_iter
Example #10
Source File: ml.py From EDeN with MIT License | 4 votes |
def fit_estimator(estimator, positive_data_matrix=None,
                  negative_data_matrix=None, target=None,
                  cv=10, n_jobs=-1, n_iter_search=40, random_state=1):
    """fit_estimator."""
    # hyperparameter optimization
    param_dist = {"n_iter": randint(5, 100),
                  "power_t": uniform(0.1),
                  "alpha": uniform(1e-08, 1e-03),
                  "eta0": uniform(1e-03, 1),
                  "penalty": ["l1", "l2", "elasticnet"],
                  "learning_rate": ["invscaling", "constant", "optimal"]}
    scoring = 'roc_auc'
    n_iter_search = n_iter_search
    random_search = RandomizedSearchCV(estimator,
                                       param_distributions=param_dist,
                                       n_iter=n_iter_search,
                                       cv=cv,
                                       scoring=scoring,
                                       n_jobs=n_jobs,
                                       random_state=random_state,
                                       refit=True)
    X, y = make_data_matrix(positive_data_matrix=positive_data_matrix,
                            negative_data_matrix=negative_data_matrix,
                            target=target)
    random_search.fit(X, y)

    logger.debug('\nClassifier:')
    logger.debug('%s' % random_search.best_estimator_)
    logger.debug('\nPredictive performance:')
    # assess the generalization capacity of the model via a 10-fold cross
    # validation
    scoring_strings = ['accuracy', 'precision', 'recall', 'f1',
                       'average_precision', 'roc_auc']
    for scoring in scoring_strings:
        scores = cross_validation.cross_val_score(
            random_search.best_estimator_, X, y,
            cv=cv, scoring=scoring, n_jobs=n_jobs)
        logger.debug('%20s: %.3f +- %.3f' % (scoring, np.mean(scores), np.std(scores)))

    return random_search.best_estimator_
Example #11
Source File: test_big.py From skutil with BSD 3-Clause "New" or "Revised" License | 4 votes |
def test_large_grid():
    """In this test, we purposely overfit a RandomForest to completely random data
    in order to assert that the test error will far exceed the train error.
    """
    if not SK18:
        custom_cv = KFold(n=y_train.shape[0], n_folds=3, shuffle=True, random_state=42)
    else:
        custom_cv = KFold(n_splits=3, shuffle=True, random_state=42)

    # define the pipe
    pipe = Pipeline([
        ('scaler', SelectiveScaler()),
        ('pca', SelectivePCA(weight=True)),
        ('rf', RandomForestClassifier(random_state=42))
    ])

    # define hyper parameters
    hp = {
        'scaler__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()],
        'pca__whiten': [True, False],
        'pca__weight': [True, False],
        'pca__n_components': uniform(0.75, 0.15),
        'rf__n_estimators': randint(5, 10),
        'rf__max_depth': randint(5, 15)
    }

    # define the grid
    grid = RandomizedSearchCV(pipe, hp, n_iter=2, scoring='accuracy',
                              n_jobs=1, cv=custom_cv, random_state=42)

    # this will fail because we haven't fit yet
    assert_fails(grid.score, (ValueError, AttributeError), X_train, y_train)

    # fit the grid
    grid.fit(X_train, y_train)

    # score for coverage -- this might warn...
    with warnings.catch_warnings():
        warnings.simplefilter("ignore")
        grid.score(X_train, y_train)

    # coverage:
    assert grid._estimator_type == 'classifier'

    # get predictions
    tr_pred, te_pred = grid.predict(X_train), grid.predict(X_test)

    # evaluate score (SHOULD be better than random...)
    accuracy_score(y_train, tr_pred), accuracy_score(y_test, te_pred)

    # grid score reports:
    # assert fails for bad percentile
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 0.0})
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'percentile': 1.0})

    # assert fails for bad y_axis
    assert_fails(report_grid_score_detail, ValueError,
                 **{'random_search': grid, 'y_axis': 'bad_axis'})

    # assert passes otherwise
    report_grid_score_detail(grid, charts=True, percentile=0.95)  # just ensure percentile works
Example #12
Source File: mnist_parallel.py From mHTM with MIT License | 4 votes |
def main_local(log_dir, ntrain=800, ntest=200, niter=5, nsplits=3,
               global_inhibition=True, ncores=4, seed=None):
    """
    Perform CV on a subset of the MNIST dataset. Performs parallelizations on
    a local machine.

    @param log_dir: The directory to store the results in.
    @param ntrain: The number of training samples to use.
    @param ntest: The number of testing samples to use.
    @param niter: The number of parameter iterations to use.
    @param nsplits: The number of splits of the data to use.
    @param global_inhibition: If True use global inhibition; otherwise, use
        local inhibition.
    @param ncores: The number of cores to use.
    @param seed: The seed for the random number generators.
    """
    # Run the initialization
    x, y, kargs, params, cv = main(log_dir, ntrain, ntest, niter, nsplits, seed)

    # Build the classifier for doing CV
    clf = RandomizedSearchCV(
        estimator=SPRegion(**kargs),
        param_distributions=params,
        n_iter=niter,              # Total runs
        n_jobs=ncores,             # Use this many cores
        pre_dispatch=1 * ncores,   # Dispatch one job per core at a time
        iid=True,                  # Data is iid across folds
        cv=cv,                     # The CV split for the data
        refit=False,               # Disable fitting best estimator on full dataset
        random_state=seed          # Force same SP across runs
    )

    # Fit the models
    clf.fit(x, y)

    # Extract the CV results
    parameter_names = sorted(clf.grid_scores_[0].parameters.keys())
    parameter_names.pop(parameter_names.index('log_dir'))
    parameter_values = np.zeros((niter, len(parameter_names)))
    results = np.zeros((niter, nsplits))
    for i, score in enumerate(clf.grid_scores_):
        parameter_values[i] = np.array([score.parameters[k] for k in parameter_names])
        results[i] = score.cv_validation_scores

    # Save the CV results
    with open(os.path.join(log_dir, 'cv_results.pkl'), 'wb') as f:
        cPickle.dump((parameter_names, parameter_values, results), f,
                     cPickle.HIGHEST_PROTOCOL)
    with open(os.path.join(log_dir, 'cv_clf.pkl'), 'wb') as f:
        cPickle.dump((clf.grid_scores_, clf.best_score_, clf.best_params_), f,
                     cPickle.HIGHEST_PROTOCOL)