Python sklearn.pipeline.Pipeline() Examples
The following are 30 code examples of sklearn.pipeline.Pipeline().
Example #1
Source File: From rasa_lookup_demo with Apache License 2.0 | 8 votes |
def run_logreg(X_train, y_train, selection_threshold=0.2):
    """Fit a feature-selection + logistic-regression pipeline and print its accuracy.

    Parameters
    ----------
    X_train, y_train
        Training features and labels, passed to ``Pipeline.fit`` and
        ``Pipeline.score``.
    selection_threshold : float
        Forwarded to ``RandomizedLogisticRegression``.

    Returns
    -------
    Pipeline
        The fitted two-step pipeline.
    """
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))
    pipe = Pipeline(
        [
            (
                "feature_selection",
                RandomizedLogisticRegression(selection_threshold=selection_threshold),
            ),
            ("classification", LogisticRegression()),
        ]
    )
    # The pipeline has to be fitted before it can be scored.
    pipe.fit(X_train, y_train)
    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
    # NOTE(review): X_test / y_test are not parameters of this function;
    # presumably they are module-level names -- confirm against the caller.
    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
    return pipe
Example #2
Source File: From gordo with GNU Affero General Public License v3.0 | 8 votes |
def test_keras_autoencoder_scoring(model, kind, n_features_out):
    """
    Test the KerasAutoEncoder and KerasLSTMAutoEncoder have a working scoring function
    """
    Model = pydoc.locate(f"gordo.machine.model.models.{model}")
    model = Pipeline([("model", Model(kind=kind))])
    X = np.random.random((8, 2))

    # Should be able to deal with y output different than X input features
    y = np.random.random((8, n_features_out))

    # Scoring before fitting must fail ...
    with pytest.raises(NotFittedError):
        model.score(X, y)

    # ... and must work once the pipeline has been fitted.
    model.fit(X, y)
    score = model.score(X, y)
    # NOTE(review): assumes a module-level ``logger`` is in scope -- confirm.
    logger.info(f"Score: {score:.4f}")
Example #3
Source File: From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_stability_selection_regression():
    """Stability selection over a scaled Lasso should recover the important betas."""
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    # The selector must be fitted before get_support() is queried.
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas)
Example #4
Source File: From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_with_complementary_pairs_bootstrap():
    """Same recovery check, using the 'complementary_pairs' bootstrap function."""
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid,
                                  bootstrap_func='complementary_pairs')
    # The selector must be fitted before get_support() is queried.
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas)
Example #5
Source File: From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_different_shape():
    """Fit on X, then call transform() on an X with two fewer columns."""
    n, p, k = 100, 200, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    # NOTE(review): presumably this call is expected to raise because the
    # column count differs from the fitted data; any expected-exception
    # decorator would sit outside this view -- confirm.
    selector.transform(X[:, :-2])
Example #6
Source File: From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_no_features():
    """With k=0 the fitted selector should transform X to zero columns."""
    n, p, k = 100, 200, 0

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    # The selector must be fitted before transform() is called.
    selector.fit(X, y)

    # Expected output: same number of rows as X, no columns.
    assert_almost_equal(selector.transform(X),
                        np.empty(0).reshape((X.shape[0], 0)))
Example #7
Source File: From healthcareai-py with MIT License | 6 votes |
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True, imputeStrategy='MeanMode',
                  tunedRandomForest=False, numeric_columns_as_categorical=None):
    """
    Assemble the data preparation pipeline.

    The returned ``Pipeline`` chains, in order: a column-suffix filter, removal
    of the grain column, imputation, a null-row filter, conversion of the target
    to binary, conversion of the predicted column to numeric, and dummy-variable
    creation (excluding the predicted column).

    Advanced users may prefer to build their own custom pipeline.
    """
    # Note: this could be done more elegantly using FeatureUnions _if_ you are not using pandas dataframes for
    #   inputs of the later pipelines as FeatureUnion intrinsically converts outputs to numpy arrays.
    steps = [
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        (
            'imputation',
            hcai_transformers.DataFrameImputer(
                impute=impute,
                verbose=verbose,
                imputeStrategy=imputeStrategy,
                tunedRandomForest=tunedRandomForest,
                numeric_columns_as_categorical=numeric_columns_as_categorical,
            ),
        ),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        (
            'convert_target_to_binary',
            hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column),
        ),
        (
            'prediction_to_numeric',
            hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column),
        ),
        (
            'create_dummy_variables',
            hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column]),
        ),
    ]
    return Pipeline(steps)
Example #8
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_imputation_pipeline_grid_search():
    """Smoke test: grid-search imputation strategies inside a pipeline."""
    # Test imputation within a pipeline + gridsearch.
    X = sparse_random_matrix(100, 100, density=0.10)
    # Use the first stored value of the sparse matrix as the "missing" marker.
    missing_values = X.data[0]

    pipeline = Pipeline([('imputer',
                          SimpleImputer(missing_values=missing_values)),
                         ('tree',
                          tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"]
    }

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    # Running the search is the whole test; it passes if nothing raises.
    gs.fit(X, Y)
Example #9
Source File: From dirty_cat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def make_pipeline(encoding_method):
    """Return a ColumnTransformer -> StandardScaler -> RidgeCV pipeline.

    Every column in ``clean_columns`` gets its configured encoder, and
    ``dirty_column`` is encoded with ``encoding_method``; all other columns
    are dropped.
    """
    # static transformers from the other columns
    clean_steps = [
        (enc + '_' + col, encoders_dict[enc], [col])
        for col, enc in clean_columns.items()
    ]
    # the encoded (dirty) column is appended last
    dirty_step = (encoding_method, encoders_dict[encoding_method], [dirty_column])

    # Use ColumnTransformer to combine the features
    union = ColumnTransformer(
        transformers=clean_steps + [dirty_step],
        remainder='drop')
    return Pipeline([
        ('union', union),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])


#########################################################################
# Fitting each encoding methods with a RidgeCV
# --------------------------------------------
# Eventually, we loop over the different encoding methods,
# instantiate each time a new pipeline, fit it
# and store the returned cross-validation score:
Example #10
Source File: From dirty_cat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def make_pipeline(encoding_method):
    """Return a ColumnTransformer -> StandardScaler -> RandomForest pipeline.

    ``clean_columns`` are one-hot encoded and ``dirty_column`` is encoded with
    ``encoding_method``; all other columns are dropped.
    """
    # static transformers from the other columns
    clean_step = ('one-hot-clean', encoder_dict['one-hot'], clean_columns)
    # the encoded (dirty) column is appended last
    dirty_step = (encoding_method + '-dirty', encoder_dict[encoding_method],
                  [dirty_column])

    # Use ColumnTransformer to combine the features
    union = ColumnTransformer(
        transformers=[clean_step, dirty_step],
        remainder='drop')
    return Pipeline([
        ('union', union),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over encoding methods, scoring the different pipeline predictions
# using a cross validation score:
Example #11
Source File: From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 6 votes |
def create_union_model(params=None):
    """Build a FeatureUnion(linguistic stats + tf-idf) -> MultinomialNB pipeline.

    Parameters
    ----------
    params : dict, optional
        If truthy, applied to the pipeline via ``Pipeline.set_params``.

    Returns
    -------
    Pipeline
        The (unfitted) pipeline.
    """

    def preprocessor(tweet):
        """Lower-case a tweet, apply the emoticon and regex replacements,
        and turn '-' / '_' into spaces."""
        tweet = tweet.lower()

        # Emoticon replacements are applied in the order given by emo_repl_order.
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        # .items() instead of the Python-2-only .iteritems(), which raises
        # AttributeError on Python 3 dicts.
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Example #12
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_set_params_passes_all_parameters():
    """Nested estimators must receive all their parameters in one set_params call.

    Regression test for #9944.
    """
    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs comes from the enclosing test scope
            assert kwargs == expected_kwargs
            return self

    wrapped_in_pipeline = Pipeline([('estimator', TestDecisionTree())])
    wrapped_in_search = GridSearchCV(TestDecisionTree(), {})
    for wrapper in (wrapped_in_pipeline, wrapped_in_search):
        wrapper.set_params(estimator__max_depth=5,
                           estimator__min_samples_leaf=2)
Example #13
Source File: From gordo with GNU Affero General Public License v3.0 | 6 votes |
def load(source_dir: Union[os.PathLike, str]) -> Any:
    """
    Load an object from a directory, saved by
    ``gordo.serializer.pipeline_serializer.dump``

    The object is read from the file ``model.pkl`` located directly inside
    ``source_dir`` and returned unserialized.

    Parameters
    ----------
    source_dir: Union[os.PathLike, str]
        Directory containing the ``model.pkl`` file

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]

    Raises
    ------
    OSError
        If ``model.pkl`` cannot be opened (e.g. ``FileNotFoundError``).
    """
    model_path = os.path.join(source_dir, "model.pkl")
    # SECURITY: unpickling can execute arbitrary code; only load files that
    # come from a trusted source.
    with open(model_path, "rb") as f:
        return pickle.load(f)
Example #14
Source File: From fake-news-detection with MIT License | 6 votes |
def run(self):
    '''
    Runs a model with params p.

    Applies ``self.params`` to ``self.clf``, wraps the classifier in a
    single-step pipeline, fits it on the training data and stores the
    predicted probabilities (column 1 of ``predict_proba``) for
    ``self.X_test`` in ``self.y_pred_probs``. For the listed model types
    the feature importances / coefficients are stored in
    ``self.importances``.
    '''
    self.clf.set_params(**self.params)
    # f = get_feature_transformer(self.parser)
    # self.X_train_fts = f.fit_transform(self.X_train)
    # self.X_test_fts = f.transform(self.X_test)
    self.pipeline = Pipeline([
        # ('feature_gen', f),
        ('clf', self.clf),
    ])
    # fit() returns the pipeline itself, so prediction can be chained onto it.
    self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]
    if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
        self.importances = self.clf.feature_importances_
    elif self.model_type in ['SVM', 'LR', 'SGD']:
        self.importances = self.clf.coef_[0]
Example #15
Source File: From CalibrationNN with GNU General Public License v3.0 | 6 votes |
def pca(self, **kwargs):
    """Build, fit and return a StandardScaler -> PCA pipeline.

    Keyword arguments
    -----------------
    n_components
        Forwarded to ``PCA``; defaults to 0.995.
    dates
        If given, forwarded to ``self.to_matrix``; otherwise
        ``self.to_matrix()`` is called without arguments.
    file
        If given, the pipeline is passed to ``tofile`` together with
        this value.

    Returns
    -------
    Pipeline
        The pipeline, which is also stored in ``self._pipeline``.
    """
    nComp = kwargs.get('n_components', 0.995)

    if 'dates' in kwargs:
        mat = self.to_matrix(kwargs['dates'])
    else:
        mat = self.to_matrix()

    scaler = StandardScaler()
    pca = PCA(n_components=nComp)
    self._pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
    # Fit on the data matrix; otherwise ``mat`` is unused and an unfitted
    # pipeline would be saved and returned.
    self._pipeline.fit(mat)

    if 'file' in kwargs:
        tofile(kwargs['file'], self._pipeline)

    return self._pipeline
Example #16
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_gridsearch_pipeline_precomputed():
    """Grid-search a KernelPCA(precomputed) -> Perceptron pipeline on circles."""
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model using a precomputed kernel.
    X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                        random_state=0)
    kpca = KernelPCA(kernel="precomputed", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca),
                         ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    # The pipeline expects a precomputed kernel matrix, not the raw samples.
    X_kernel = rbf_kernel(X, gamma=2.)
    grid_search.fit(X_kernel, y)
    assert_equal(grid_search.best_score_, 1)


# 0.23. warning about tol not having its correct default value.
Example #17
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cv_pipeline_precomputed():
    """Cross-validate a KernelCenterer -> SVR pipeline on a precomputed kernel."""
    # Cross-validate a regression on four coplanar points with the same
    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
    # is treated as a _pairwise operation.
    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    y_true = np.ones((4,))
    # Linear (Gram) kernel of the samples; this is the data handed to the CV.
    K = X.dot(X.T)
    kcent = KernelCenterer()
    pipeline = Pipeline([("kernel_centerer", kcent),
                         ("svr", SVR(gamma='scale'))])

    # did the pipeline set the _pairwise attribute?
    assert pipeline._pairwise

    # test cross-validation, score should be almost perfect
    # NB: this test is pretty vacuous -- it's mainly to test integration
    #     of Pipeline and KernelCenterer
    y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
    assert_array_almost_equal(y_true, y_pred)
Example #18
Source File: From interpret-text with MIT License | 6 votes |
def create_logistic_vectorizer():
    """Return an unfitted CountVectorizer -> LogisticRegression pipeline.

    The vectorizer is binary, case-preserving and uses ``min_df=0.0``; the
    classifier is seeded with ``random_state=777``.
    """
    steps = [
        ("vectorizer", CountVectorizer(lowercase=False, min_df=0.0, binary=True)),
        ("lr", LogisticRegression(random_state=777)),
    ]
    return Pipeline(steps)
Example #19
Source File: From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, n_neighbors=5, weights='uniform', leaf_size=30,
             metric='minkowski', normalize=True):
    """Create a KNeighborsRegressor, optionally preceded by a StandardScaler.

    When ``metric`` is the string ``'cosine'`` it is replaced by a callable
    that delegates to ``dist_utils._cosine_sim``. The resulting estimator
    (bare regressor, or a scaler -> regressor pipeline when ``normalize`` is
    truthy) is stored in ``self.model``.
    """
    knn_metric = (
        (lambda x, y: dist_utils._cosine_sim(x, y))
        if metric == 'cosine'
        else metric
    )
    regressor = sklearn.neighbors.KNeighborsRegressor(
        n_neighbors=n_neighbors,
        weights=weights,
        leaf_size=leaf_size,
        metric=knn_metric,
    )
    self.model = (
        Pipeline([('ss', StandardScaler()), ('knn', regressor)])
        if normalize
        else regressor
    )
Example #20
Source File: From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 5 votes |
def accuracy(features, labels):
    """Return the mean leave-one-out cross-validation score.

    A StandardScaler -> LogisticRegression pipeline is scored with
    ``cross_val_score`` using a leave-one-out splitter over ``features`` /
    ``labels``.
    """
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler

    # ``sklearn.cross_validation`` was removed in scikit-learn 0.20; prefer
    # ``model_selection`` and fall back to the old module on old installs.
    try:
        from sklearn.model_selection import LeaveOneOut, cross_val_score
    except ImportError:
        from sklearn.cross_validation import LeaveOneOut, cross_val_score
        # The legacy splitter needs the number of samples up front.
        cv = LeaveOneOut(len(features))
    else:
        cv = LeaveOneOut()

    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', LogisticRegression())])
    scores = cross_val_score(clf, features, labels, cv=cv)
    return scores.mean()
Example #21
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_grid_search_allows_nans():
    """GridSearchCV must accept NaNs in X when the pipeline imputes them."""
    # Test GridSearchCV with SimpleImputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan  # one all-NaN row
    y = [0, 0, 1, 1, 1]

    imputer = SimpleImputer(strategy='mean', missing_values=np.nan)
    pipe = Pipeline([
        ('imputer', imputer),
        ('classifier', MockClassifier()),
    ])
    param_grid = {'classifier__foo_param': [1, 2, 3]}

    search = GridSearchCV(pipe, param_grid, cv=2)
    search.fit(X, y)
Example #22
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ovr_pipeline():
    """One-vs-rest over a one-step pipeline must match one-vs-rest over the bare tree."""
    # Test with pipeline of length one
    # This test is needed because the multiclass estimators may fail to detect
    # the presence of predict_proba or decision_function.
    clf = Pipeline([("tree", DecisionTreeClassifier())])
    ovr_pipe = OneVsRestClassifier(clf)
    ovr_pipe.fit(iris.data, iris.target)
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    # NOTE(review): ``iris`` is presumably a module-level dataset -- confirm.
    assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
Example #23
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_countvectorizer_custom_vocabulary_pipeline():
    """A fixed vocabulary must survive a CountVectorizer -> TfidfTransformer pipeline."""
    what_we_like = ["pizza", "beer"]

    counter = CountVectorizer(vocabulary=what_we_like)
    pipe = Pipeline([('count', counter), ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)

    # The fitted vocabulary is exactly the one supplied ...
    fitted_vocabulary = set(pipe.named_steps['count'].vocabulary_)
    assert_equal(fitted_vocabulary, set(what_we_like))
    # ... and the output has one column per vocabulary entry.
    assert_equal(X.shape[1], len(what_we_like))
Example #24
Source File: From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_is_classifier():
    """is_classifier must see through GridSearchCV and Pipeline wrappers."""
    svc = SVC()
    candidates = [
        svc,
        GridSearchCV(svc, {'C': [0.1, 1]}),
        Pipeline([('svc', svc)]),
        Pipeline([('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]),
    ]
    for estimator in candidates:
        assert is_classifier(estimator)
Example #25
Source File: From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, epsilon=0.0, C=1.0, loss='epsilon_insensitive',
             random_state=None, normalize=True):
    """Create a LinearSVR, optionally preceded by a StandardScaler.

    The estimator (bare regressor, or a scaler -> regressor pipeline when
    ``normalize`` is truthy) is stored in ``self.model``.
    """
    regressor = sklearn.svm.LinearSVR(
        epsilon=epsilon,
        C=C,
        loss=loss,
        random_state=random_state,
    )
    self.model = (
        Pipeline([('ss', StandardScaler()), ('lsvr', regressor)])
        if normalize
        else regressor
    )
Example #26
Source File: From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, kernel='rbf', degree=3, gamma='auto', C=1.0,
             epsilon=0.1, normalize=True, cache_size=2048):
    """Create an SVR, optionally preceded by a StandardScaler.

    All keyword arguments except ``normalize`` are forwarded to
    ``sklearn.svm.SVR``. The estimator (bare regressor, or a
    scaler -> regressor pipeline when ``normalize`` is truthy) is stored in
    ``self.model``.
    """
    # Forward cache_size as well, so the argument is not silently ignored.
    svr = sklearn.svm.SVR(kernel=kernel, degree=degree,
                          gamma=gamma, C=C, epsilon=epsilon,
                          cache_size=cache_size)
    if normalize:
        self.model = Pipeline([('ss', StandardScaler()), ('svr', svr)])
    else:
        self.model = svr
Example #27
Source File: From gordo with GNU Affero General Public License v3.0 | 5 votes |
def loads(bytes_object: bytes) -> GordoBase:
    """
    Restore a model from the bytes produced by ``gordo.serializer.dumps``

    Parameters
    ----------
    bytes_object: bytes
        Serialized model, i.e. the result of `serializer.dumps(model)`

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
        Custom gordo model, scikit learn pipeline or other scikit learn like object.
    """
    # NOTE: unpickling can execute arbitrary code; only pass trusted bytes.
    model = pickle.loads(bytes_object)
    return model
Example #28
Source File: From gordo with GNU Affero General Public License v3.0 | 5 votes |
def dumps(model: Union[Pipeline, GordoBase]) -> bytes:
    """
    Serialize a model to bytes that ``gordo.serializer.loads`` can read back

    Parameters
    ----------
    model: Union[Pipeline, GordoBase]
        A gordo model/pipeline

    Returns
    -------
    bytes
        Pickled model which supports loading via ``serializer.loads()``

    Example
    -------
    >>> from gordo.machine.model.models import KerasAutoEncoder
    >>> from gordo import serializer
    >>>
    >>> model = KerasAutoEncoder('feedforward_symmetric')
    >>> serialized = serializer.dumps(model)
    >>> assert isinstance(serialized, bytes)
    >>>
    >>> model_clone = serializer.loads(serialized)
    >>> assert isinstance(model_clone, KerasAutoEncoder)
    """
    serialized = pickle.dumps(model)
    return serialized
Example #29
Source File: From gordo with GNU Affero General Public License v3.0 | 5 votes |
def _build_scikit_branch( definition: Iterable[Union[str, Dict[Any, Any]]], constructor_class=Union[Pipeline, None], ): """ Exactly like :func:`~_build_branch` except it's expected this is going to be a list of tuples, where the 0th element is the name of the step. """ steps = [(f"step_{i}", _build_step(step)) for i, step in enumerate(definition)] return steps if constructor_class is None else constructor_class(steps)
Example #30
Source File: From gordo with GNU Affero General Public License v3.0 | 5 votes |
def get_model_output(model: Pipeline, X: np.ndarray) -> np.ndarray:
    """
    Return the raw output of ``model`` for ``X``.

    ``predict`` is tried first; if that raises ``AttributeError`` the model is
    treated as a transformer and ``transform`` is used instead. A failure of
    ``transform`` is logged and re-raised.

    Parameters
    ----------
    X: np.ndarray
        2d array of sample(s)

    Returns
    -------
    np.ndarray
        The raw output of the model in numpy array form.
    """
    try:
        output = model.predict(X)  # type: ignore
    except AttributeError:
        # Model may only be a transformer
        try:
            output = model.transform(X)  # type: ignore
        except Exception as exc:
            logger.error(f"Failed to predict or transform; error: {exc}")
            raise
    return output