Python sklearn.pipeline.Pipeline() Examples
The following are 30 code examples of sklearn.pipeline.Pipeline(), collected from open-source projects.
Each example is attributed to its original project and source file, so you can follow the link above each example to see it in context.
You may also want to check out the other available functions and classes of the sklearn.pipeline module.
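Before the collected examples, here is a minimal sketch of the basic pattern they all share: a Pipeline chains named (name, estimator) steps, the last of which is usually a predictor. The dataset and step names below are illustrative only and are not taken from any of the listed projects.

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

X, y = load_iris(return_X_y=True)

# Each step is a (name, estimator) tuple; names are used for nested parameter
# access (e.g. clf__C in a grid search) and for lookup via pipe.named_steps.
pipe = Pipeline([
    ('scaler', StandardScaler()),
    ('clf', LogisticRegression(max_iter=200)),
])
pipe.fit(X, y)
print(pipe.score(X, y))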
Example #1
Source File: create_ngrams.py From rasa_lookup_demo with Apache License 2.0 | 8 votes |
def run_logreg(X_train, y_train, selection_threshold=0.2):
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))
    pipe = Pipeline(
        [
            (
                "feature_selection",
                RandomizedLogisticRegression(selection_threshold=selection_threshold),
            ),
            ("classification", LogisticRegression()),
        ]
    )
    pipe.fit(X_train, y_train)
    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
    # Note: X_test and y_test are not parameters of this function; the original
    # source relies on them being defined at module level.
    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
    return pipe
Example #2
Source File: test_model.py From gordo with GNU Affero General Public License v3.0 | 7 votes |
def test_keras_autoencoder_scoring(model, kind, n_features_out):
    """
    Test the KerasAutoEncoder and KerasLSTMAutoEncoder have a working scoring function
    """
    Model = pydoc.locate(f"gordo.machine.model.models.{model}")
    model = Pipeline([("model", Model(kind=kind))])

    X = np.random.random((8, 2))

    # Should be able to deal with y output different than X input features
    y = np.random.random((8, n_features_out))

    with pytest.raises(NotFittedError):
        model.score(X, y)

    model.fit(X, y)
    score = model.score(X, y)
    logger.info(f"Score: {score:.4f}")
Example #3
Source File: test_stability_selection.py From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_stability_selection_regression():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    assert_almost_equal(important_betas, chosen_betas)
Example #4
Source File: test_stability_selection.py From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_with_complementary_pairs_bootstrap():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid,
                                  bootstrap_func='complementary_pairs')
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)
    assert_almost_equal(important_betas, chosen_betas)
Example #5
Source File: test_stability_selection.py From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_different_shape():
    n, p, k = 100, 200, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)
    selector.transform(X[:, :-2])
Example #6
Source File: test_stability_selection.py From stability-selection with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_no_features():
    n, p, k = 100, 200, 0

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)
    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    assert_almost_equal(selector.transform(X),
                        np.empty(0).reshape((X.shape[0], 0)))
Example #7
Source File: data_preparation.py From healthcareai-py with MIT License | 6 votes |
def full_pipeline(model_type, predicted_column, grain_column, impute=True, verbose=True,
                  imputeStrategy='MeanMode', tunedRandomForest=False,
                  numeric_columns_as_categorical=None):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and
    filters to clean and prepare the data.

    Note: advanced users may wish to use their own custom pipeline.
    """
    # Note: this could be done more elegantly using FeatureUnions _if_ you are not using pandas dataframes for
    # inputs of the later pipelines as FeatureUnion intrinsically converts outputs to numpy arrays.
    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(
            impute=impute,
            verbose=verbose,
            imputeStrategy=imputeStrategy,
            tunedRandomForest=tunedRandomForest,
            numeric_columns_as_categorical=numeric_columns_as_categorical)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])
    return pipeline
Example #8
Source File: test_impute.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    X = sparse_random_matrix(100, 100, density=0.10)
    missing_values = X.data[0]

    pipeline = Pipeline([('imputer',
                          SimpleImputer(missing_values=missing_values)),
                         ('tree',
                          tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"]
    }

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y)
Example #9
Source File: 02_fit_predict_plot_employee_salaries.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])
    return pipeline


#########################################################################
# Fitting each encoding method with a RidgeCV
# --------------------------------------------
# Eventually, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score (see the sketch below):
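The loop referenced in the closing comment is not part of this snippet. Below is a minimal sketch of what it might look like; the names encoding_methods, df (the feature DataFrame), and y (the target) are assumptions for illustration and are not taken from the original file.

from sklearn.model_selection import cross_val_score

all_scores = {}
# encoding_methods, df and y are assumed to be defined earlier in the script.
for method in encoding_methods:
    pipeline = make_pipeline(method)
    # cross-validate the whole pipeline, encoding included, on the raw data
    scores = cross_val_score(pipeline, df, y, cv=5)
    all_scores[method] = scores.mean()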
Example #10
Source File: 03_fit_predict_plot_midwest_survey.py From dirty_cat with BSD 3-Clause "New" or "Revised" License | 6 votes |
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])
    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over encoding methods, scoring the different pipeline predictions
# using a cross-validation score:
Example #11
Source File: 04_sent.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 6 votes |
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        # The original (Python 2) code used re_repl.iteritems();
        # .items() is the Python 3 equivalent.
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Example #12
Source File: test_base.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_set_params_passes_all_parameters():
    # Make sure all parameters are passed together to set_params
    # of nested estimator. Regression test for #9944
    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}
    for est in [Pipeline([('estimator', TestDecisionTree())]),
                GridSearchCV(TestDecisionTree(), {})]:
        est.set_params(estimator__max_depth=5,
                       estimator__min_samples_leaf=2)
Example #13
Source File: serializer.py From gordo with GNU Affero General Public License v3.0 | 6 votes |
def load(source_dir: Union[os.PathLike, str]) -> Any:
    """
    Load an object from a directory, saved by
    ``gordo.serializer.pipeline_serializer.dump``

    This takes a directory, which is either top-level, meaning it contains
    a sub directory in the naming scheme:
    "n_step=<int>-class=<path.to.Class>"
    or the aforementioned naming scheme directory directly. Will return
    that deserialized object.

    Parameters
    ----------
    source_dir: Union[os.PathLike, str]
        Location of the top level dir the pipeline was saved

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
    """
    # This source dir should have a single pipeline entry directory.
    # may have been passed a top level dir, containing such an entry:
    with open(os.path.join(source_dir, "model.pkl"), "rb") as f:
        return pickle.load(f)
Example #14
Source File: model.py From fake-news-detection with MIT License | 6 votes |
def run(self):
    '''
    Runs a model with params p.
    '''
    self.clf.set_params(**self.params)

    # f = get_feature_transformer(self.parser)
    # self.X_train_fts = f.fit_transform(self.X_train)
    # self.X_test_fts = f.transform(self.X_test)

    self.pipeline = Pipeline([
        # ('feature_gen', f),
        ('clf', self.clf),
    ])
    self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]

    if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
        self.importances = self.clf.feature_importances_
    elif self.model_type in ['SVM', 'LR', 'SGD']:
        self.importances = self.clf.coef_[0]
Example #15
Source File: data_utils.py From CalibrationNN with GNU General Public License v3.0 | 6 votes |
def pca(self, **kwargs):
    if 'n_components' in kwargs:
        nComp = kwargs['n_components']
    else:
        nComp = 0.995

    if 'dates' in kwargs:
        mat = self.to_matrix(kwargs['dates'])
    else:
        mat = self.to_matrix()

    scaler = StandardScaler()
    pca = PCA(n_components=nComp)
    self._pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
    self._pipeline.fit(mat)

    if 'file' in kwargs:
        tofile(kwargs['file'], self._pipeline)

    return self._pipeline
Example #16
Source File: test_kernel_pca.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_gridsearch_pipeline_precomputed():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model using a precomputed kernel.
    X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                        random_state=0)
    kpca = KernelPCA(kernel="precomputed", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca),
                         ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    X_kernel = rbf_kernel(X, gamma=2.)
    grid_search.fit(X_kernel, y)
    assert_equal(grid_search.best_score_, 1)

# 0.23. warning about tol not having its correct default value.
Example #17
Source File: test_data.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cv_pipeline_precomputed():
    # Cross-validate a regression on four coplanar points with the same
    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
    # is treated as a _pairwise operation.
    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    y_true = np.ones((4,))
    K = X.dot(X.T)
    kcent = KernelCenterer()
    pipeline = Pipeline([("kernel_centerer", kcent),
                         ("svr", SVR(gamma='scale'))])

    # did the pipeline set the _pairwise attribute?
    assert pipeline._pairwise

    # test cross-validation, score should be almost perfect
    # NB: this test is pretty vacuous -- it's mainly to test integration
    #     of Pipeline and KernelCenterer
    y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
    assert_array_almost_equal(y_true, y_pred)
Example #18
Source File: common_utils.py From interpret-text with MIT License | 6 votes |
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)])
Example #19
Source File: skl_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, n_neighbors=5, weights='uniform', leaf_size=30,
             metric='minkowski', normalize=True):
    if metric == 'cosine':
        metric = lambda x, y: dist_utils._cosine_sim(x, y)
    knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors,
                                                weights=weights,
                                                leaf_size=leaf_size,
                                                metric=metric)
    if normalize:
        self.model = Pipeline([('ss', StandardScaler()), ('knn', knn)])
    else:
        self.model = knn
Example #20
Source File: image-classification.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 5 votes |
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    # Note: sklearn.cross_validation was removed in scikit-learn 0.20;
    # sklearn.model_selection provides the equivalent functionality.
    from sklearn import cross_validation

    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                    ('classifier', LogisticRegression())])
    cv = cross_validation.LeaveOneOut(len(features))
    scores = cross_validation.cross_val_score(
        clf, features, labels, cv=cv)
    return scores.mean()
Example #21
Source File: test_search.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_grid_search_allows_nans():
    # Test GridSearchCV with SimpleImputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y)
Example #22
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_ovr_pipeline():
    # Test with pipeline of length one
    # This test is needed because the multiclass estimators may fail to detect
    # the presence of predict_proba or decision_function.
    clf = Pipeline([("tree", DecisionTreeClassifier())])
    ovr_pipe = OneVsRestClassifier(clf)
    ovr_pipe.fit(iris.data, iris.target)
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data))
Example #23
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ('count', CountVectorizer(vocabulary=what_we_like)),
        ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert_equal(set(pipe.named_steps['count'].vocabulary_),
                 set(what_we_like))
    assert_equal(X.shape[1], len(what_we_like))
Example #24
Source File: test_base.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))]))
Example #25
Source File: skl_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, epsilon=0.0, C=1.0, loss='epsilon_insensitive',
             random_state=None, normalize=True):
    lsvr = sklearn.svm.LinearSVR(epsilon=epsilon, C=C, loss=loss,
                                 random_state=random_state)
    if normalize:
        self.model = Pipeline([('ss', StandardScaler()), ('lsvr', lsvr)])
    else:
        self.model = lsvr
Example #26
Source File: skl_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def __init__(self, kernel='rbf', degree=3, gamma='auto', C=1.0, epsilon=0.1,
             normalize=True, cache_size=2048):
    svr = sklearn.svm.SVR(kernel=kernel, degree=degree, gamma=gamma,
                          C=C, epsilon=epsilon)
    if normalize:
        self.model = Pipeline([('ss', StandardScaler()), ('svr', svr)])
    else:
        self.model = svr
Example #27
Source File: serializer.py From gordo with GNU Affero General Public License v3.0 | 5 votes |
def loads(bytes_object: bytes) -> GordoBase:
    """
    Load a GordoBase model from bytes dumped from ``gordo.serializer.dumps``

    Parameters
    ----------
    bytes_object: bytes
        Bytes to be loaded, should be the result of `serializer.dumps(model)`

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
        Custom gordo model, scikit learn pipeline or other scikit learn like object.
    """
    return pickle.loads(bytes_object)
Example #28
Source File: serializer.py From gordo with GNU Affero General Public License v3.0 | 5 votes |
def dumps(model: Union[Pipeline, GordoBase]) -> bytes:
    """
    Dump a model into a bytes representation suitable for loading from
    ``gordo.serializer.loads``

    Parameters
    ----------
    model: Union[Pipeline, GordoBase]
        A gordo model/pipeline

    Returns
    -------
    bytes
        Serialized model which supports loading via ``serializer.loads()``

    Example
    -------
    >>> from gordo.machine.model.models import KerasAutoEncoder
    >>> from gordo import serializer
    >>>
    >>> model = KerasAutoEncoder('feedforward_symmetric')
    >>> serialized = serializer.dumps(model)
    >>> assert isinstance(serialized, bytes)
    >>>
    >>> model_clone = serializer.loads(serialized)
    >>> assert isinstance(model_clone, KerasAutoEncoder)
    """
    return pickle.dumps(model)
Example #29
Source File: from_definition.py From gordo with GNU Affero General Public License v3.0 | 5 votes |
def _build_scikit_branch(
    definition: Iterable[Union[str, Dict[Any, Any]]],
    constructor_class=Union[Pipeline, None],
):
    """
    Exactly like :func:`~_build_branch` except it's expected this is going
    to be a list of tuples, where the 0th element is the name of the step.
    """
    steps = [(f"step_{i}", _build_step(step)) for i, step in enumerate(definition)]
    return steps if constructor_class is None else constructor_class(steps)
Example #30
Source File: model_io.py From gordo with GNU Affero General Public License v3.0 | 5 votes |
def get_model_output(model: Pipeline, X: np.ndarray) -> np.ndarray:
    """
    Get the raw output from the current model given X.
    Will try to ``predict`` and then ``transform``,
    raising an error if both fail.

    Parameters
    ----------
    X: np.ndarray
        2d array of sample(s)

    Returns
    -------
    np.ndarray
        The raw output of the model in numpy array form.
    """
    try:
        return model.predict(X)  # type: ignore

    # Model may only be a transformer
    except AttributeError:
        try:
            return model.transform(X)  # type: ignore
        except Exception as exc:
            logger.error(f"Failed to predict or transform; error: {exc}")
            raise