Python sklearn.pipeline.Pipeline() Examples

The following are 30 code examples of sklearn.pipeline.Pipeline(), drawn from open-source projects. Each example lists its source file, project, and license above the code. You may also want to check out all available functions and classes of the sklearn.pipeline module.
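Before diving into the project code, here is a minimal, self-contained sketch of the common pattern the examples below share (synthetic data and standard estimators chosen purely for illustration):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Each step is a (name, estimator) tuple; every step except the last must be a transformer.
pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
pipe.fit(X, y)            # fit_transforms through the scaler, then fits the classifier
print(pipe.score(X, y))   # score() delegates to the final estimator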
Example #1
Source File: create_ngrams.py    From rasa_lookup_demo with Apache License 2.0
def run_logreg(X_train, y_train, X_test, y_test, selection_threshold=0.2):
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))
    pipe = Pipeline(
        [
            (
                "feature_selection",
                RandomizedLogisticRegression(selection_threshold=selection_threshold),
            ),
            ("classification", LogisticRegression()),
        ]
    )
    pipe.fit(X_train, y_train)
    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
    return pipe 
Example #2
Source File: test_model.py    From gordo with GNU Affero General Public License v3.0
def test_keras_autoencoder_scoring(model, kind, n_features_out):
    """
    Test the KerasAutoEncoder and KerasLSTMAutoEncoder have a working scoring function
    """
    Model = pydoc.locate(f"gordo.machine.model.models.{model}")
    model = Pipeline([("model", Model(kind=kind))])
    X = np.random.random((8, 2))

    # Should be able to deal with y output different than X input features
    y = np.random.random((8, n_features_out))

    with pytest.raises(NotFittedError):
        model.score(X, y)

    model.fit(X, y)
    score = model.score(X, y)
    logger.info(f"Score: {score:.4f}") 
Example #3
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_stability_selection_regression():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas) 
Example #4
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_with_complementary_pairs_bootstrap():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid,
                                  bootstrap_func='complementary_pairs')
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas) 
Example #5
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_different_shape():
    n, p, k = 100, 200, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)
    selector.transform(X[:, :-2]) 
Example #6
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_no_features():
    n, p, k = 100, 200, 0

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    assert_almost_equal(selector.transform(X),
                        np.empty(0).reshape((X.shape[0], 0))) 
Example #7
Source File: data_preparation.py    From healthcareai-py with MIT License
def full_pipeline(model_type, predicted_column, grain_column, impute=True,
                  verbose=True, imputeStrategy='MeanMode', tunedRandomForest=False,
                  numeric_columns_as_categorical=None):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and filters to clean and prepare the data.
    
    Note: advanced users may wish to use their own custom pipeline.
    """

    # Note: this could be done more elegantly using FeatureUnion _if_ you are not using pandas
    #   dataframes as inputs to the later pipelines, since FeatureUnion intrinsically converts
    #   its outputs to numpy arrays.
    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(
            impute=impute, verbose=verbose, imputeStrategy=imputeStrategy,
            tunedRandomForest=tunedRandomForest,
            numeric_columns_as_categorical=numeric_columns_as_categorical)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])
    return pipeline 
Example #8
Source File: test_impute.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    X = sparse_random_matrix(100, 100, density=0.10)
    missing_values = X.data[0]

    pipeline = Pipeline([('imputer',
                          SimpleImputer(missing_values=missing_values)),
                         ('tree',
                          tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"]
    }

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y) 
Example #9
Source File: 02_fit_predict_plot_employee_salaries.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])
    return pipeline


#########################################################################
# Fitting each encoding method with a RidgeCV
# --------------------------------------------
# Finally, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score:
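A minimal sketch of what that loop might look like (not part of the original example file; `encoding_methods`, `X`, and `y` are assumed names for variables defined earlier in the script):

from sklearn.model_selection import cross_val_score

all_scores = {}
for method in encoding_methods:  # assumed, e.g. ['one-hot', 'similarity', 'target']
    pipeline = make_pipeline(method)
    # one array of per-fold scores per encoding method (R^2 by default for RidgeCV)
    all_scores[method] = cross_val_score(pipeline, X, y, cv=5)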
Example #10
Source File: 03_fit_predict_plot_midwest_survey.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])

    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over the encoding methods, scoring each pipeline's
# predictions with cross-validation:
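A hedged sketch of that evaluation loop (again assuming `encoding_methods`, `X`, and `y` are defined earlier in the script; they are not shown in the snippet above):

from sklearn.model_selection import cross_val_score

for method in encoding_methods:  # assumed list of encoder names
    pipeline = make_pipeline(method)
    scores = cross_val_score(pipeline, X, y, cv=5)  # accuracy by default for classifiers
    print('%s: mean accuracy %.3f (+/- %.3f)' % (method, scores.mean(), scores.std()))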
Example #11
Source File: 04_sent.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline 
Example #12
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_set_params_passes_all_parameters():
    # Make sure all parameters are passed together to set_params
    # of nested estimator. Regression test for #9944

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}
    for est in [Pipeline([('estimator', TestDecisionTree())]),
                GridSearchCV(TestDecisionTree(), {})]:
        est.set_params(estimator__max_depth=5,
                       estimator__min_samples_leaf=2) 
Example #13
Source File: serializer.py    From gordo with GNU Affero General Public License v3.0
def load(source_dir: Union[os.PathLike, str]) -> Any:
    """
    Load an object from a directory, saved by
    ``gordo.serializer.pipeline_serializer.dump``

    This takes a directory which is either top-level, meaning it contains
    a sub directory in the naming scheme "n_step=<int>-class=<path.to.Class>",
    or is that naming-scheme directory itself. Returns the deserialized
    object.


    Parameters
    ----------
    source_dir: Union[os.PathLike, str]
        Location of the top-level directory in which the pipeline was saved

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
    """
    # This source dir should have a single pipeline entry directory.
    # may have been passed a top level dir, containing such an entry:
    with open(os.path.join(source_dir, "model.pkl"), "rb") as f:
        return pickle.load(f) 
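A hypothetical usage sketch (the path is illustrative; the directory only needs to contain the pickled model.pkl):

model = load("./model_dir")  # returns whatever object was dumped, e.g. a fitted Pipeline
preds = model.predict(X)     # assuming the deserialized object is a fitted estimator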
Example #14
Source File: model.py    From fake-news-detection with MIT License
def run(self):
        '''
        Runs a model with params p.
        '''
        self.clf.set_params(**self.params)
        # f = get_feature_transformer(self.parser)
        # self.X_train_fts = f.fit_transform(self.X_train)
        # self.X_test_fts = f.transform(self.X_test)
        self.pipeline = Pipeline([
            # ('feature_gen', f),
            ('clf', self.clf),
        ])
        self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]
        if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
            self.importances = self.clf.feature_importances_
        elif self.model_type in ['SVM', 'LR', 'SGD']:
            self.importances = self.clf.coef_[0] 
Example #15
Source File: data_utils.py    From CalibrationNN with GNU General Public License v3.0
def pca(self, **kwargs):
        if 'n_components' in kwargs:
            nComp = kwargs['n_components']
        else:
            nComp = 0.995

        if 'dates' in kwargs:
            mat = self.to_matrix(kwargs['dates'])
        else:
            mat = self.to_matrix()
        scaler = StandardScaler()
        pca = PCA(n_components=nComp)
        self._pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
        self._pipeline.fit(mat)
        
        if 'file' in kwargs:
            tofile(kwargs['file'], self._pipeline)
        
        return self._pipeline 
Example #16
Source File: test_kernel_pca.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_gridsearch_pipeline_precomputed():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model using a precomputed kernel.
    X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                        random_state=0)
    kpca = KernelPCA(kernel="precomputed", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca),
                         ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    X_kernel = rbf_kernel(X, gamma=2.)
    grid_search.fit(X_kernel, y)
    assert_equal(grid_search.best_score_, 1)
Example #17
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cv_pipeline_precomputed():
    # Cross-validate a regression on four coplanar points with the same
    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
    # is treated as a _pairwise operation.
    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    y_true = np.ones((4,))
    K = X.dot(X.T)
    kcent = KernelCenterer()
    pipeline = Pipeline([("kernel_centerer", kcent), ("svr",
                        SVR(gamma='scale'))])

    # did the pipeline set the _pairwise attribute?
    assert pipeline._pairwise

    # test cross-validation, score should be almost perfect
    # NB: this test is pretty vacuous -- it's mainly to test integration
    #     of Pipeline and KernelCenterer
    y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
    assert_array_almost_equal(y_true, y_pred) 
Example #18
Source File: common_utils.py    From interpret-text with MIT License
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)]) 
Example #19
Source File: skl_utils.py    From kaggle-HomeDepot with MIT License
def __init__(self, n_neighbors=5, weights='uniform', leaf_size=30, 
                metric='minkowski', normalize=True):
        if metric == 'cosine':
            metric = lambda x,y: dist_utils._cosine_sim(x, y)
        knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, 
            leaf_size=leaf_size, metric=metric)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('knn', knn)])
        else:
            self.model = knn 
Example #20
Source File: image-classification.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import LeaveOneOut, cross_val_score
    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                ('classifier', LogisticRegression())])
    cv = LeaveOneOut()
    scores = cross_val_score(
        clf, features, labels, cv=cv)
    return scores.mean() 
Example #21
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_grid_search_allows_nans():
    # Test GridSearchCV with SimpleImputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y) 
Example #22
Source File: test_multiclass.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_ovr_pipeline():
    # Test with pipeline of length one
    # This test is needed because the multiclass estimators may fail to detect
    # the presence of predict_proba or decision_function.
    clf = Pipeline([("tree", DecisionTreeClassifier())])
    ovr_pipe = OneVsRestClassifier(clf)
    ovr_pipe.fit(iris.data, iris.target)
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data)) 
Example #23
Source File: test_text.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ('count', CountVectorizer(vocabulary=what_we_like)),
        ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert_equal(set(pipe.named_steps['count'].vocabulary_),
                 set(what_we_like))
    assert_equal(X.shape[1], len(what_we_like)) 
Example #24
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))])) 
Example #25
Source File: skl_utils.py    From kaggle-HomeDepot with MIT License
def __init__(self, epsilon=0.0, C=1.0, loss='epsilon_insensitive', 
                random_state=None, normalize=True):
        lsvr = sklearn.svm.LinearSVR(epsilon=epsilon, C=C, 
                    loss=loss, random_state=random_state)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('lsvr', lsvr)])
        else:
            self.model = lsvr 
Example #26
Source File: skl_utils.py    From kaggle-HomeDepot with MIT License
def __init__(self, kernel='rbf', degree=3, gamma='auto', C=1.0, 
                epsilon=0.1, normalize=True, cache_size=2048):
        svr = sklearn.svm.SVR(kernel=kernel, degree=degree, 
                            gamma=gamma, C=C, epsilon=epsilon)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('svr', svr)])
        else:
            self.model = svr 
Example #27
Source File: serializer.py    From gordo with GNU Affero General Public License v3.0
def loads(bytes_object: bytes) -> GordoBase:
    """
    Load a GordoBase model from bytes dumped from ``gordo.serializer.dumps``

    Parameters
    ----------
    bytes_object: bytes
        Bytes to be loaded, should be the result of `serializer.dumps(model)`

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
        Custom gordo model, scikit learn pipeline or other scikit learn like object.
    """
    return pickle.loads(bytes_object) 
Example #28
Source File: serializer.py    From gordo with GNU Affero General Public License v3.0
def dumps(model: Union[Pipeline, GordoBase]) -> bytes:
    """
    Dump a model into a bytes representation suitable for loading from
    ``gordo.serializer.loads``

    Parameters
    ----------
    model: Union[Pipeline, GordoBase]
        A gordo model/pipeline

    Returns
    -------
    bytes
        Serialized model which supports loading via ``serializer.loads()``

    Example
    -------
    >>> from gordo.machine.model.models import KerasAutoEncoder
    >>> from gordo import serializer
    >>>
    >>> model = KerasAutoEncoder('feedforward_symmetric')
    >>> serialized = serializer.dumps(model)
    >>> assert isinstance(serialized, bytes)
    >>>
    >>> model_clone = serializer.loads(serialized)
    >>> assert isinstance(model_clone, KerasAutoEncoder)
    """
    return pickle.dumps(model) 
Example #29
Source File: from_definition.py    From gordo with GNU Affero General Public License v3.0
def _build_scikit_branch(
    definition: Iterable[Union[str, Dict[Any, Any]]],
    constructor_class: Union[Type[Pipeline], None] = Pipeline,
):
    """
    Exactly like :func:`~_build_branch` except it's expected this is going to
    be a list of tuples, where the 0th element is the name of the step.
    """
    steps = [(f"step_{i}", _build_step(step)) for i, step in enumerate(definition)]
    return steps if constructor_class is None else constructor_class(steps) 
Example #30
Source File: model_io.py    From gordo with GNU Affero General Public License v3.0
def get_model_output(model: Pipeline, X: np.ndarray) -> np.ndarray:
    """
    Get the raw output from the current model given X.
    Will try to `predict` and then `transform`, raising an error
    if both fail.

    Parameters
    ----------
    model: Pipeline
        Fitted model or pipeline from which to get the output.
    X: np.ndarray
        2d array of sample(s)

    Returns
    -------
    np.ndarray
        The raw output of the model in numpy array form.
    """
    try:
        return model.predict(X)  # type: ignore

    # Model may only be a transformer
    except AttributeError:
        try:
            return model.transform(X)  # type: ignore
        except Exception as exc:
            logger.error(f"Failed to predict or transform; error: {exc}")
            raise