Python sklearn.pipeline.Pipeline() Examples

The following are 30 code examples of sklearn.pipeline.Pipeline(), drawn from open-source projects. Each example lists its source file, project, and license above the code. You may also want to check out all available functions and classes of the sklearn.pipeline module.
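Before diving into the project code, here is a minimal, self-contained sketch of the common pattern the examples below share (synthetic data and standard estimators chosen purely for illustration):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler

# Each step is a (name, estimator) tuple; every step except the last must be a transformer.
pipe = Pipeline([("scaler", StandardScaler()), ("clf", LogisticRegression())])

X, y = make_classification(n_samples=100, n_features=5, random_state=0)
pipe.fit(X, y)            # fit_transforms through the scaler, then fits the classifier
print(pipe.score(X, y))   # score() delegates to the final estimator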
Example #1
Source File: create_ngrams.py    From rasa_lookup_demo with Apache License 2.0
def run_logreg(X_train, y_train, X_test, y_test, selection_threshold=0.2):
    print("\nrunning logistic regression...")
    print("using a selection threshold of {}".format(selection_threshold))
    pipe = Pipeline(
        [
            (
                "feature_selection",
                RandomizedLogisticRegression(selection_threshold=selection_threshold),
            ),
            ("classification", LogisticRegression()),
        ]
    )
    pipe.fit(X_train, y_train)
    print("training accuracy : {}".format(pipe.score(X_train, y_train)))
    print("testing accuracy : {}".format(pipe.score(X_test, y_test)))
    return pipe 
Example #2
Source File: test_model.py    From gordo with GNU Affero General Public License v3.0
def test_keras_autoencoder_scoring(model, kind, n_features_out):
    """
    Test the KerasAutoEncoder and KerasLSTMAutoEncoder have a working scoring function
    """
    Model = pydoc.locate(f"gordo.machine.model.models.{model}")
    model = Pipeline([("model", Model(kind=kind))])
    X = np.random.random((8, 2))

    # Should be able to deal with y output different than X input features
    y = np.random.random((8, n_features_out))

    with pytest.raises(NotFittedError):
        model.score(X, y)

    model.fit(X, y)
    score = model.score(X, y)
    logger.info(f"Score: {score:.4f}") 
Example #3
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_stability_selection_regression():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas) 
Example #4
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_with_complementary_pairs_bootstrap():
    n, p, k = 500, 1000, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid,
                                  bootstrap_func='complementary_pairs')
    selector.fit(X, y)

    chosen_betas = selector.get_support(indices=True)

    assert_almost_equal(important_betas, chosen_betas) 
Example #5
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_different_shape():
    n, p, k = 100, 200, 5

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)
    selector.transform(X[:, :-2]) 
Example #6
Source File: test_stability_selection.py    From stability-selection with BSD 3-Clause "New" or "Revised" License
def test_no_features():
    n, p, k = 100, 200, 0

    X, y, important_betas = _generate_dummy_regression_data(n=n, k=k)

    base_estimator = Pipeline([
        ('scaler', StandardScaler()),
        ('model', Lasso())
    ])

    lambdas_grid = np.logspace(-1, 1, num=10)

    selector = StabilitySelection(base_estimator=base_estimator,
                                  lambda_name='model__alpha',
                                  lambda_grid=lambdas_grid)
    selector.fit(X, y)

    assert_almost_equal(selector.transform(X),
                        np.empty(0).reshape((X.shape[0], 0))) 
Example #7
Source File: data_preparation.py    From healthcareai-py with MIT License
def full_pipeline(model_type, predicted_column, grain_column, impute=True,
                  verbose=True, imputeStrategy='MeanMode', tunedRandomForest=False,
                  numeric_columns_as_categorical=None):
    """
    Builds the data preparation pipeline. Sequentially runs transformers and filters to clean and prepare the data.
    
    Note: advanced users may wish to use their own custom pipeline.
    """

    # Note: this could be done more elegantly using FeatureUnion _if_ you are not using pandas
    #   dataframes as inputs to the later pipelines, since FeatureUnion intrinsically converts
    #   its outputs to numpy arrays.
    pipeline = Pipeline([
        ('remove_DTS_columns', hcai_filters.DataframeColumnSuffixFilter()),
        ('remove_grain_column', hcai_filters.DataframeColumnRemover(grain_column)),
        # Perform one of two basic imputation methods
        # TODO we need to think about making this optional to solve the problem of rare and very predictive values
        ('imputation', hcai_transformers.DataFrameImputer(
            impute=impute, verbose=verbose, imputeStrategy=imputeStrategy,
            tunedRandomForest=tunedRandomForest,
            numeric_columns_as_categorical=numeric_columns_as_categorical)),
        ('null_row_filter', hcai_filters.DataframeNullValueFilter(excluded_columns=None)),
        ('convert_target_to_binary', hcai_transformers.DataFrameConvertTargetToBinary(model_type, predicted_column)),
        ('prediction_to_numeric', hcai_transformers.DataFrameConvertColumnToNumeric(predicted_column)),
        ('create_dummy_variables', hcai_transformers.DataFrameCreateDummyVariables(excluded_columns=[predicted_column])),
    ])
    return pipeline 
Example #8
Source File: test_impute.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_imputation_pipeline_grid_search():
    # Test imputation within a pipeline + gridsearch.
    X = sparse_random_matrix(100, 100, density=0.10)
    missing_values = X.data[0]

    pipeline = Pipeline([('imputer',
                          SimpleImputer(missing_values=missing_values)),
                         ('tree',
                          tree.DecisionTreeRegressor(random_state=0))])

    parameters = {
        'imputer__strategy': ["mean", "median", "most_frequent"]
    }

    Y = sparse_random_matrix(100, 1, density=0.10).toarray()
    gs = GridSearchCV(pipeline, parameters)
    gs.fit(X, Y) 
Example #9
Source File: 02_fit_predict_plot_employee_salaries.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [(enc + '_' + col, encoders_dict[enc], [col])
                    for col, enc in clean_columns.items()]
    # adding the encoded column
    transformers += [(encoding_method, encoders_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('clf', RidgeCV())
    ])
    return pipeline


#########################################################################
# Fitting each encoding method with a RidgeCV
# --------------------------------------------
# Finally, we loop over the different encoding methods,
# instantiate a new pipeline each time, fit it,
# and store the returned cross-validation score:
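A minimal sketch of what that loop might look like (not part of the original example file; `encoding_methods`, `X`, and `y` are assumed names for variables defined earlier in the script):

from sklearn.model_selection import cross_val_score

all_scores = {}
for method in encoding_methods:  # assumed, e.g. ['one-hot', 'similarity', 'target']
    pipeline = make_pipeline(method)
    # one array of per-fold scores per encoding method (R^2 by default for RidgeCV)
    all_scores[method] = cross_val_score(pipeline, X, y, cv=5)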
Example #10
Source File: 03_fit_predict_plot_midwest_survey.py    From dirty_cat with BSD 3-Clause "New" or "Revised" License
def make_pipeline(encoding_method):
    # static transformers from the other columns
    transformers = [('one-hot-clean', encoder_dict['one-hot'], clean_columns)]
    # adding the encoded column
    transformers += [(encoding_method + '-dirty', encoder_dict[encoding_method],
                      [dirty_column])]
    pipeline = Pipeline([
        # Use ColumnTransformer to combine the features
        ('union', ColumnTransformer(
            transformers=transformers,
            remainder='drop')),
        ('scaler', StandardScaler(with_mean=False)),
        ('classifier', RandomForestClassifier(random_state=5))
    ])

    return pipeline


###############################################################################
# Evaluation of different encoding methods
# -----------------------------------------
# We then loop over the encoding methods, scoring each pipeline's
# predictions with cross-validation:
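A hedged sketch of that evaluation loop (again assuming `encoding_methods`, `X`, and `y` are defined earlier in the script; they are not shown in the snippet above):

from sklearn.model_selection import cross_val_score

for method in encoding_methods:  # assumed list of encoder names
    pipeline = make_pipeline(method)
    scores = cross_val_score(pipeline, X, y, cv=5)  # accuracy by default for classifiers
    print('%s: mean accuracy %.3f (+/- %.3f)' % (method, scores.mean(), scores.std()))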
Example #11
Source File: 04_sent.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()

        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():
            tweet = re.sub(r, repl, tweet)

        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    #all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline 
Example #12
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_set_params_passes_all_parameters():
    # Make sure all parameters are passed together to set_params
    # of nested estimator. Regression test for #9944

    class TestDecisionTree(DecisionTreeClassifier):
        def set_params(self, **kwargs):
            super().set_params(**kwargs)
            # expected_kwargs is in test scope
            assert kwargs == expected_kwargs
            return self

    expected_kwargs = {'max_depth': 5, 'min_samples_leaf': 2}
    for est in [Pipeline([('estimator', TestDecisionTree())]),
                GridSearchCV(TestDecisionTree(), {})]:
        est.set_params(estimator__max_depth=5,
                       estimator__min_samples_leaf=2) 
Example #13
Source File: serializer.py    From gordo with GNU Affero General Public License v3.0
def load(source_dir: Union[os.PathLike, str]) -> Any:
    """
    Load an object from a directory, saved by
    ``gordo.serializer.pipeline_serializer.dump``

    This takes a directory which is either top-level, meaning it contains
    a sub directory in the naming scheme "n_step=<int>-class=<path.to.Class>",
    or is that naming-scheme directory itself. Returns the deserialized
    object.


    Parameters
    ----------
    source_dir: Union[os.PathLike, str]
        Location of the top-level directory in which the pipeline was saved

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
    """
    # This source dir should have a single pipeline entry directory.
    # may have been passed a top level dir, containing such an entry:
    with open(os.path.join(source_dir, "model.pkl"), "rb") as f:
        return pickle.load(f) 
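A hypothetical usage sketch (the path is illustrative; the directory only needs to contain the pickled model.pkl):

model = load("./model_dir")  # returns whatever object was dumped, e.g. a fitted Pipeline
preds = model.predict(X)     # assuming the deserialized object is a fitted estimator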
Example #14
Source File: model.py    From fake-news-detection with MIT License
def run(self):
        '''
        Runs a model with params p.
        '''
        self.clf.set_params(**self.params)
        # f = get_feature_transformer(self.parser)
        # self.X_train_fts = f.fit_transform(self.X_train)
        # self.X_test_fts = f.transform(self.X_test)
        self.pipeline = Pipeline([
            # ('feature_gen', f),
            ('clf', self.clf),
        ])
        self.y_pred_probs = self.pipeline.fit(self.X_train, self.y_train).predict_proba(self.X_test)[:, 1]
        if self.model_type in ['RF', 'ET', 'AB', 'GB', 'DT']:
            self.importances = self.clf.feature_importances_
        elif self.model_type in ['SVM', 'LR', 'SGD']:
            self.importances = self.clf.coef_[0] 
Example #15
Source File: data_utils.py    From CalibrationNN with GNU General Public License v3.0
def pca(self, **kwargs):
        if 'n_components' in kwargs:
            nComp = kwargs['n_components']
        else:
            nComp = 0.995

        if 'dates' in kwargs:
            mat = self.to_matrix(kwargs['dates'])
        else:
            mat = self.to_matrix()
        scaler = StandardScaler()
        pca = PCA(n_components=nComp)
        self._pipeline = Pipeline([('scaler', scaler), ('pca', pca)])
        self._pipeline.fit(mat)
        
        if 'file' in kwargs:
            tofile(kwargs['file'], self._pipeline)
        
        return self._pipeline 
Example #16
Source File: test_kernel_pca.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_gridsearch_pipeline_precomputed():
    # Test if we can do a grid-search to find parameters to separate
    # circles with a perceptron model using a precomputed kernel.
    X, y = make_circles(n_samples=400, factor=.3, noise=.05,
                        random_state=0)
    kpca = KernelPCA(kernel="precomputed", n_components=2)
    pipeline = Pipeline([("kernel_pca", kpca),
                         ("Perceptron", Perceptron(max_iter=5))])
    param_grid = dict(Perceptron__max_iter=np.arange(1, 5))
    grid_search = GridSearchCV(pipeline, cv=3, param_grid=param_grid)
    X_kernel = rbf_kernel(X, gamma=2.)
    grid_search.fit(X_kernel, y)
    assert_equal(grid_search.best_score_, 1)
Example #17
Source File: test_data.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_cv_pipeline_precomputed():
    # Cross-validate a regression on four coplanar points with the same
    # value. Use precomputed kernel to ensure Pipeline with KernelCenterer
    # is treated as a _pairwise operation.
    X = np.array([[3, 0, 0], [0, 3, 0], [0, 0, 3], [1, 1, 1]])
    y_true = np.ones((4,))
    K = X.dot(X.T)
    kcent = KernelCenterer()
    pipeline = Pipeline([("kernel_centerer", kcent), ("svr",
                        SVR(gamma='scale'))])

    # did the pipeline set the _pairwise attribute?
    assert pipeline._pairwise

    # test cross-validation, score should be almost perfect
    # NB: this test is pretty vacuous -- it's mainly to test integration
    #     of Pipeline and KernelCenterer
    y_pred = cross_val_predict(pipeline, K, y_true, cv=2)
    assert_array_almost_equal(y_true, y_pred) 
Example #18
Source File: common_utils.py    From interpret-text with MIT License
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)]) 
Example #19
Source File: skl_utils.py    From kaggle-HomeDepot with MIT License
def __init__(self, n_neighbors=5, weights='uniform', leaf_size=30, 
                metric='minkowski', normalize=True):
        if metric == 'cosine':
            metric = lambda x,y: dist_utils._cosine_sim(x, y)
        knn = sklearn.neighbors.KNeighborsRegressor(n_neighbors=n_neighbors, weights=weights, 
            leaf_size=leaf_size, metric=metric)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('knn', knn)])
        else:
            self.model = knn 
Example #20
Source File: image-classification.py    From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License
def accuracy(features, labels):
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline
    from sklearn.preprocessing import StandardScaler
    from sklearn.model_selection import LeaveOneOut, cross_val_score
    # We use logistic regression because it is very fast.
    # Feel free to experiment with other classifiers
    clf = Pipeline([('preproc', StandardScaler()),
                ('classifier', LogisticRegression())])
    cv = LeaveOneOut()
    scores = cross_val_score(
        clf, features, labels, cv=cv)
    return scores.mean() 
Example #21
Source File: test_search.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_grid_search_allows_nans():
    # Test GridSearchCV with SimpleImputer
    X = np.arange(20, dtype=np.float64).reshape(5, -1)
    X[2, :] = np.nan
    y = [0, 0, 1, 1, 1]
    p = Pipeline([
        ('imputer', SimpleImputer(strategy='mean', missing_values=np.nan)),
        ('classifier', MockClassifier()),
    ])
    GridSearchCV(p, {'classifier__foo_param': [1, 2, 3]}, cv=2).fit(X, y) 
Example #22
Source File: test_multiclass.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_ovr_pipeline():
    # Test with pipeline of length one
    # This test is needed because the multiclass estimators may fail to detect
    # the presence of predict_proba or decision_function.
    clf = Pipeline([("tree", DecisionTreeClassifier())])
    ovr_pipe = OneVsRestClassifier(clf)
    ovr_pipe.fit(iris.data, iris.target)
    ovr = OneVsRestClassifier(DecisionTreeClassifier())
    ovr.fit(iris.data, iris.target)
    assert_array_equal(ovr.predict(iris.data), ovr_pipe.predict(iris.data)) 
Example #23
Source File: test_text.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ('count', CountVectorizer(vocabulary=what_we_like)),
        ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert_equal(set(pipe.named_steps['count'].vocabulary_),
                 set(what_we_like))
    assert_equal(X.shape[1], len(what_we_like)) 
Example #24
Source File: test_base.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_is_classifier():
    svc = SVC()
    assert is_classifier(svc)
    assert is_classifier(GridSearchCV(svc, {'C': [0.1, 1]}))
    assert is_classifier(Pipeline([('svc', svc)]))
    assert is_classifier(Pipeline(
        [('svc_cv', GridSearchCV(svc, {'C': [0.1, 1]}))])) 
Example #25
Source File: skl_utils.py    From kaggle-HomeDepot with MIT License
def __init__(self, epsilon=0.0, C=1.0, loss='epsilon_insensitive', 
                random_state=None, normalize=True):
        lsvr = sklearn.svm.LinearSVR(epsilon=epsilon, C=C, 
                    loss=loss, random_state=random_state)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('lsvr', lsvr)])
        else:
            self.model = lsvr 
Example #26
Source File: skl_utils.py    From kaggle-HomeDepot with MIT License
def __init__(self, kernel='rbf', degree=3, gamma='auto', C=1.0, 
                epsilon=0.1, normalize=True, cache_size=2048):
        svr = sklearn.svm.SVR(kernel=kernel, degree=degree, 
                            gamma=gamma, C=C, epsilon=epsilon)
        if normalize:
            self.model = Pipeline([('ss', StandardScaler()), ('svr', svr)])
        else:
            self.model = svr 
Example #27
Source File: serializer.py    From gordo with GNU Affero General Public License v3.0
def loads(bytes_object: bytes) -> GordoBase:
    """
    Load a GordoBase model from bytes dumped from ``gordo.serializer.dumps``

    Parameters
    ----------
    bytes_object: bytes
        Bytes to be loaded, should be the result of `serializer.dumps(model)`

    Returns
    -------
    Union[GordoBase, Pipeline, BaseEstimator]
        Custom gordo model, scikit learn pipeline or other scikit learn like object.
    """
    return pickle.loads(bytes_object) 
Example #28
Source File: serializer.py    From gordo with GNU Affero General Public License v3.0
def dumps(model: Union[Pipeline, GordoBase]) -> bytes:
    """
    Dump a model into a bytes representation suitable for loading from
    ``gordo.serializer.loads``

    Parameters
    ----------
    model: Union[Pipeline, GordoBase]
        A gordo model/pipeline

    Returns
    -------
    bytes
        Serialized model which supports loading via ``serializer.loads()``

    Example
    -------
    >>> from gordo.machine.model.models import KerasAutoEncoder
    >>> from gordo import serializer
    >>>
    >>> model = KerasAutoEncoder('feedforward_symmetric')
    >>> serialized = serializer.dumps(model)
    >>> assert isinstance(serialized, bytes)
    >>>
    >>> model_clone = serializer.loads(serialized)
    >>> assert isinstance(model_clone, KerasAutoEncoder)
    """
    return pickle.dumps(model) 
Example #29
Source File: from_definition.py    From gordo with GNU Affero General Public License v3.0
def _build_scikit_branch(
    definition: Iterable[Union[str, Dict[Any, Any]]],
    constructor_class: Union[Type[Pipeline], None] = Pipeline,
):
    """
    Exactly like :func:`~_build_branch` except it's expected this is going to
    be a list of tuples, where the 0th element is the name of the step.
    """
    steps = [(f"step_{i}", _build_step(step)) for i, step in enumerate(definition)]
    return steps if constructor_class is None else constructor_class(steps) 
Example #30
Source File: model_io.py    From gordo with GNU Affero General Public License v3.0
def get_model_output(model: Pipeline, X: np.ndarray) -> np.ndarray:
    """
    Get the raw output from the current model given X.
    Will try to `predict` and then `transform`, raising an error
    if both fail.

    Parameters
    ----------
    model: Pipeline
        Fitted model or pipeline from which to get the output.
    X: np.ndarray
        2d array of sample(s)

    Returns
    -------
    np.ndarray
        The raw output of the model in numpy array form.
    """
    try:
        return model.predict(X)  # type: ignore

    # Model may only be a transformer
    except AttributeError:
        try:
            return model.transform(X)  # type: ignore
        except Exception as exc:
            logger.error(f"Failed to predict or transform; error: {exc}")
            raise