Python sklearn.linear_model.SGDClassifier() Examples

The following are code examples of sklearn.linear_model.SGDClassifier(), collected from open-source projects. Each example lists its source file, project, and license; consult the original project if you need the surrounding context. You may also want to check out the other available functions and classes of the sklearn.linear_model module.
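Before the project examples, here is a minimal, self-contained sketch of typical SGDClassifier usage; the synthetic dataset and all parameter values are illustrative and not taken from any of the projects below.

from sklearn.datasets import make_classification
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import train_test_split

# Synthetic binary classification problem (illustrative only)
X, y = make_classification(n_samples=200, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# SGDClassifier fits a linear model by stochastic gradient descent;
# the default loss="hinge" corresponds to a linear SVM.
clf = SGDClassifier(max_iter=1000, tol=1e-3, random_state=0)
clf.fit(X_train, y_train)
print(clf.score(X_test, y_test))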
Example #1
Source File: _validateSchema.py    From nyoka with Apache License 2.0
def test_validate_sklearn_sgd_with_text_cv(self):
        categories = ['alt.atheism','talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_CountVec_.pmml'
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features, target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example #2
Source File: _validateSchema.py    From nyoka with Apache License 2.0
def test_validate_sklearn_sgd_with_text(self):
        categories = ['alt.atheism','talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
        pipeline = Pipeline([
            ('vect', TfidfVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features, target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example #3
Source File: test_multioutput.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_multi_output_classification_partial_fit_parallelism():
    sgd_linear_clf = SGDClassifier(loss='log', random_state=1, max_iter=5)
    mor = MultiOutputClassifier(sgd_linear_clf, n_jobs=4)
    mor.partial_fit(X, y, classes)
    est1 = mor.estimators_[0]
    mor.partial_fit(X, y)
    est2 = mor.estimators_[0]
    if cpu_count() > 1:
        # parallelism requires this to be the case for a sane implementation
        assert est1 is not est2


Example #4
Source File: test_sgd.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_sgd_predict_proba_method_access(klass):
    # Checks that SGDClassifier predict_proba and predict_log_proba methods
    # can either be accessed or raise an appropriate error message
    # otherwise. See
    # https://github.com/scikit-learn/scikit-learn/issues/10938 for more
    # details.
    for loss in linear_model.SGDClassifier.loss_functions:
        clf = SGDClassifier(loss=loss)
        if loss in ('log', 'modified_huber'):
            assert hasattr(clf, 'predict_proba')
            assert hasattr(clf, 'predict_log_proba')
        else:
            message = ("probability estimates are not "
                       "available for loss={!r}".format(loss))
            assert not hasattr(clf, 'predict_proba')
            assert not hasattr(clf, 'predict_log_proba')
            with pytest.raises(AttributeError,
                               match=message):
                clf.predict_proba
            with pytest.raises(AttributeError,
                               match=message):
                clf.predict_log_proba 
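To see the gating this test asserts in practice, here is a toy sketch (not part of the test suite; the data is made up). Note that loss="log" was renamed to "log_loss" in scikit-learn 1.1, so adjust the spelling to your installed version.

import numpy as np
from sklearn.linear_model import SGDClassifier

X = np.array([[0.0], [1.0], [2.0], [3.0]])
y = np.array([0, 0, 1, 1])

# 'modified_huber' is a probabilistic loss, so predict_proba is exposed
proba_clf = SGDClassifier(loss="modified_huber", max_iter=100).fit(X, y)
print(proba_clf.predict_proba(X))

# the default 'hinge' loss is not, so the attribute lookup itself raises
hinge_clf = SGDClassifier(loss="hinge", max_iter=100).fit(X, y)
try:
    hinge_clf.predict_proba
except AttributeError as exc:
    print(exc)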
Example #5
Source File: test_sgd.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_l1_ratio():
    # Test if l1 ratio extremes match L1 and L2 penalty settings.
    X, y = datasets.make_classification(n_samples=1000,
                                        n_features=100, n_informative=20,
                                        random_state=1234)

    # test if elasticnet with l1_ratio near 1 gives same result as pure l1
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None,
                           max_iter=6, l1_ratio=0.9999999999,
                           random_state=42).fit(X, y)
    est_l1 = SGDClassifier(alpha=0.001, penalty='l1', max_iter=6,
                           random_state=42, tol=None).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l1.coef_)

    # test if elasticnet with l1_ratio near 0 gives same result as pure l2
    est_en = SGDClassifier(alpha=0.001, penalty='elasticnet', tol=None,
                           max_iter=6, l1_ratio=0.0000000001,
                           random_state=42).fit(X, y)
    est_l2 = SGDClassifier(alpha=0.001, penalty='l2', max_iter=6,
                           random_state=42, tol=None).fit(X, y)
    assert_array_almost_equal(est_en.coef_, est_l2.coef_) 
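For reference, the elastic net penalty that SGDClassifier minimizes is alpha * (l1_ratio * ||w||_1 + (1 - l1_ratio) * 0.5 * ||w||_2^2), per the scikit-learn user guide, so l1_ratio near 1 degenerates to pure L1 and l1_ratio near 0 to pure L2, which is exactly what the test checks. The toy helper below (not part of scikit-learn) just evaluates that expression at the two extremes.

import numpy as np

def elasticnet_penalty(w, alpha, l1_ratio):
    # Penalty value for a coefficient vector w
    l1 = np.sum(np.abs(w))
    l2 = 0.5 * np.sum(w ** 2)
    return alpha * (l1_ratio * l1 + (1.0 - l1_ratio) * l2)

w = np.array([0.5, -1.0, 2.0])
print(elasticnet_penalty(w, alpha=0.001, l1_ratio=1.0))  # pure L1 term
print(elasticnet_penalty(w, alpha=0.001, l1_ratio=0.0))  # pure L2 term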
Example #6
Source File: test_from_model.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_prefit():
    # Test all possible combinations of the prefit parameter.

    # Passing a prefit parameter with the selected model
    # and fitting a unfit model with prefit=False should give same results.
    clf = SGDClassifier(alpha=0.1, max_iter=10, shuffle=True,
                        random_state=0, tol=None)
    model = SelectFromModel(clf)
    model.fit(data, y)
    X_transform = model.transform(data)
    clf.fit(data, y)
    model = SelectFromModel(clf, prefit=True)
    assert_array_almost_equal(model.transform(data), X_transform)

    # Check that the model is rewritten if prefit=False and a fitted model is
    # passed
    model = SelectFromModel(clf, prefit=False)
    model.fit(data, y)
    assert_array_almost_equal(model.transform(data), X_transform)

    # Check that prefit=True and calling fit raises a ValueError
    model = SelectFromModel(clf, prefit=True)
    assert_raises(ValueError, model.fit, data, y) 
Example #7
Source File: SGDClassifier.py    From Splunking-Crime with GNU Affero General Public License v3.0
def __init__(self, options):
        self.handle_options(options)

        out_params = convert_params(
            options.get('params', {}),
            bools=['fit_intercept'],
            ints=['random_state', 'n_iter'],
            floats=['l1_ratio', 'alpha', 'eta0', 'power_t'],
            strs=['loss', 'penalty', 'learning_rate'],
        )

        if 'loss' in out_params:
            try:
                assert (out_params['loss'] in ['hinge', 'log', 'modified_huber', 'squared_hinge', 'perceptron'])
            except AssertionError:
                raise RuntimeError(
                    'Value for parameter "loss" has to be one of "hinge", "log", "modified_huber", "squared_hinge", or "perceptron"')

        self.scaler = StandardScaler()
        self.estimator = _SGDClassifier(**out_params) 
Example #8
Source File: linear_trainers.py    From ReAgent with BSD 3-Clause "New" or "Revised" License
def train(self, data: TrainingData, iterations: int = 1, num_samples: int = 0):
        logging.info("SGDClassifierTrainer.train...")
        self._model = None
        best_score = float("-inf")
        for _ in range(iterations):
            x, y, _ = super()._sample(
                data.train_x, data.train_y, data.train_weight, num_samples, True
            )
            sx, sy, ssw = super()._sample(
                data.validation_x, data.validation_y, data.validation_weight
            )
            for alpha in np.logspace(-8, -1, num=8, base=10):
                model = SGDClassifier(
                    loss=self._loss,
                    alpha=alpha,
                    random_state=0,
                    max_iter=self._max_iter,
                )
                model.fit(x, y)
                score = model.score(sx, sy, ssw)
                logging.info(f"  alpha: {alpha}, score: {score}")
                if score > best_score:
                    best_score = score
                    self._model = model 
Example #9
Source File: ensemble_glm.py    From jh-kaggle-util with Apache License 2.0
def fit_ensemble(x,y):
    fit_type = jhkaggle.jhkaggle_config['FIT_TYPE']
    if 1:
        if fit_type == jhkaggle.const.FIT_TYPE_BINARY_CLASSIFICATION:
            blend = SGDClassifier(loss="log", penalty="elasticnet")  # LogisticRegression()
        else:
            # blend = SGDRegressor()
            #blend = LinearRegression()
            #blend = RandomForestRegressor(n_estimators=10, n_jobs=-1, max_depth=5, criterion='mae')
            blend = LassoLarsCV(normalize=True)
            #blend = ElasticNetCV(normalize=True)
            #blend = LinearRegression(normalize=True)
        blend.fit(x, y)
    else:
        blend = LogisticRegression()
        blend.fit(x, y)


    return blend 
Example #10
Source File: test_multioutput.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_multi_output_predict_proba():
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    param = {'loss': ('hinge', 'log', 'modified_huber')}

    # inner function for custom scoring
    def custom_scorer(estimator, X, y):
        if hasattr(estimator, "predict_proba"):
            return 1.0
        else:
            return 0.0
    grid_clf = GridSearchCV(sgd_linear_clf, param_grid=param,
                            scoring=custom_scorer, cv=3, error_score=np.nan)
    multi_target_linear = MultiOutputClassifier(grid_clf)
    multi_target_linear.fit(X, y)

    multi_target_linear.predict_proba(X)

    # SGDClassifier defaults to loss='hinge' which is not a probabilistic
    # loss function; therefore it does not expose a predict_proba method
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=5, tol=1e-3)
    multi_target_linear = MultiOutputClassifier(sgd_linear_clf)
    multi_target_linear.fit(X, y)
    err_msg = "The base estimator should implement predict_proba method"
    with pytest.raises(ValueError, match=err_msg):
        multi_target_linear.predict_proba(X)


Example #11
Source File: test_multioutput.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_multi_output_classification_partial_fit_sample_weights():
    # weighted classifier
    Xw = [[1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    yw = [[3, 2], [2, 3], [3, 2]]
    w = np.asarray([2., 1., 1.])
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
    clf_w = MultiOutputClassifier(sgd_linear_clf)
    clf_w.fit(Xw, yw, w)

    # unweighted, but with repeated samples
    X = [[1, 2, 3], [1, 2, 3], [4, 5, 6], [1.5, 2.5, 3.5]]
    y = [[3, 2], [3, 2], [2, 3], [3, 2]]
    sgd_linear_clf = SGDClassifier(random_state=1, max_iter=20)
    clf = MultiOutputClassifier(sgd_linear_clf)
    clf.fit(X, y)
    X_test = [[1.5, 2.5, 3.5]]
    assert_array_almost_equal(clf.predict(X_test), clf_w.predict(X_test)) 
Example #12
Source File: _test_file_stream_multiple_cfier.py    From scikit-multiflow with BSD 3-Clause "New" or "Revised" License
def demo():

    # The classifiers we will compare: HoeffdingTreeClassifier, SAMKNNClassifier, LeveragingBaggingClassifier, SGDClassifier
    h1 = [HoeffdingTreeClassifier(), SAMKNNClassifier(), LeveragingBaggingClassifier(random_state=1), SGDClassifier()]
    h2 = [HoeffdingTreeClassifier(), SAMKNNClassifier(), LeveragingBaggingClassifier(random_state=1), SGDClassifier()]
    h3 = [HoeffdingTreeClassifier(), SAMKNNClassifier(), LeveragingBaggingClassifier(random_state=1), SGDClassifier()]
    model_names = ['HT', 'SAMKNNClassifier', 'LBkNN', 'SGDC']

    # Demo 1 -- plot should not fail
    demo_parameterized(h1, model_names=model_names)

    # Demo 2 -- csv output should look nice
    demo_parameterized(h2, "sea_stream.csv", False, model_names)

    # Demo 3 -- should not give "'NoneType' object is not iterable" error
    demo_parameterized(h3, "covtype.csv", False, model_names) 
Example #13
Source File: test_robust_weighted_estimator.py    From scikit-learn-extra with BSD 3-Clause "New" or "Revised" License
def test_predict_proba(weighting):
    clf = RobustWeightedEstimator(
        SGDClassifier(loss="log"),
        loss="log",
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        random_state=rng,
    )
    clf_not_rob = SGDClassifier(loss="log", random_state=rng)
    clf.fit(X_c, y_c)
    clf_not_rob.fit(X_c, y_c)
    pred1 = clf.base_estimator_.predict_proba(X_c)[:, 1]
    pred2 = clf_not_rob.predict_proba(X_c)[:, 1]

    assert (
        np.linalg.norm(pred1 - pred2) / np.linalg.norm(pred2)
        - np.linalg.norm(pred1 - y_c) / np.linalg.norm(y_c)
        < 0.1
    )


Example #14
Source File: test_robust_weighted_estimator.py    From scikit-learn-extra with BSD 3-Clause "New" or "Revised" License
def test_not_robust_classif(loss, weighting):
    clf = RobustWeightedEstimator(
        SGDClassifier(),
        loss=loss,
        max_iter=100,
        weighting=weighting,
        k=0,
        c=1e7,
        burn_in=0,
        random_state=rng,
    )
    clf_not_rob = SGDClassifier(loss=loss, random_state=rng)
    clf.fit(X_c, y_c)
    clf_not_rob.fit(X_c, y_c)
    pred1 = clf.base_estimator_.decision_function(X_c)
    pred2 = clf_not_rob.decision_function(X_c)

    assert (
        np.linalg.norm(pred1 - pred2) / np.linalg.norm(pred2)
        - np.linalg.norm(pred1 - y_c) / np.linalg.norm(y_c)
        < 0.1
    )


# Case "log" loss, test predict_proba 
Example #15
Source File: test_robust_weighted_estimator.py    From scikit-learn-extra with BSD 3-Clause "New" or "Revised" License
def test_corrupted_classif(loss, weighting):
    clf = RobustWeightedEstimator(
        SGDClassifier(),
        loss=loss,
        max_iter=50,
        weighting=weighting,
        k=5,
        c=None,
        random_state=rng,
    )
    clf.fit(X_cc, y_cc)
    score = clf.score(X_cc, y_cc)
    assert score > 0.75


Example #16
Source File: test_logistic.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_elastic_net_versus_sgd(C, l1_ratio):
    # Compare elasticnet penalty in LogisticRegression() and SGD(loss='log')
    n_samples = 500
    X, y = make_classification(n_samples=n_samples, n_classes=2, n_features=5,
                               n_informative=5, n_redundant=0, n_repeated=0,
                               random_state=1)
    X = scale(X)

    sgd = SGDClassifier(
        penalty='elasticnet', random_state=1, fit_intercept=False, tol=-np.inf,
        max_iter=2000, l1_ratio=l1_ratio, alpha=1. / C / n_samples, loss='log')
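    # The choice alpha = 1 / (C * n_samples) above is what makes the two
    # objectives comparable: LogisticRegression weights the data-fit term
    # by C, while SGD weights the penalty term by alpha.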
    log = LogisticRegression(
        penalty='elasticnet', random_state=1, fit_intercept=False, tol=1e-5,
        max_iter=1000, l1_ratio=l1_ratio, C=C, solver='saga')

    sgd.fit(X, y)
    log.fit(X, y)
    assert_array_almost_equal(sgd.coef_, log.coef_, decimal=1) 
Example #17
Source File: text_classifier.py    From textar with MIT License
def make_classifier(self, name, ids, labels):
        """Entrenar un clasificador SVM sobre los textos cargados.

        Crea un clasificador que se guarda en el objeto bajo el nombre `name`.

        Args:
            name (str): Nombre para el clasidicador.
            ids (list): Se espera una lista de N ids de textos ya almacenados
                en el TextClassifier.
            labels (list): Se espera una lista de N etiquetas. Una por cada id
                de texto presente en ids.
        Nota:
            Usa el clasificador de `Scikit-learn <http://scikit-learn.org/>`_
        """
        if not all(np.in1d(ids, self.ids)):
            raise ValueError("Hay ids de textos que no se encuentran \
                              almacenados.")
        setattr(self, name, SGDClassifier())
        classifier = getattr(self, name)
        indices = np.searchsorted(self.ids, ids)
        classifier.fit(self.tfidf_mat[indices, :], labels) 
Example #18
Source File: language_detector.py    From text-mining-class with MIT License
def build_language_classifier(texts, labels, verbose=False, random_state=None):
    """Train a text classifier with scikit-learn

    The text classifier is composed of two elements assembled in a pipeline:

    - A text feature extractor (`TfidfVectorizer`) that extract the relative
      frequencies of unigrams, bigrams and trigrams of characters in the text.

    - An instance of `SGDClassifier` for the classification it-self. To speed
      up training it is recommended to enable early stopping.

    `random_state` is passed to the underlying `SGDClassifier` instance.
    """
    language_classifier = make_pipeline(
        TfidfVectorizer(analyzer="char", ngram_range=(1, 3),
                        min_df=2, max_df=0.9, norm="l2", dtype=np.float32),
        SGDClassifier(early_stopping=True, validation_fraction=0.2,
                      n_iter_no_change=3, max_iter=1000, tol=1e-3,
                      alpha=1e-5, penalty="l2", verbose=verbose,
                      random_state=random_state)
    )
    return language_classifier.fit(texts, labels) 
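A hypothetical usage sketch; the texts and labels below are made up for illustration:

texts = (["the cat sat on the mat", "a dog in the house"] * 10
         + ["le chat est sur le tapis", "un chien dans la maison"] * 10)
labels = ["en"] * 20 + ["fr"] * 20

clf = build_language_classifier(texts, labels, random_state=0)
print(clf.predict(["un petit chien"]))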
Example #19
Source File: testScoreWithAdapaSklearn.py    From nyoka with Apache License 2.0
def test_09_sgd_classifier(self):
        print("\ntest 09 (SGD Classifier with preprocessing) [multi-class]\n")
        X, X_test, y, features, target, test_file = self.data_utility.get_data_for_multi_class_classification()

        model = SGDClassifier(loss="log")
        pipeline_obj = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])
        pipeline_obj.fit(X,y)
        file_name = 'test09sklearn.pmml'
        
        skl_to_pmml(pipeline_obj, features, target, file_name)
        model_name  = self.adapa_utility.upload_to_zserver(file_name)
        predictions, probabilities = self.adapa_utility.score_in_zserver(model_name, test_file)
        model_pred = pipeline_obj.predict(X_test)
        model_prob = pipeline_obj.predict_proba(X_test)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
        self.assertEqual(self.adapa_utility.compare_probability(probabilities, model_prob), True) 
Example #20
Source File: test_incremental.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_incremental_text_pipeline(container):
    X = pd.Series(["a list", "of words", "for classification"] * 100)
    X = dd.from_pandas(X, npartitions=3)

    if container == "bag":
        X = X.to_bag()

    y = da.from_array(np.array([0, 0, 1] * 100), chunks=(100,) * 3)

    assert tuple(X.map_partitions(len).compute()) == y.chunks[0]

    sgd = SGDClassifier(max_iter=5, tol=1e-3)
    clf = Incremental(sgd, scoring="accuracy", assume_equal_chunks=True)
    vect = dask_ml.feature_extraction.text.HashingVectorizer()
    pipe = make_pipeline(vect, clf)

    pipe.fit(X, y, incremental__classes=[0, 1])
    X2 = pipe.steps[0][1].transform(X)
    assert hasattr(clf, "coef_")

    X2.compute_chunk_sizes()
    assert X2.shape == (300, vect.n_features) 
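As a rough sketch of the pattern this test exercises, assuming dask.array inputs, Incremental feeds each chunk of the data to the wrapped estimator's partial_fit:

import dask.array as da
import numpy as np
from dask_ml.wrappers import Incremental
from sklearn.linear_model import SGDClassifier

X = da.random.random((300, 5), chunks=(100, 5))
y = da.from_array(np.array([0, 1] * 150), chunks=(100,))

inc = Incremental(SGDClassifier(max_iter=5, tol=1e-3), scoring="accuracy")
inc.fit(X, y, classes=[0, 1])  # one partial_fit call per chunk
print(inc.coef_)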
Example #21
Source File: testScoreWithAdapaSklearn.py    From nyoka with Apache License 2.0
def test_10_sgd_classifier(self):
        print("\ntest 10 (SGD Classifier with preprocessing) [binary-class]\n")
        X, X_test, y, features, target, test_file = self.data_utility.get_data_for_binary_classification()

        model = SGDClassifier(loss="log")
        pipeline_obj = Pipeline([
            ("scaler", StandardScaler()),
            ("model", model)
        ])
        pipeline_obj.fit(X,y)
        file_name = 'test10sklearn.pmml'
        
        skl_to_pmml(pipeline_obj, features, target, file_name)
        model_name  = self.adapa_utility.upload_to_zserver(file_name)
        predictions, probabilities = self.adapa_utility.score_in_zserver(model_name, test_file)
        model_pred = pipeline_obj.predict(X_test)
        model_prob = pipeline_obj.predict_proba(X_test)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
        self.assertEqual(self.adapa_utility.compare_probability(probabilities, model_prob), True) 
Example #22
Source File: svm.py    From opentc with MIT License
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9) 
Example #23
Source File: test_incremental.py    From dask-ml with BSD 3-Clause "New" or "Revised" License 5 votes vote down vote up
def test_in_gridsearch(scheduler, xy_classification):
    X, y = xy_classification
    clf = Incremental(SGDClassifier(random_state=0, tol=1e-3))
    param_grid = {"estimator__alpha": [0.1, 10]}
    gs = sklearn.model_selection.GridSearchCV(clf, param_grid, cv=3)

    with scheduler() as (s, [a, b]):
        gs.fit(X, y, classes=[0, 1]) 
Example #24
Source File: Classifier.py    From tatk with Apache License 2.0
def train(self, X, y):
        model = SGDClassifier(loss="log", penalty="l2")
        model.probability=True
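        # Note: SGDClassifier has no 'probability' parameter (SVC does), so
        # the line above only sets an unused attribute; predict_proba is
        # available here because loss="log" is a probabilistic loss.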
        model.fit(X,y)
        self.model = model 
Example #25
Source File: test_incremental.py    From dask-ml with BSD 3-Clause "New" or "Revised" License
def test_fit_ndarrays():
    X = np.ones((10, 5))
    y = np.concatenate([np.zeros(5), np.ones(5)])

    sgd = SGDClassifier(tol=1e-3)
    inc = Incremental(sgd)

    inc.partial_fit(X, y, classes=[0, 1])
    sgd.fit(X, y)

    assert inc.estimator is sgd
    assert_eq(inc.coef_, inc.estimator_.coef_) 
Example #26
Source File: run.py    From LeadQualifier with MIT License
def runSGD(X_train, y_train):
    sgd = SGDClassifier(n_iter=500, loss='modified_huber', penalty='elasticnet', random_state=42)
    sgd.fit(X_train, y_train)
    return sgd 
Example #27
Source File: model.py    From polyaxon-examples with Apache License 2.0
def model(X, y, loss, penalty, l1_ratio, max_iter, tol):
    classifier = SGDClassifier(
        loss=loss,
        penalty=penalty,
        l1_ratio=l1_ratio,
        max_iter=max_iter,
        tol=tol,
    )
    return cross_val_score(classifier, X, y, cv=5) 
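A hypothetical call, assuming X and y hold a feature matrix and label vector; all parameter values are illustrative only:

scores = model(X, y, loss="log", penalty="elasticnet", l1_ratio=0.15,
               max_iter=1000, tol=1e-3)
print(scores.mean(), scores.std())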
Example #28
Source File: sgd_separator.py    From sklearn_pydata2015 with BSD 3-Clause "New" or "Revised" License
def plot_sgd_separator():
    # we create 50 separable points
    X, Y = make_blobs(n_samples=50, centers=2,
                      random_state=0, cluster_std=0.60)

    # fit the model
    clf = SGDClassifier(loss="hinge", alpha=0.01,
                        n_iter=200, fit_intercept=True)
    clf.fit(X, Y)

    # plot the line, the points, and the nearest vectors to the plane
    xx = np.linspace(-1, 5, 10)
    yy = np.linspace(-1, 5, 10)

    X1, X2 = np.meshgrid(xx, yy)
    Z = np.empty(X1.shape)
    for (i, j), val in np.ndenumerate(X1):
        x1 = val
        x2 = X2[i, j]
        p = clf.decision_function([x1, x2])
        Z[i, j] = p[0]
    levels = [-1.0, 0.0, 1.0]
    linestyles = ['dashed', 'solid', 'dashed']
    colors = 'k'

    ax = plt.axes()
    ax.contour(X1, X2, Z, levels, colors=colors, linestyles=linestyles)
    ax.scatter(X[:, 0], X[:, 1], c=Y, cmap=plt.cm.Paired)

    ax.axis('tight') 
Example #29
Source File: sentiment_analysis_ml.py    From Sentiment_Analysis_cnn_lstm_cnnlstm_textcnn_bilstm with Apache License 2.0
def SGD_classifier(train_vecs,y_train,test_vecs,y_test):
    clf = SGDClassifier(alpha=0.001, max_iter=100)
    clf.fit(train_vecs,y_train)
    joblib.dump(clf,storedpaths+'model_sgd.pkl')
    test_scores=clf.score(test_vecs,y_test)
    return test_scores
    