Python sklearn.naive_bayes.MultinomialNB() Examples
The following are 30 code examples of sklearn.naive_bayes.MultinomialNB(), drawn from open-source projects. The source file, originating project, and license are noted above each example. You may also want to check out the other functions and classes available in the sklearn.naive_bayes module.
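Before the project-specific examples below, here is a minimal, self-contained sketch of typical MultinomialNB usage on bag-of-words count features. The toy documents, labels, and pipeline step names are illustrative assumptions and are not taken from any of the projects listed.

# Minimal sketch (illustrative data): MultinomialNB on count features.
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

docs = ["spam spam offer", "meeting tomorrow", "cheap offer now", "project meeting notes"]
labels = [1, 0, 1, 0]  # hypothetical labels: 1 = spam, 0 = ham

# CountVectorizer produces the non-negative count features MultinomialNB expects;
# alpha is the additive (Laplace/Lidstone) smoothing parameter.
clf = Pipeline([("vect", CountVectorizer()),
                ("nb", MultinomialNB(alpha=1.0))])
clf.fit(docs, labels)
print(clf.predict(["free offer"]))  # e.g. array([1])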
Example #1
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 9 votes |
def test_ovr_multilabel():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 4, 5], [0, 5, 0], [3, 3, 3], [4, 0, 6], [6, 0, 0]])
    y = np.array([[0, 1, 1], [0, 1, 0], [1, 1, 1], [1, 0, 1], [1, 0, 0]])

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(),
                     ElasticNet(), Lasso(alpha=0.5)):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        y_pred = clf.predict([[0, 4, 4]])[0]
        assert_array_equal(y_pred, [0, 1, 1])
        assert clf.multilabel_
Example #2
Source File: test_naive_bayes.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_mnb_prior_unobserved_targets():
    # test smoothing of prior for yet unobserved targets

    # Create toy training data
    X = np.array([[0, 1], [1, 0]])
    y = np.array([0, 1])

    clf = MultinomialNB()

    assert_no_warnings(
        clf.partial_fit, X, y, classes=[0, 1, 2]
    )

    assert clf.predict([[0, 1]]) == 0
    assert clf.predict([[1, 0]]) == 1
    assert clf.predict([[1, 1]]) == 0

    # add a training example with previously unobserved class
    assert_no_warnings(
        clf.partial_fit, [[1, 1]], [2]
    )

    assert clf.predict([[0, 1]]) == 0
    assert clf.predict([[1, 0]]) == 1
    assert clf.predict([[1, 1]]) == 2
Example #3
Source File: test_sklearn_calibrated_classifier_cv_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_model_calibrated_classifier_cv_binary(self):
    data = load_iris()
    X, y = data.data, data.target
    y[y > 1] = 1
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model, "scikit-learn CalibratedClassifierCV",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32), model, model_onnx,
        basename="SklearnCalibratedClassifierCVBinaryMNB",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example #4
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_ovr_multiclass():
    # Toy dataset where features correspond directly to labels.
    X = np.array([[0, 0, 5], [0, 5, 0], [3, 0, 0], [0, 0, 6], [6, 0, 0]])
    y = ["eggs", "spam", "ham", "eggs", "ham"]
    Y = np.array([[0, 0, 1], [0, 1, 0], [1, 0, 0], [0, 0, 1], [1, 0, 0]])

    classes = set("ham eggs spam".split())

    for base_clf in (MultinomialNB(), LinearSVC(random_state=0),
                     LinearRegression(), Ridge(), ElasticNet()):
        clf = OneVsRestClassifier(base_clf).fit(X, y)
        assert_equal(set(clf.classes_), classes)
        y_pred = clf.predict(np.array([[0, 0, 4]]))[0]
        assert_array_equal(y_pred, ["eggs"])

        # test input as label indicator matrix
        clf = OneVsRestClassifier(base_clf).fit(X, Y)
        y_pred = clf.predict([[0, 0, 4]])[0]
        assert_array_equal(y_pred, [0, 0, 1])
Example #5
Source File: test_base.py From scikit-multilearn with BSD 2-Clause "Simplified" License | 6 votes |
def test_model_selection_works(self):
    for x, y in self.get_multilabel_data_for_tests('dense'):
        parameters = {
            'classifier': [LabelPowerset(), BinaryRelevance()],
            'clusterer': [RandomLabelSpaceClusterer(None, None, False)],
            'clusterer__cluster_size': list(range(2, 3)),
            'clusterer__cluster_count': [3],
            'clusterer__allow_overlap': [False],
            'classifier__classifier': [MultinomialNB()],
            'classifier__classifier__alpha': [0.7, 1.0],
        }

        clf = GridSearchCV(LabelSpacePartitioningClassifier(), parameters,
                           scoring='f1_macro')
        clf.fit(x, y)

        for p in list(parameters.keys()):
            self.assertIn(p, clf.best_params_)

        self.assertIsNotNone(clf.best_score_)
Example #6
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_ovr_fit_predict():
    # A classifier which implements decision_function.
    ovr = OneVsRestClassifier(LinearSVC(random_state=0))
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(len(ovr.estimators_), n_classes)

    clf = LinearSVC(random_state=0)
    pred2 = clf.fit(iris.data, iris.target).predict(iris.data)
    assert_equal(np.mean(iris.target == pred), np.mean(iris.target == pred2))

    # A classifier which implements predict_proba.
    ovr = OneVsRestClassifier(MultinomialNB())
    pred = ovr.fit(iris.data, iris.target).predict(iris.data)
    assert_greater(np.mean(iris.target == pred), 0.65)

# 0.23. warning about tol not having its correct default value.
Example #7
Source File: test_sklearn_calibrated_classifier_cv_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_model_calibrated_classifier_cv_int(self):
    data = load_digits()
    X, y = data.data, data.target
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model, "scikit-learn CalibratedClassifierCVMNB",
        [("input", Int64TensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.int64), model, model_onnx,
        basename="SklearnCalibratedClassifierCVInt-Dec4",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example #8
Source File: test_naive_bayes.py From sparkit-learn with Apache License 2.0 | 6 votes |
def test_same_prediction(self):
    X, y, Z = self.make_classification(4, 100000, nonnegative=True)

    local = MultinomialNB()
    dist = SparkMultinomialNB()

    y_local = local.fit(X, y).predict(X)
    y_dist = dist.fit(Z, classes=np.unique(y)).predict(Z[:, 'X'])
    y_converted = dist.to_scikit().predict(X)

    assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
    assert_array_almost_equal(y_local, y_dist.toarray())
    assert_array_almost_equal(y_local, y_converted)

    y_proba_local = local.fit(X, y).predict_proba(X)
    y_proba_dist = dist.fit(Z, classes=np.unique(y)).predict_proba(Z[:, 'X'])
    y_proba_converted = dist.to_scikit().predict_proba(X)

    assert_true(check_rdd_dtype(y_dist, (np.ndarray,)))
    assert_array_almost_equal(y_proba_local, y_proba_dist.toarray(), 5)
    assert_array_almost_equal(y_proba_local, y_proba_converted, 5)
Example #9
Source File: test_grid_search.py From sparkit-learn with Apache License 2.0 | 6 votes |
def test_same_result(self):
    X, y, Z = self.make_classification(2, 40000, nonnegative=True)

    parameters = {'alpha': [0.1, 1, 10]}
    fit_params = {'classes': np.unique(y)}

    local_estimator = MultinomialNB()
    local_grid = GridSearchCV(estimator=local_estimator,
                              param_grid=parameters)

    estimator = SparkMultinomialNB()
    grid = SparkGridSearchCV(estimator=estimator,
                             param_grid=parameters,
                             fit_params=fit_params)

    local_grid.fit(X, y)
    grid.fit(Z)

    locscores = [r.mean_validation_score for r in local_grid.grid_scores_]
    scores = [r.mean_validation_score for r in grid.grid_scores_]

    assert_array_almost_equal(locscores, scores, decimal=2)
Example #10
Source File: meta_des.py From DESlib with BSD 3-Clause "New" or "Revised" License | 6 votes |
def _fit_meta_classifier(self, X_meta, y_meta):
    """Train the meta-classifier :math:`\\lambda`, using
    the meta-training dataset.

    Parameters
    ----------
    X_meta : array of shape = [n_meta_examples, n_meta_features]
        The meta-training examples.

    y_meta : array of shape = [n_meta_examples]
        Class labels of each example in X_test. 1 whether the base
        classifier made the correct prediction, otherwise 0.
    """
    if isinstance(self.meta_classifier_, MultinomialNB):
        # Digitize the data (Same implementation we have on PRTools)
        X_meta = np.digitize(X_meta, np.linspace(0.1, 1, 10))

    self.meta_classifier_.fit(X_meta, y_meta)
Example #11
Source File: ml.py From vector-homomorphic-encryption with MIT License | 6 votes |
def trainNB(trainX, trainY, testX, testY, samples, limit):
    start = time.clock()
    clf = MultinomialNB()
    clf.fit(trainX[:samples], trainY[:samples])
    print time.clock() - start

    start = time.clock()
    predicted = clf.predict(trainX[0:samples])
    print "percent Trained correct: ", percentCorrect(trainY[:samples], predicted)
    print "f-score: ", f1_score(trainY[:samples], predicted)
    metric = precision_recall_fscore_support(trainY[:samples], predicted)
    print "precision: ", metric[0]
    print "recall: ", metric[1]

    predicted = clf.predict(testX[0:limit])
    print "percent Test correct: ", percentCorrect(testY[:limit], predicted)
    print "f-score: ", f1_score(testY[:limit], predicted)
    metric = precision_recall_fscore_support(testY[:limit], predicted)
    print "precision: ", metric[0]
    print "recall: ", metric[1]
    print time.clock() - start

    return clf
Example #12
Source File: test_multiclass.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_ovr_single_label_predict_proba():
    base_clf = MultinomialNB(alpha=1)
    X, Y = iris.data, iris.target
    X_train, Y_train = X[:80], Y[:80]
    X_test = X[80:]
    clf = OneVsRestClassifier(base_clf).fit(X_train, Y_train)

    # Decision function only estimator.
    decision_only = OneVsRestClassifier(svm.SVR(gamma='scale')
                                        ).fit(X_train, Y_train)
    assert not hasattr(decision_only, 'predict_proba')

    Y_pred = clf.predict(X_test)
    Y_proba = clf.predict_proba(X_test)

    assert_almost_equal(Y_proba.sum(axis=1), 1.0)
    # predict assigns a label if the probability that the
    # sample has the label is greater than 0.5.
    pred = np.array([l.argmax() for l in Y_proba])
    assert not (pred - Y_pred).any()
Example #13
Source File: test_naive_bayes.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_discretenb_pickle():
    # Test picklability of discrete naive Bayes classifiers

    for cls in [BernoulliNB, MultinomialNB, GaussianNB]:
        clf = cls().fit(X2, y2)
        y_pred = clf.predict(X2)

        store = BytesIO()
        pickle.dump(clf, store)

        clf = pickle.load(BytesIO(store.getvalue()))
        assert_array_equal(y_pred, clf.predict(X2))

        if cls is not GaussianNB:
            # TODO re-enable me when partial_fit is implemented for GaussianNB

            # Test pickling of estimator trained with partial_fit
            clf2 = cls().partial_fit(X2[:3], y2[:3], classes=np.unique(y2))
            clf2.partial_fit(X2[3:], y2[3:])
            store = BytesIO()
            pickle.dump(clf2, store)
            clf2 = pickle.load(BytesIO(store.getvalue()))
            assert_array_equal(y_pred, clf2.predict(X2))
Example #14
Source File: test_sklearn_calibrated_classifier_cv_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_model_calibrated_classifier_cv_float_nozipmap(self):
    data = load_iris()
    X, y = data.data, data.target
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model, "scikit-learn CalibratedClassifierCVMNB",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET,
        options={id(model): {'zipmap': False}})
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32), model, model_onnx,
        basename="SklearnCalibratedClassifierCVFloatNoZipMap",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')")
Example #15
Source File: test_sklearn_calibrated_classifier_cv_converter.py From sklearn-onnx with MIT License | 6 votes |
def test_model_calibrated_classifier_cv_float(self):
    data = load_iris()
    X, y = data.data, data.target
    clf = MultinomialNB().fit(X, y)
    model = CalibratedClassifierCV(clf, cv=2, method="sigmoid").fit(X, y)
    model_onnx = convert_sklearn(
        model, "scikit-learn CalibratedClassifierCVMNB",
        [("input", FloatTensorType([None, X.shape[1]]))],
        target_opset=TARGET_OPSET
    )
    self.assertTrue(model_onnx is not None)
    dump_data_and_model(
        X.astype(np.float32), model, model_onnx,
        basename="SklearnCalibratedClassifierCVFloat",
        allow_failure="StrictVersion(onnxruntime.__version__)"
                      "<= StrictVersion('0.2.1')",
    )
Example #16
Source File: 04_sent.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 6 votes |
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.iteritems():
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor,
                                   analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion(
        [('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Example #17
Source File: mp_train.py From atap with Apache License 2.0 | 6 votes |
def fit_naive_bayes(path, saveto=None, cv=12):
    model = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', TfidfVectorizer(tokenizer=identity, lowercase=False)),
        ('clf', MultinomialNB())
    ])

    if saveto is None:
        saveto = "naive_bayes_{}.pkl".format(time.time())

    scores, delta = train_model(path, model, saveto, cv)
    logger.info((
        "naive bayes training took {:0.2f} seconds "
        "with an average score of {:0.3f}"
    ).format(delta, scores.mean()))
Example #18
Source File: multinomial_nb_clf.py From 2020plus with Apache License 2.0 | 6 votes |
def __init__(self, df, weight=True, min_ct=0, total_iter=5):
    self.logger = logging.getLogger(__name__)
    super(MultinomialNaiveBayes, self).__init__(total_iterations=total_iter)  # call base constructor
    # self.set_min_count(min_ct)
    self.is_weighted_sample = weight

    # process data
    # df = self._filter_rows(df)  # filter out low count rows
    # row_sums = df.sum(axis=1).astype(float)
    # df = df.div(row_sums, axis=0)  # normalize each row
    # df = df.mul(100)
    # df.to_csv('tmp.nbclf.txt', sep='\t')
    df = df.fillna(df.mean())
    total = df['total']
    df = df[['recurrent missense', 'recurrent indel', 'frame shift',
             'nonsense', 'missense', 'synonymous', 'inframe indel',
             'no protein', 'lost stop', 'splicing mutation']]
    df = df.mul(total, axis=0).astype(int)  # get back counts instead of pct
    self.x, self.y = features.randomize(df)

    # setup classifier
    self.clf = MultinomialNB(alpha=1,  # laplacian smooth, i.e. pseudocounts
                             fit_prior=True)  # use data for prior class probs
Example #19
Source File: test_naive_bayes.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_discretenb_provide_prior_with_partial_fit():
    # Test whether discrete NB classes use provided prior
    # when using partial_fit

    iris = load_iris()
    iris_data1, iris_data2, iris_target1, iris_target2 = train_test_split(
        iris.data, iris.target, test_size=0.4, random_state=415)

    for cls in [BernoulliNB, MultinomialNB]:
        for prior in [None, [0.3, 0.3, 0.4]]:
            clf_full = cls(class_prior=prior)
            clf_full.fit(iris.data, iris.target)
            clf_partial = cls(class_prior=prior)
            clf_partial.partial_fit(iris_data1, iris_target1,
                                    classes=[0, 1, 2])
            clf_partial.partial_fit(iris_data2, iris_target2)
            assert_array_almost_equal(clf_full.class_log_prior_,
                                      clf_partial.class_log_prior_)
Example #20
Source File: hybrid_nb.py From Jacinle with MIT License | 6 votes |
def __init__(self, distributions, weights=None, **kwargs):
    self.models = []
    for dist in distributions:
        dist = NaiveBayesianDistribution.from_string(dist)
        if dist is NaiveBayesianDistribution.GAUSSIAN:
            model = nb.GaussianNB(**kwargs)
        elif dist is NaiveBayesianDistribution.MULTINOMIAL:
            model = nb.MultinomialNB(**kwargs)
        elif dist is NaiveBayesianDistribution.BERNOULLI:
            model = nb.BernoulliNB(**kwargs)
        else:
            raise ValueError('Unknown distribution: {}.'.format(dist))
        kwargs['fit_prior'] = False  # Except the first model.
        self.models.append(model)
    self.weights = weights
Example #21
Source File: test_naive_bayes.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_discretenb_uniform_prior():
    # Test whether discrete NB classes fit a uniform prior
    # when fit_prior=False and class_prior=None

    for cls in [BernoulliNB, MultinomialNB]:
        clf = cls()
        clf.set_params(fit_prior=False)
        clf.fit([[0], [0], [1]], [0, 0, 1])
        prior = np.exp(clf.class_log_prior_)
        assert_array_equal(prior, np.array([.5, .5]))
Example #22
Source File: text_classification.py From intro_ds with Apache License 2.0 | 5 votes |
def trainMultinomialNB(data):
    """
    Fit the data with a multinomial Naive Bayes model.
    """
    pipe = Pipeline([("vect", CountVectorizer(token_pattern=r"(?u)\b\w+\b")),
                     ("model", MultinomialNB())])
    le = LabelEncoder()
    Y = le.fit_transform(data["label"])
    pipe.fit(data["content"], Y)
    return le, pipe
Example #23
Source File: multinomial_nb.py From lale with Apache License 2.0 | 5 votes |
def __init__(self, alpha=1.0, fit_prior=True, class_prior=None):
    self._hyperparams = {
        'alpha': alpha,
        'fit_prior': fit_prior,
        'class_prior': class_prior}
    self._wrapped_model = Op(**self._hyperparams)
Example #24
Source File: recipe_classification.py From Flavor-Network with GNU General Public License v3.0 | 5 votes |
def nb_test(X, y):
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=1)
    model = MultinomialNB()
    model.fit(X_train, y_train)
    y_pred = model.predict(X_test)
    print metrics.accuracy_score(y_test, y_pred)
Example #25
Source File: nb.py From asreview with Apache License 2.0 | 5 votes |
def __init__(self, alpha=3.822):
    """Initialize the SKLearn Naive Bayes model.

    Arguments
    ---------
    alpha: float
        Parameter to set the regularization strength of the model.
    """
    super(NBModel, self).__init__()
    self.alpha = alpha
    self._model = MultinomialNB(alpha=alpha)
    logging.debug(self._model)
Example #26
Source File: naivebayesclassifier.py From SQG with GNU General Public License v3.0 | 5 votes |
def __init__(self, model_file_path=None):
    super(NaiveBayesClassifier, self).__init__(model_file_path)
    self.pipeline = Pipeline(
        [('vect', CountVectorizer()),
         ('tf-idf', TfidfTransformer()),
         ('naive-bayes', MultinomialNB())])
    self.parameters = {'vect__ngram_range': [(1, 1), (1, 2)],
                       'tf-idf__use_idf': (True, False),
                       'naive-bayes__alpha': (1e-2, 1e-3)}
Example #27
Source File: classifier_train.py From pygameweb with BSD 2-Clause "Simplified" License | 5 votes |
def make_classifier():
    pipeline = Pipeline([
        ('count_vectorizer', CountVectorizer(ngram_range=(1, 2))),
        ('classifier', MultinomialNB())
    ])
    return pipeline
Example #28
Source File: bayes.py From opentc with MIT License | 5 votes |
def fit(self, dataset, filename):
    self.logger.debug("fit")
    self.clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', MultinomialNB())
                         ])
    self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
    joblib.dump(self.clf, filename + ".pkl", compress=9)
Example #29
Source File: main.py From Python-DevOps with MIT License | 5 votes |
def train_bayes(corpus, tokenizing=True, cleaning=True, normalizing=True,
                stem=True, vector='tfidf', split=0.2):
    multinomial, labels, vectorize = None, None, None
    if vector.lower().find('tfidf') < 0 and vector.lower().find('bow'):
        raise Exception('Invalid vectorization technique')
    if isinstance(corpus, str):
        trainset = sklearn.datasets.load_files(container_path=corpus, encoding='UTF-8')
        trainset.data, trainset.target = separate_dataset(trainset)
        data, target = trainset.data, trainset.target
        labels = trainset.target_names
    if isinstance(corpus, list) or isinstance(corpus, tuple):
        corpus = np.array(corpus)
        data, target = corpus[:, 0].tolist(), corpus[:, 1].tolist()
        labels = np.unique(target).tolist()
        target = LabelEncoder().fit_transform(target)
    c = list(zip(data, target))
    random.shuffle(c)
    data, target = zip(*c)
    data, target = list(data), list(target)
    if stem:
        for i in range(len(data)):
            data[i] = ' '.join([stemming(k) for k in data[i].split()])
    if cleaning:
        for i in range(len(data)):
            data[i] = clearstring(data[i], tokenizing)
    if vector.lower().find('tfidf') >= 0:
        vectorize = TfidfVectorizer().fit(data)
        vectors = vectorize.transform(data)
    else:
        vectorize = CountVectorizer().fit(data)
        vectors = vectorize.transform(data)
    multinomial = MultinomialNB()
    if split:
        train_X, test_X, train_Y, test_Y = train_test_split(vectors, target, test_size=split)
        multinomial.partial_fit(train_X, train_Y, classes=np.unique(target))
        predicted = multinomial.predict(test_X)
        print(metrics.classification_report(test_Y, predicted, target_names=labels))
    else:
        multinomial.partial_fit(vectors, target, classes=np.unique(target))
        predicted = multinomial.predict(vectors)
        print(metrics.classification_report(target, predicted, target_names=labels))
    return USER_BAYES(multinomial, labels, vectorize)
Example #30
Source File: simulation_exp4p.py From striatum with BSD 2-Clause "Simplified" License | 5 votes |
def train_expert(history_context, history_action):
    n_round = len(history_context)
    history_context = np.array([history_context[t] for t in range(n_round)])
    history_action = np.array([history_action[t] for t in range(n_round)])
    logreg = OneVsRestClassifier(LogisticRegression())
    mnb = OneVsRestClassifier(MultinomialNB())
    logreg.fit(history_context, history_action)
    mnb.fit(history_context, history_action)
    return [logreg, mnb]