Python sklearn.feature_extraction.text.CountVectorizer() Examples

The following are 30 code examples of sklearn.feature_extraction.text.CountVectorizer(), taken from open-source projects; each example notes its original project, source file, and license. You may also want to check out the other available functions and classes of the sklearn.feature_extraction.text module.
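As a primer before the project examples, here is a minimal, self-contained sketch of what CountVectorizer does: it learns a vocabulary from a corpus and turns each document into a sparse vector of token counts. The toy corpus is illustrative only; note that several of the older examples below call get_feature_names(), which newer scikit-learn versions (1.2+) replace with get_feature_names_out().

from sklearn.feature_extraction.text import CountVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
]

# Learn the vocabulary and build the document-term count matrix in one step.
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(corpus)       # scipy sparse matrix of shape (2, n_terms)

print(vectorizer.vocabulary_)              # term -> column index mapping
print(X.toarray())                         # dense view of the per-document counts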
Example #1
Source File: _validateSchema.py    From nyoka with Apache License 2.0
def test_validate_sklearn_sgd_with_text_cv(self):
        categories = ['alt.atheism','talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_CountVec_.pmml'
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features, target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True) 
Example #2
Source File: feature.py    From text-classifier with Apache License 2.0
def tf_word_feature(self, data_set):
        """
        Get term-frequency (TF) features by word.
        :param data_set: iterable of documents to vectorize
        :return: sparse term-frequency feature matrix
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = CountVectorizer(vocabulary=self.word_vocab)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        feature_names = self.vectorizer.get_feature_names()
        logger.info('feature_names:%s' % feature_names[:20])
        logger.info(data_feature.shape)
        if not self.is_infer:
            save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature 
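The inference branch above reloads the fitted vectorizer so that transform() yields columns in the same order seen at training time; the training branch pins the columns explicitly with vocabulary=self.word_vocab. A minimal sketch of why a fixed vocabulary matters (toy vocabulary, not from the project):

from sklearn.feature_extraction.text import CountVectorizer

# A fixed vocabulary guarantees a stable column order across runs, so features
# built at inference time line up with those the model was trained on.
vocab = ["machine", "learning", "text", "mining"]
vec = CountVectorizer(vocabulary=vocab)

X_train = vec.fit_transform(["machine learning", "text mining text"])
X_infer = vec.transform(["learning text"])   # same four columns, same order

print(X_train.toarray())   # rows: [1 1 0 0] and [0 0 2 1]
print(X_infer.toarray())   # [[0 1 1 0]]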
Example #3
Source File: LDA_Analysis.py    From Spider with MIT License
def word2vec(word_list, n_features=1000, topics=5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    #stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)

    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    learning_method='batch',  # with a modest sample size, 'batch' works well and leaves fewer parameters to tune
                                    )
    # Train the model with variational Bayes
    lda.fit(tf)

    # Vocabulary needed to report the keyword list of each topic
    tf_feature_names = tf_vectorizer.get_feature_names()

    return lda, tf, tf_feature_names, tf_vectorizer

# Present the topics as a visualization
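The comments above describe reporting the keyword list of each topic; the returned lda and tf_feature_names are exactly what that step needs. A short sketch of the usual follow-up (top_n is an illustrative choice):

def print_top_words(lda, tf_feature_names, top_n=10):
    # lda.components_ has shape (n_topics, n_features); larger values mean the
    # term is more strongly associated with that topic.
    for topic_idx, topic in enumerate(lda.components_):
        top_terms = [tf_feature_names[i] for i in topic.argsort()[:-top_n - 1:-1]]
        print("Topic #%d: %s" % (topic_idx, " ".join(top_terms)))

# Usage: lda, tf, tf_feature_names, _ = word2vec(word_list); print_top_words(lda, tf_feature_names)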
Example #4
Source File: ngram_featurizer.py    From metal with Apache License 2.0
def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        ) 
Example #5
Source File: test_termDocMatrixFromScikit.py    From scattertext with Apache License 2.0
def test_build(self):
		from sklearn.feature_extraction.text import CountVectorizer
		categories, docs = get_docs_categories_semiotic()
		idx_store = IndexStore()
		y = np.array([idx_store.getidx(c) for c in categories])
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(docs)
		term_doc_mat = TermDocMatrixFromScikit(
			X=X_counts,
			y=y,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=idx_store.values()).build()
		self.assertEqual(term_doc_mat.get_categories()[:2], ['hamlet', 'jay-z/r. kelly'])
		self.assertEqual(term_doc_mat
		                 .get_term_freq_df()
		                 .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['that', 'march', 'did', 'majesty', 'sometimes']) 
Example #6
Source File: test_corpusFromScikit.py    From scattertext with Apache License 2.0
def _te_ss_t_build(self):
		from sklearn.datasets import fetch_20newsgroups
		from sklearn.feature_extraction.text import CountVectorizer

		newsgroups_train = fetch_20newsgroups(subset='train', remove=('headers', 'footers', 'quotes'))
		count_vectorizer = CountVectorizer()
		X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
		corpus = CorpusFromScikit(
			X=X_counts,
			y=newsgroups_train.target,
			feature_vocabulary=count_vectorizer.vocabulary_,
			category_names=newsgroups_train.target_names,
			raw_texts=newsgroups_train.data
		).build()
		self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
		self.assertEqual(corpus
		                 .get_term_freq_df()
		                 .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
		                 .sort_values(by='score', ascending=False).index.tolist()[:5],
		                 ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
		self.assertGreater(len(corpus.get_texts()[0]), 5) 
Example #7
Source File: testScoreWithAdapaLgbm.py    From nyoka with Apache License 2.0
def test_04_lgbm_regressor(self):
        print("\ntest 04 (lgbm regressor with preprocessing)\n")
        auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
        X = auto.drop(['mpg'], axis=1)
        y = auto['mpg']

        feature_names = [name for name in auto.columns if name not in ('mpg',)]
        target_name='mpg'
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
        pd.DataFrame(data=x_test, columns=feature_names).to_csv("test.csv",index=False)
        pipeline_obj = Pipeline([
            ('mapper', DataFrameMapper([
                ('car name', CountVectorizer()),
                (['displacement'],[StandardScaler()]) 
            ])),
            ('lgbmr',LGBMRegressor())
        ])
        pipeline_obj.fit(x_train,y_train)
        file_name = "test04lgbm.pmml"
        lgb_to_pmml(pipeline_obj, feature_names, 'mpg', file_name)
        model_name  = self.adapa_utility.upload_to_zserver(file_name)
        predictions, _ = self.adapa_utility.score_in_zserver(model_name, "test.csv")
        predictions = numpy.array(predictions)
        model_pred = pipeline_obj.predict(x_test)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True) 
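One detail worth noting in the mapper above: CountVectorizer expects a 1-D sequence of strings, which is why the text column is selected with the scalar key 'car name' while the numeric column uses a list selector (yielding a 2-D block for StandardScaler). The same distinction applies in plain scikit-learn's ColumnTransformer; a hedged sketch on a toy frame:

import pandas as pd
from sklearn.compose import ColumnTransformer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.preprocessing import StandardScaler

df = pd.DataFrame({
    "car name": ["ford pinto", "toyota corolla"],
    "displacement": [2300.0, 1600.0],
})

pre = ColumnTransformer([
    # A string selector hands CountVectorizer a 1-D Series of documents.
    ("text", CountVectorizer(), "car name"),
    # A list selector hands StandardScaler a 2-D frame.
    ("num", StandardScaler(), ["displacement"]),
])

X = pre.fit_transform(df)
print(X.shape)   # (2, number_of_vocabulary_terms + 1)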
Example #8
Source File: test_pmi_w2l.py    From chowmein with MIT License
def test_from_texts():
    cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                        doc2label_vectorizer=LabelCountVectorizer())
    actual = cal.from_texts(docs, labels)
    assert_equal(actual.shape[1], 4)
    assert_equal(actual.shape[0], 9)
    assert_equal(cal.index2word_, {0: u'information',
                                   1: u'language',
                                   2: u'learning',
                                   3: u'machine',
                                   4: u'mining',
                                   5: u'natural',
                                   6: u'processing',
                                   7: u'retrieval',
                                   8: u'text'})
    assert_equal(cal.index2label_, {0: 'information retrieval'.split(),
                                    1: 'machine learning'.split(),
                                    2: 'natural language processing'.split(),
                                    3: 'text mining'.split()}) 
Example #9
Source File: field_types.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Build SKLearn vectorization pipeline for this field.
        This is used in field-based machine learning when we calculate value of one field based on the
        values of other fields of this document.

        We are able to detect only choice fields this way at the moment.

        To reach this we need to build a feature vector of all dependencies of the field being detected.
        This feature vector is built as a union of feature vectors of each dependency.

        See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)

        :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                           2. List of str feature names or a function returning list of str feature names.
        """

        vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                               stop_words=self._build_stop_words())
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect) 
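The method returns bare (name, transformer) steps plus a feature-name callable rather than a fitted object; the caller is expected to splice the steps into a larger Pipeline, or into a FeatureUnion across fields as the docstring describes. A minimal, hypothetical sketch of that consumption pattern, using a trivial stand-in for the project's vectorizers.ReplaceNoneTransformer:

from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.pipeline import Pipeline

class ReplaceNone(BaseEstimator, TransformerMixin):
    """Hypothetical stand-in for vectorizers.ReplaceNoneTransformer('')."""
    def __init__(self, replacement=""):
        self.replacement = replacement

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return [self.replacement if x is None else str(x) for x in X]

vect = CountVectorizer(strip_accents="unicode", analyzer="word")
steps = [("clean", ReplaceNone()), ("vect", vect), ("tfidf", TfidfTransformer())]

pipe = Pipeline(steps)
X = pipe.fit_transform(["first field value", None, "another field value"])
feature_names = vect.get_feature_names_out()   # columns of X, in order (scikit-learn 1.0+)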
Example #10
Source File: data_loader.py    From PathCon with MIT License
def read_relations(file_name):
    bow = []
    count_vec = CountVectorizer()

    d = {}
    file = open(file_name)
    for line in file:
        index, name = line.strip().split('\t')
        d[name] = int(index)

        if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
            tokens = re.findall('[a-z]{2,}', name)
            bow.append(' '.join(tokens))
    file.close()

    if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
        bow = count_vec.fit_transform(bow)
        np.save('../data/' + args.dataset + '/bow.npy', bow.toarray())

    return d 
Example #11
Source File: stop.py    From cltk with MIT License
def __init__(self, language=None):
        """ Initialize stoplist builder with option for language specific parameters
        :type language: str
        :param language : text from which to build the stoplist
        """
        if language:
            self.language = language.lower()
        self.numpy_installed = True  # Write utility for common import traps?
        self.sklearn_installed = True

        try:
            import numpy as np
            self.np = np
        except ImportError:
            self.numpy_installed = False

        try:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            # self.vectorizer = CountVectorizer(input='content') # Set df?
            # self.tfidf_vectorizer = TfidfVectorizer()
        except ImportError:
            self.sklearn_installed = False 
Example #12
Source File: common_utils.py    From interpret-text with MIT License
def create_logistic_vectorizer():
    vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
    lr = LogisticRegression(random_state=777)
    return Pipeline([("vectorizer", vectorizer), ("lr", lr)]) 
Example #13
Source File: feature_extraction.py    From fanci with GNU General Public License v3.0
def _n_grams():
    """
    Calculates various statistical features over the 1-, 2-, and 3-grams of the suffix- and dot-free domain
    :return: list of statistical n-gram features
    """
    global __unigram
    feature = []

    for i in range(1,4):
        ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(i, i))
        counts = ngram_vectorizer.build_analyzer()(__joined_dot_split_suffix_free)
        npa = numpy.array(list(Counter(counts).values()), dtype=int)
        if i == 1:
            __unigram = npa

        feature += __stats_over_n_grams(npa)

    return feature 
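The loop above leans on a less common corner of the API: build_analyzer() returns the character n-gram splitter without fitting a corpus, so the n-grams of a single domain string can be counted directly with Counter. A short sketch of the idea on a made-up domain (the mean/std stats merely stand in for whatever __stats_over_n_grams computes):

from collections import Counter

import numpy
from sklearn.feature_extraction.text import CountVectorizer

domain = "exampledomain"   # suffix- and dot-free domain, as in the feature above

for n in range(1, 4):
    # build_analyzer() returns the raw char n-gram splitter; no corpus fit needed.
    analyzer = CountVectorizer(analyzer="char", ngram_range=(n, n)).build_analyzer()
    counts = numpy.array(list(Counter(analyzer(domain)).values()), dtype=int)
    print(n, counts.mean(), counts.std())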
Example #14
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32) 
Example #15
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32) 
Example #16
Source File: field_types.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        count_vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word',
                                           stop_words=self._build_stop_words(),
                                           preprocessor=vectorizers.set_items_as_tokens_preprocessor,
                                           tokenizer=vectorizers.set_items_as_tokens)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', count_vectorizer),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(count_vectorizer) 
Example #17
Source File: field_types.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        count_vectorizer = CountVectorizer(strip_accents='unicode', analyzer='word',
                                           stop_words=self._build_stop_words(),
                                           preprocessor=vectorizers.set_items_as_tokens_preprocessor,
                                           tokenizer=vectorizers.set_items_as_tokens)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', count_vectorizer),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(count_vectorizer) 
Example #18
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32) 
Example #19
Source File: field_types.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                               stop_words=self._build_stop_words(), tokenizer=vectorizers.whole_value_as_token)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect) 
Example #20
Source File: field_types.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                               stop_words=self._build_stop_words(),
                               tokenizer=vectorizers.whole_value_as_token)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect) 
Example #21
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32) 
Example #22
Source File: stops.py    From cltk with MIT License
def __init__(self):
        BaseCorpusStoplist.__init__(self)
        self.punctuation = punctuation
        if self.numpy_installed and self.sklearn_installed:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(input='content') 
Example #23
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32) 
Example #24
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32) 
Example #25
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=1) 
Example #26
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def vectorizer_factory(self):
        return CountVectorizer(min_df=2) 
Example #27
Source File: chi2_analyzer.py    From assistant-dialog-skill-analysis with Apache License 2.0
def _preprocess_chi2(workspace_pd):
    """
    Preprocess dataframe for chi2 analysis
    :param workspace_pd: dataframe of labeled utterances to preprocess for chi2
    :return labels: intents processed
    :return count_vectorizer: vectorizer instance
    :return features: features from transform
    """
    stopword_list = skills_util.STOP_WORDS

    workspace_pd["utterance_punc_stripped"] = workspace_pd["utterance"].apply(
        strip_punctuations
    )

    count_vectorizer = CountVectorizer(
        min_df=1,
        encoding="utf-8",
        ngram_range=(1, 2),
        stop_words=stopword_list,
        tokenizer=word_tokenize,
        token_pattern="(?u)\b\w+\b",
    )
    features = count_vectorizer.fit_transform(
        workspace_pd["utterance_punc_stripped"]
    ).toarray()
    labels = workspace_pd["intent"]
    return labels, count_vectorizer, features 
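Downstream, the returned labels, count_vectorizer, features triple is typically handed to scikit-learn's chi2 test to rank the unigrams and bigrams most associated with each intent. A hedged sketch of that follow-up step (not the project's exact code; get_feature_names_out assumes scikit-learn 1.0+):

import numpy as np
from sklearn.feature_selection import chi2

def top_chi2_ngrams(labels, count_vectorizer, features, intent, n=10):
    # Score every n-gram column against a one-vs-rest indicator for the intent.
    scores, _ = chi2(features, labels == intent)
    feature_names = np.array(count_vectorizer.get_feature_names_out())
    return feature_names[np.argsort(scores)[::-1][:n]]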
Example #28
Source File: classical_chinese.py    From cltk with MIT License
def __init__(self, language='classical_chinese'):
        BaseCorpusStoplist.__init__(self, language)
        self.punctuation = '。,;?:!、《》'
        if self.numpy_installed and self.sklearn_installed:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(analyzer='char', input='content') # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(analyzer='char', input='content') 
Example #29
Source File: latin.py    From cltk with MIT License
def __init__(self, language='latin'):
        BaseCorpusStoplist.__init__(self, language)
        self.punctuation = punctuation
        if self.numpy_installed and self.sklearn_installed:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(input='content')


# Reference lists 
Example #30
Source File: stops.py    From cltk with MIT License
def __init__(self,):
        BaseCorpusStoplist.__init__(self)
        self.punctuation = punctuation
        if not self.numpy_installed or not self.sklearn_installed:
            print('\n\nThe Corpus-based Stoplist method requires numpy and scikit-learn for calculations. '
                  'Try installing with `pip install numpy scikit-learn scipy`.\n\n')
            raise ImportError
        else:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(input='content')