Python sklearn.feature_extraction.text.CountVectorizer() Examples
The following are 30 code examples of sklearn.feature_extraction.text.CountVectorizer(), drawn from open-source projects. The source file and project are listed above each example. You may also want to check out the other functions and classes of the sklearn.feature_extraction.text module.
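Before the project-specific examples, here is a minimal sketch of the basic CountVectorizer workflow: fit_transform() builds the vocabulary and returns a sparse document-term matrix of raw counts. The toy corpus below is illustrative only; note that scikit-learn >= 1.0 renames get_feature_names() (used in several examples below) to get_feature_names_out().

    from sklearn.feature_extraction.text import CountVectorizer

    # Toy corpus, illustrative only
    docs = ["the cat sat on the mat", "the dog sat on the log"]

    vectorizer = CountVectorizer()
    X = vectorizer.fit_transform(docs)          # sparse matrix, shape (n_docs, n_terms)

    print(vectorizer.get_feature_names_out())   # learned vocabulary, alphabetically sorted
    print(X.toarray())                          # raw term counts per document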
Example #1
Source File: _validateSchema.py From nyoka with Apache License 2.0

    def test_validate_sklearn_sgd_with_text_cv(self):
        categories = ['alt.atheism', 'talk.religion.misc']
        data = fetch_20newsgroups(subset='train', categories=categories)
        X = data.data[:4]
        Y = data.target[:4]
        features = ['input']
        target = 'output'
        model = SGDClassifier(loss="log")
        file_name = model.__class__.__name__ + '_CountVec_.pmml'
        pipeline = Pipeline([
            ('vect', CountVectorizer()),
            ('clf', model)
        ])
        pipeline.fit(X, Y)
        skl_to_pmml(pipeline, features, target, file_name)
        self.assertEqual(self.schema.is_valid(file_name), True)
Example #2
Source File: feature.py From text-classifier with Apache License 2.0

    def tf_word_feature(self, data_set):
        """
        Get TF feature by word
        :param data_set:
        :return:
        """
        data_set = get_word_segment_data(data_set)
        if self.is_infer:
            self.vectorizer = load_pkl(self.feature_vec_path)
            data_feature = self.vectorizer.transform(data_set)
        else:
            self.vectorizer = CountVectorizer(vocabulary=self.word_vocab)
            data_feature = self.vectorizer.fit_transform(data_set)
        vocab = self.vectorizer.vocabulary_
        logger.info('Vocab size:%d' % len(vocab))
        feature_names = self.vectorizer.get_feature_names()
        logger.info('feature_names:%s' % feature_names[:20])
        logger.info(data_feature.shape)
        if not self.is_infer:
            save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
        return data_feature
Example #3
Source File: LDA_Analysis.py From Spider with MIT License

    def word2vec(word_list, n_features=1000, topics=5):
        tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                        max_features=n_features,
                                        # stop_words='english',
                                        max_df=0.5,
                                        min_df=10)
        tf = tf_vectorizer.fit_transform(word_list)
        lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                        learning_method='batch',  # for a small corpus used only for learning, 'batch' works better and leaves fewer parameters to tune
                                        )
        # Train the model with variational Bayes
        lda.fit(tf)
        # Output the keyword list of each topic in turn
        tf_feature_names = tf_vectorizer.get_feature_names()
        return lda, tf, tf_feature_names, tf_vectorizer

    # Present the topics as a visualization
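The function above returns the fitted LDA model and the feature names but leaves printing the per-topic keyword lists to the caller. A minimal sketch of that step, assuming the lda and tf_feature_names values returned by word2vec above (n_top_words is an arbitrary choice):

    n_top_words = 10
    for topic_idx, topic in enumerate(lda.components_):
        # Indices of the highest-weighted terms for this topic
        top_indices = topic.argsort()[:-n_top_words - 1:-1]
        print("Topic #%d: %s" % (topic_idx, " ".join(tf_feature_names[i] for i in top_indices)))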
Example #4
Source File: ngram_featurizer.py From metal with Apache License 2.0

    def __init__(
        self,
        anonymize=True,
        trim_window=5,
        lowercase=True,
        drop_stopwords=True,
        stem=True,
        ngram_range=(1, 3),
        **vectorizer_kwargs,
    ):
        self.anonymize = anonymize
        self.lowercase = lowercase
        self.drop_stopwords = drop_stopwords
        if drop_stopwords:
            nltk.download("stopwords")
            self.stopwords = set(nltk.corpus.stopwords.words("english"))
        self.trim_window = trim_window
        self.stem = stem
        if stem:
            self.porter = nltk.PorterStemmer()

        self.vectorizer = CountVectorizer(
            ngram_range=ngram_range, binary=True, **vectorizer_kwargs
        )
Example #5
Source File: test_termDocMatrixFromScikit.py From scattertext with Apache License 2.0

    def test_build(self):
        from sklearn.feature_extraction.text import CountVectorizer
        categories, docs = get_docs_categories_semiotic()
        idx_store = IndexStore()
        y = np.array([idx_store.getidx(c) for c in categories])
        count_vectorizer = CountVectorizer()
        X_counts = count_vectorizer.fit_transform(docs)
        term_doc_mat = TermDocMatrixFromScikit(
            X=X_counts,
            y=y,
            feature_vocabulary=count_vectorizer.vocabulary_,
            category_names=idx_store.values()).build()
        self.assertEqual(term_doc_mat.get_categories()[:2],
                         ['hamlet', 'jay-z/r. kelly'])
        self.assertEqual(term_doc_mat
                         .get_term_freq_df()
                         .assign(score=term_doc_mat.get_scaled_f_scores('hamlet'))
                         .sort_values(by='score', ascending=False).index.tolist()[:5],
                         ['that', 'march', 'did', 'majesty', 'sometimes'])
Example #6
Source File: test_corpusFromScikit.py From scattertext with Apache License 2.0

    def _te_ss_t_build(self):
        from sklearn.datasets import fetch_20newsgroups
        from sklearn.feature_extraction.text import CountVectorizer
        newsgroups_train = fetch_20newsgroups(subset='train',
                                              remove=('headers', 'footers', 'quotes'))
        count_vectorizer = CountVectorizer()
        X_counts = count_vectorizer.fit_transform(newsgroups_train.data)
        corpus = CorpusFromScikit(
            X=X_counts,
            y=newsgroups_train.target,
            feature_vocabulary=count_vectorizer.vocabulary_,
            category_names=newsgroups_train.target_names,
            raw_texts=newsgroups_train.data
        ).build()
        self.assertEqual(corpus.get_categories()[:2], ['alt.atheism', 'comp.graphics'])
        self.assertEqual(corpus
                         .get_term_freq_df()
                         .assign(score=corpus.get_scaled_f_scores('alt.atheism'))
                         .sort_values(by='score', ascending=False).index.tolist()[:5],
                         ['atheism', 'atheists', 'islam', 'atheist', 'belief'])
        self.assertGreater(len(corpus.get_texts()[0]), 5)
Example #7
Source File: testScoreWithAdapaLgbm.py From nyoka with Apache License 2.0

    def test_04_lgbm_regressor(self):
        print("\ntest 04 (lgbm regressor with preprocessing)\n")
        auto = pd.read_csv('nyoka/tests/auto-mpg.csv')
        X = auto.drop(['mpg'], axis=1)
        y = auto['mpg']
        feature_names = [name for name in auto.columns if name not in ('mpg',)]
        target_name = 'mpg'
        x_train, x_test, y_train, y_test = train_test_split(X, y, test_size=0.33, random_state=101)
        pd.DataFrame(data=x_test, columns=feature_names).to_csv("test.csv", index=False)
        pipeline_obj = Pipeline([
            ('mapper', DataFrameMapper([
                ('car name', CountVectorizer()),
                (['displacement'], [StandardScaler()])
            ])),
            ('lgbmr', LGBMRegressor())
        ])
        pipeline_obj.fit(x_train, y_train)
        file_name = "test04lgbm.pmml"
        lgb_to_pmml(pipeline_obj, feature_names, 'mpg', file_name)
        model_name = self.adapa_utility.upload_to_zserver(file_name)
        predictions, _ = self.adapa_utility.score_in_zserver(model_name, "test.csv")
        predictions = numpy.array(predictions)
        model_pred = pipeline_obj.predict(x_test)
        self.assertEqual(self.adapa_utility.compare_predictions(predictions, model_pred), True)
Example #8
Source File: test_pmi_w2l.py From chowmein with MIT License

    def test_from_texts():
        cal = PMICalculator(doc2word_vectorizer=CountVectorizer(min_df=0),
                            doc2label_vectorizer=LabelCountVectorizer())
        actual = cal.from_texts(docs, labels)
        assert_equal(actual.shape[1], 4)
        assert_equal(actual.shape[0], 9)
        assert_equal(cal.index2word_,
                     {0: u'information',
                      1: u'language',
                      2: u'learning',
                      3: u'machine',
                      4: u'mining',
                      5: u'natural',
                      6: u'processing',
                      7: u'retrieval',
                      8: u'text'})
        assert_equal(cal.index2label_,
                     {0: 'information retrieval'.split(),
                      1: 'machine learning'.split(),
                      2: 'natural language processing'.split(),
                      3: 'text mining'.split()})
Example #9
Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0

    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Build SKLearn vectorization pipeline for this field.
        This is used in field-based machine learning when we calculate value of one field
        based on the values of other fields of this document.
        We are able to detect only choice fields this way at the moment.

        To reach this we need to build a feature vector of all dependencies of the field being detected.
        This feature vector is built as a union of feature vectors of each dependency.

        See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)

        :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                           2. List of str feature names or a function returning list of str feature names.
        """
        vect = CountVectorizer(strip_accents='unicode',
                               analyzer='word',
                               stop_words=self._build_stop_words())
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect)
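The returned steps are meant to be assembled into a Pipeline elsewhere (the docstring points to FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline). A rough sketch of how that return value could be consumed; field_type, the input values, and the direct call to the feature-name helper are placeholders for illustration, not part of the project code:

    from sklearn.pipeline import Pipeline

    # field_type is a hypothetical instance exposing build_vectorization_pipeline()
    steps, get_feature_names = field_type.build_vectorization_pipeline()
    pipeline = Pipeline(steps)

    # The 'clean' step replaces None values, so None entries are acceptable here
    features = pipeline.fit_transform(["example value", None, "another value"])
    print(get_feature_names())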
Example #10
Source File: data_loader.py From PathCon with MIT License

    def read_relations(file_name):
        bow = []
        count_vec = CountVectorizer()

        d = {}
        file = open(file_name)
        for line in file:
            index, name = line.strip().split('\t')
            d[name] = int(index)

            if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
                tokens = re.findall('[a-z]{2,}', name)
                bow.append(' '.join(tokens))
        file.close()

        if args.feature_type == 'bow' and not os.path.exists('../data/' + args.dataset + '/bow.npy'):
            bow = count_vec.fit_transform(bow)
            np.save('../data/' + args.dataset + '/bow.npy', bow.toarray())

        return d
Example #11
Source File: stop.py From cltk with MIT License

    def __init__(self, language=None):
        """
        Initialize stoplist builder with option for language specific parameters
        :type language: str
        :param language : text from which to build the stoplist
        """
        if language:
            self.language = language.lower()
        self.numpy_installed = True  # Write utility for common import traps?
        self.sklearn_installed = True

        try:
            import numpy as np
            self.np = np
        except ImportError:
            self.numpy_installed = False

        try:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            # self.vectorizer = CountVectorizer(input='content')  # Set df?
            # self.tfidf_vectorizer = TfidfVectorizer()
        except ImportError:
            self.sklearn_installed = False
Example #12
Source File: common_utils.py From interpret-text with MIT License

    def create_logistic_vectorizer():
        vectorizer = CountVectorizer(lowercase=False, min_df=0.0, binary=True)
        lr = LogisticRegression(random_state=777)
        return Pipeline([("vectorizer", vectorizer), ("lr", lr)])
Example #13
Source File: feature_extraction.py From fanci with GNU General Public License v3.0

    def _n_grams():
        """
        Calculates various statistical features over the 1-, 2- and 3-grams of the suffix and dot free domain
        :return:
        """
        global __unigram
        feature = []
        for i in range(1, 4):
            ngram_vectorizer = CountVectorizer(analyzer='char', ngram_range=(i, i))
            counts = ngram_vectorizer.build_analyzer()(__joined_dot_split_suffix_free)
            npa = numpy.array(list(Counter(counts).values()), dtype=int)
            if i == 1:
                __unigram = npa

            feature += __stats_over_n_grams(npa)

        return feature
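The build_analyzer() call used above is less common than fit_transform(): it returns the vectorizer's tokenization callable without building a vocabulary, which is convenient for counting n-grams directly. A quick illustration with a character bigram analyzer:

    from sklearn.feature_extraction.text import CountVectorizer

    analyzer = CountVectorizer(analyzer='char', ngram_range=(2, 2)).build_analyzer()
    print(analyzer("example"))   # ['ex', 'xa', 'am', 'mp', 'pl', 'le']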
Example #14
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32)
Example #15
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32)
Example #16
Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0

    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        count_vectorizer = CountVectorizer(strip_accents='unicode',
                                           analyzer='word',
                                           stop_words=self._build_stop_words(),
                                           preprocessor=vectorizers.set_items_as_tokens_preprocessor,
                                           tokenizer=vectorizers.set_items_as_tokens)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', count_vectorizer),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(count_vectorizer)
Example #17
Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0

    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        count_vectorizer = CountVectorizer(strip_accents='unicode',
                                           analyzer='word',
                                           stop_words=self._build_stop_words(),
                                           preprocessor=vectorizers.set_items_as_tokens_preprocessor,
                                           tokenizer=vectorizers.set_items_as_tokens)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', count_vectorizer),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(count_vectorizer)
Example #18
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32)
Example #19
Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0

    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        vect = CountVectorizer(strip_accents='unicode',
                               analyzer='word',
                               stop_words=self._build_stop_words(),
                               tokenizer=vectorizers.whole_value_as_token)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect)
Example #20
Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0

    def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        vect = CountVectorizer(strip_accents='unicode',
                               analyzer='word',
                               stop_words=self._build_stop_words(),
                               tokenizer=vectorizers.whole_value_as_token)
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect)
Example #21
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32)
Example #22
Source File: stops.py From cltk with MIT License

    def __init__(self):
        BaseCorpusStoplist.__init__(self)
        self.punctuation = punctuation
        if self.numpy_installed and self.sklearn_installed:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(input='content')
Example #23
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32)
Example #24
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=1, dtype=np.int32)
Example #25
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=1)
Example #26
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

    def vectorizer_factory(self):
        return CountVectorizer(min_df=2)
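Examples #25 and #26 differ only in min_df, which drops terms that appear in fewer documents than the threshold. A quick illustration on a toy corpus (illustrative only):

    from sklearn.feature_extraction.text import CountVectorizer

    docs = ["red blue", "red green", "red yellow"]

    print(sorted(CountVectorizer(min_df=1).fit(docs).vocabulary_))  # ['blue', 'green', 'red', 'yellow']
    print(sorted(CountVectorizer(min_df=2).fit(docs).vocabulary_))  # ['red'] - only terms in >= 2 documents survive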
Example #27
Source File: chi2_analyzer.py From assistant-dialog-skill-analysis with Apache License 2.0

    def _preprocess_chi2(workspace_pd):
        """
        Preprocess dataframe for chi2 analysis

        :param workspace_pd: Preprocess dataframe for chi2
        :return labels: intents processed
        :return count_vectorizer: vectorizer instance
        :return features: features from transform
        """
        stopword_list = skills_util.STOP_WORDS

        workspace_pd["utterance_punc_stripped"] = workspace_pd["utterance"].apply(
            strip_punctuations
        )

        count_vectorizer = CountVectorizer(
            min_df=1,
            encoding="utf-8",
            ngram_range=(1, 2),
            stop_words=stopword_list,
            tokenizer=word_tokenize,
            token_pattern=r"(?u)\b\w+\b",
        )
        features = count_vectorizer.fit_transform(
            workspace_pd["utterance_punc_stripped"]
        ).toarray()
        labels = workspace_pd["intent"]
        return labels, count_vectorizer, features
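The function above only prepares the inputs; the chi-squared scoring itself happens downstream. A hedged sketch of a typical follow-up using sklearn.feature_selection.chi2 on the returned values ("some_intent" is a placeholder intent name, not from the project):

    import numpy as np
    from sklearn.feature_selection import chi2

    # labels, count_vectorizer, features as returned by _preprocess_chi2 above
    scores, _ = chi2(features, labels == "some_intent")
    feature_names = np.array(count_vectorizer.get_feature_names())
    print(feature_names[np.argsort(scores)[-10:]])   # 10 n-grams most associated with the intent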
Example #28
Source File: classical_chinese.py From cltk with MIT License

    def __init__(self, language='classical_chinese'):
        BaseCorpusStoplist.__init__(self, language)
        self.punctuation = '。,;?:!、《》'
        if self.numpy_installed and self.sklearn_installed:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(analyzer='char', input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(analyzer='char', input='content')
Example #29
Source File: latin.py From cltk with MIT License

    def __init__(self, language='latin'):
        BaseCorpusStoplist.__init__(self, language)
        self.punctuation = punctuation
        if self.numpy_installed and self.sklearn_installed:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(input='content')

    # Reference lists
Example #30
Source File: stops.py From cltk with MIT License

    def __init__(self,):
        BaseCorpusStoplist.__init__(self)
        self.punctuation = punctuation
        if not self.numpy_installed or not self.sklearn_installed:
            print('\n\nThe Corpus-based Stoplist method requires numpy and scikit-learn for calculations. '
                  'Try installing with `pip install numpy sklearn scipy`.\n\n')
            raise ImportError
        else:
            from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
            self.vectorizer = CountVectorizer(input='content')  # Set df?
            self.tfidf_vectorizer = TfidfVectorizer(input='content')