Python sklearn.feature_extraction.text.TfidfVectorizer() Examples
The following are 30 code examples of sklearn.feature_extraction.text.TfidfVectorizer(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.feature_extraction.text, or try the search function.
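Before the project examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the typical TfidfVectorizer workflow: fit on a list of raw strings, obtain a sparse document-term matrix, and inspect the learned vocabulary. Note that get_feature_names_out() requires scikit-learn >= 1.0; older versions use get_feature_names() instead.

from sklearn.feature_extraction.text import TfidfVectorizer

corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs are pets",
]

vectorizer = TfidfVectorizer(ngram_range=(1, 2), min_df=1)
X = vectorizer.fit_transform(corpus)  # sparse matrix of shape (3, n_features)

print(X.shape)
print(vectorizer.get_feature_names_out()[:5])  # first few learned terms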
Example #1
Source File: 20newsgroup.py From OpenNE with MIT License | 9 votes |
def text_to_graph(text):
    import networkx as nx
    from sklearn.feature_extraction.text import TfidfVectorizer
    from sklearn.neighbors import kneighbors_graph

    # use tfidf to transform texts into feature vectors
    vectorizer = TfidfVectorizer()
    vectors = vectorizer.fit_transform(text)

    # build the fully-connected graph
    N = vectors.shape[0]
    mat = kneighbors_graph(vectors, N, metric='cosine', mode='distance', include_self=True)
    mat.data = 1 - mat.data  # to similarity

    g = nx.from_scipy_sparse_matrix(mat, create_using=nx.Graph())
    return g
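A hypothetical call to the function above (the document list here is made up for illustration); it returns a NetworkX graph whose edge weights are cosine similarities between the tf-idf vectors:

docs = ["machine learning with python",
        "graph embedding methods",
        "text classification with tf-idf"]
g = text_to_graph(docs)
print(g.number_of_nodes(), g.number_of_edges())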
Example #2
Source File: DataModule.py From sgd-influence with MIT License | 8 votes |
def load(self):
    categories = ['comp.sys.ibm.pc.hardware', 'comp.sys.mac.hardware']
    newsgroups_train = fetch_20newsgroups(
        subset='train', remove=('headers', 'footers', 'quotes'), categories=categories)
    newsgroups_test = fetch_20newsgroups(
        subset='test', remove=('headers', 'footers', 'quotes'), categories=categories)
    vectorizer = TfidfVectorizer(stop_words='english', min_df=0.001, max_df=0.20)
    vectors = vectorizer.fit_transform(newsgroups_train.data)
    vectors_test = vectorizer.transform(newsgroups_test.data)
    x1 = vectors
    y1 = newsgroups_train.target
    x2 = vectors_test
    y2 = newsgroups_test.target
    x = np.array(np.r_[x1.todense(), x2.todense()])
    y = np.r_[y1, y2]
    return x, y
Example #3
Source File: test_shap.py From AIX360 with Apache License 2.0 | 8 votes |
def test_ShapLinearExplainer(self):
    corpus, y = shap.datasets.imdb()
    corpus_train, corpus_test, y_train, y_test = train_test_split(corpus, y, test_size=0.2, random_state=7)

    vectorizer = TfidfVectorizer(min_df=10)
    X_train = vectorizer.fit_transform(corpus_train)
    X_test = vectorizer.transform(corpus_test)

    model = sklearn.linear_model.LogisticRegression(penalty="l1", C=0.1, solver='liblinear')
    model.fit(X_train, y_train)

    shapexplainer = LinearExplainer(model, X_train, feature_dependence="independent")
    shap_values = shapexplainer.explain_instance(X_test)
    print("Invoked Shap LinearExplainer")

# comment this test as travis runs out of resources
Example #4
Source File: tfidf.py From qb with MIT License | 7 votes |
def train(self, training_data) -> None:
    questions = training_data[0]
    answers = training_data[1]
    answer_docs = defaultdict(str)
    for q, ans in zip(questions, answers):
        text = ' '.join(q)
        answer_docs[ans] += ' ' + text

    x_array = []
    y_array = []
    for ans, doc in answer_docs.items():
        x_array.append(doc)
        y_array.append(ans)

    self.i_to_ans = {i: ans for i, ans in enumerate(y_array)}
    self.tfidf_vectorizer = TfidfVectorizer(
        ngram_range=(1, 3), min_df=2, max_df=.9
    ).fit(x_array)
    self.tfidf_matrix = self.tfidf_vectorizer.transform(x_array)
Example #5
Source File: make_handcrafted_33_features.py From wsdm19cup with MIT License | 7 votes |
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
Example #6
Source File: 04_sent.py From Building-Machine-Learning-Systems-With-Python-Second-Edition with MIT License | 6 votes |
def create_union_model(params=None):
    def preprocessor(tweet):
        tweet = tweet.lower()
        for k in emo_repl_order:
            tweet = tweet.replace(k, emo_repl[k])
        for r, repl in re_repl.items():  # `.iteritems()` in the original Python 2 code
            tweet = re.sub(r, repl, tweet)
        return tweet.replace("-", " ").replace("_", " ")

    tfidf_ngrams = TfidfVectorizer(preprocessor=preprocessor, analyzer="word")
    ling_stats = LinguisticVectorizer()
    all_features = FeatureUnion([('ling', ling_stats), ('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('tfidf', tfidf_ngrams)])
    # all_features = FeatureUnion([('ling', ling_stats)])
    clf = MultinomialNB()
    pipeline = Pipeline([('all', all_features), ('clf', clf)])

    if params:
        pipeline.set_params(**params)

    return pipeline
Example #7
Source File: train_predict_trees_batch2.py From wsdm19cup with MIT License | 6 votes |
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
Example #8
Source File: train_predict_trees_batch1.py From wsdm19cup with MIT License | 6 votes |
def extract_bow(train_file="../data/train.csv", test_file="../data/test.csv", analyzer='char',
                ngram_range=(1, 1), stop_words=[], min_df=1, max_features=10000,
                use_idf=True, to_preprocess=True):
    """return 4 tensors of train_q1,q2 and test_q1,q2"""
    df_train = pd.read_csv(train_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df_test = pd.read_csv(test_file, usecols=['title1_zh', 'title2_zh']).fillna("")
    df = pd.DataFrame()
    df['text'] = pd.Series(df_train['title1_zh'].tolist() + df_train['title2_zh'].tolist() +
                           df_test['title1_zh'].tolist() + df_test['title2_zh'].tolist()).unique()
    if to_preprocess:
        df['text'] = df['text'].map(lambda x: preprocess(x))
        df_train['title1_zh'] = df_train['title1_zh'].apply(preprocess)
        df_train['title2_zh'] = df_train['title2_zh'].apply(preprocess)
        df_test['title1_zh'] = df_test['title1_zh'].apply(preprocess)
        df_test['title2_zh'] = df_test['title2_zh'].apply(preprocess)
    if analyzer == 'char':
        vect = TfidfVectorizer(analyzer=analyzer, ngram_range=ngram_range, stop_words=stop_words,
                               min_df=min_df, max_features=max_features, use_idf=use_idf)
    else:
        vect = TfidfVectorizer(analyzer=analyzer, tokenizer=jieba.cut, ngram_range=ngram_range,
                               stop_words=stop_words, min_df=min_df, max_features=max_features,
                               use_idf=use_idf)
    vect.fit(df["text"].tolist())
    return (vect.transform(df_train.title1_zh), vect.transform(df_train.title2_zh),
            vect.transform(df_test.title1_zh), vect.transform(df_test.title2_zh), vect)
Example #9
Source File: preprocess.py From cn-text-classifier with GNU General Public License v3.0 | 6 votes |
def extract_characters(sentences: list, dimension: int):
    """
    vectorize the input sentences
    :param sentences: list
    :param dimension: int
    :return: weight, training_data
    """
    print("Vectorizing...")
    # build the term matrix a[i][j]: frequency of word j in document i
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
    # vectorizer = CountVectorizer()
    # collect tf-idf weights
    transformer = TfidfTransformer()
    # the inner call builds the matrix, the outer transform computes tf-idf
    tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences))
    # get all words in the bag of words
    words_bag = vectorizer.get_feature_names()
    # weight[i][j] is the weight of word j in document i
    weight = tfidf.toarray()
    print('Features length: ' + str(len(words_bag)))
    pca = PCA(n_components=dimension)
    training_data = pca.fit_transform(weight)
    return weight, training_data
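A side note on the example above: in scikit-learn, TfidfVectorizer is documented as equivalent to CountVectorizer followed by TfidfTransformer, so chaining a TfidfTransformer after a TfidfVectorizer re-normalizes vectors that are already tf-idf weighted. A minimal sketch of the equivalence, on a made-up corpus:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer, TfidfVectorizer
from sklearn.pipeline import make_pipeline

corpus = ["one small document", "another small document"]
pipe = make_pipeline(CountVectorizer(), TfidfTransformer())
a = pipe.fit_transform(corpus)               # counts -> tf-idf
b = TfidfVectorizer().fit_transform(corpus)  # same result in one step
print(np.allclose(a.toarray(), b.toarray()))  # True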
Example #10
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_vectorizer_stop_words_inconsistent():
    lstr = "['and', 'll', 've']"
    message = ('Your stop_words may be inconsistent with your '
               'preprocessing. Tokenizing the stop words generated '
               'tokens %s not in stop_words.' % lstr)
    for vec in [CountVectorizer(), TfidfVectorizer(), HashingVectorizer()]:
        vec.set_params(stop_words=["you've", "you", "you'll", 'AND'])
        assert_warns_message(UserWarning, message, vec.fit_transform,
                             ['hello world'])
        # reset stop word validation
        del vec._stop_words_id
        assert _check_stop_words_consistency(vec) is False

    # Only one warning per stop list
    assert_no_warnings(vec.fit_transform, ['hello world'])
    assert _check_stop_words_consistency(vec) is None

    # Test caching of inconsistency assessment
    vec.set_params(stop_words=["you've", "you", "you'll", 'blah', 'AND'])
    assert_warns_message(UserWarning, message, vec.fit_transform,
                         ['hello world'])
Example #11
Source File: language_detector.py From text-mining-class with MIT License | 6 votes |
def build_language_classifier(texts, labels, verbose=False, random_state=None):
    """Train a text classifier with scikit-learn

    The text classifier is composed of two elements assembled in a pipeline:

    - A text feature extractor (`TfidfVectorizer`) that extracts the relative
      frequencies of unigrams, bigrams and trigrams of characters in the text.

    - An instance of `SGDClassifier` for the classification itself.

    To speed up training it is recommended to enable early stopping.

    `random_state` is passed to the underlying `SGDClassifier` instance.
    """
    language_classifier = make_pipeline(
        TfidfVectorizer(analyzer="char", ngram_range=(1, 3),
                        min_df=2, max_df=0.9, norm="l2", dtype=np.float32),
        SGDClassifier(early_stopping=True, validation_fraction=0.2,
                      n_iter_no_change=3, max_iter=1000, tol=1e-3,
                      alpha=1e-5, penalty="l2", verbose=verbose,
                      random_state=random_state)
    )
    return language_classifier.fit(texts, labels)
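A hedged usage sketch of the function above; `texts` and `labels` are placeholders for your own data, and the early-stopping validation split assumes a reasonably large labeled corpus:

# `texts` is a list of raw strings, `labels` the matching language codes.
clf = build_language_classifier(texts, labels, verbose=False, random_state=0)
print(clf.predict(["this is an english sentence"]))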
Example #12
Source File: feature.py From text-classifier with Apache License 2.0 | 6 votes |
def tfidf_word_feature(self, data_set):
    """
    Get TFIDF ngram feature by word
    :param data_set:
    :return:
    """
    data_set = get_word_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = TfidfVectorizer(analyzer='word', vocabulary=self.word_vocab, sublinear_tf=True)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    logger.info(data_feature.shape)
    # if not self.is_infer:
    save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
Example #13
Source File: feature.py From text-classifier with Apache License 2.0 | 6 votes |
def tfidf_char_feature(self, data_set):
    """
    Get TFIDF feature by char
    :param data_set:
    :return:
    """
    data_set = get_char_segment_data(data_set)
    if self.is_infer:
        self.vectorizer = load_pkl(self.feature_vec_path)
        data_feature = self.vectorizer.transform(data_set)
    else:
        self.vectorizer = TfidfVectorizer(analyzer='char', ngram_range=(1, 2), sublinear_tf=True)
        data_feature = self.vectorizer.fit_transform(data_set)
    vocab = self.vectorizer.vocabulary_
    logger.info('Vocab size:%d' % len(vocab))
    logger.info(data_feature.shape)
    if not self.is_infer:
        save_pkl(self.vectorizer, self.feature_vec_path, overwrite=True)
    return data_feature
Example #14
Source File: cluster.py From text-classifier with Apache License 2.0 | 6 votes |
def feature(feature_file_path):
    if not os.path.exists(feature_file_path):
        stopwords = read_words(stop_words_path)
        word_set, docs = segment(input_file_path, stopwords=stopwords)
        tfidf_vectorizer = TfidfVectorizer(max_df=0.9, min_df=0.1, analyzer='word',
                                           ngram_range=(1, 2), vocabulary=list(word_set))
        # fit the vectorizer to synopses
        feature_matrix = tfidf_vectorizer.fit_transform(docs)
        # terms is just a set of the features used in the tf-idf matrix; this is the vocabulary
        terms = tfidf_vectorizer.get_feature_names()  # e.g. 258 terms
        print('vocab name size:%s' % len(terms))
        print(terms[:10])
        with open(feature_file_path, 'wb') as f:
            pickle.dump(feature_matrix, f)
    else:
        with open(feature_file_path, "rb") as f:
            feature_matrix = pickle.load(f)
    print(feature_matrix.shape)  # e.g. (10, 258): 10 documents, 258 features
    return feature_matrix
Example #15
Source File: textrank.py From nlg-yongzhuo with MIT License | 6 votes |
def tdidf_sim(sentences):
    """
    tf-idf similarity
    :param sentences:
    :return:
    """
    # compute tf-idf
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2),  # could also be (3, 5)
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,
                            smooth_idf=1,  # smoothing
                            sublinear_tf=1)
    matrix = model.fit_transform(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm
Example #16
Source File: textrank_sklearn.py From nlg-yongzhuo with MIT License | 6 votes |
def tdidf_sim(sentences):
    """
    tf-idf similarity
    :param sentences:
    :return:
    """
    # compute tf-idf
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2),  # could also be (3, 5)
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,
                            smooth_idf=1,  # smoothing
                            sublinear_tf=1)
    matrix = model.fit_transform(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm
Example #17
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def _init_word_ngram_tfidf(self, ngram, vocabulary=None):
    tfidf = TfidfVectorizer(min_df=3,
                            max_df=0.75,
                            max_features=None,
                            norm="l2",
                            strip_accents="unicode",
                            analyzer="word",
                            token_pattern=r"\w{1,}",
                            ngram_range=(1, ngram),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            # stop_words="english",
                            vocabulary=vocabulary)
    return tfidf

## char based
Example #18
Source File: topic_nmf.py From nlg-yongzhuo with MIT License | 6 votes |
def tfidf_fit(sentences):
    """
    tf-idf similarity
    :param sentences:
    :return:
    """
    # compute tf-idf
    model = TfidfVectorizer(ngram_range=(1, 2),  # could also be (3, 5)
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,
                            smooth_idf=1,  # smoothing
                            sublinear_tf=1)
    matrix = model.fit_transform(sentences)
    return matrix
Example #19
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def _init_char_ngram_tfidf(self, ngram, vocabulary=None):
    tfidf = TfidfVectorizer(min_df=3,
                            max_df=0.75,
                            max_features=None,
                            norm="l2",
                            strip_accents="unicode",
                            analyzer="char",
                            token_pattern=r"\w{1,}",
                            ngram_range=(1, ngram),
                            use_idf=1,
                            smooth_idf=1,
                            sublinear_tf=1,
                            # stop_words="english",
                            vocabulary=vocabulary)
    return tfidf

# ------------------------ LSA -------------------------------
Example #20
Source File: stop.py From cltk with MIT License | 6 votes |
def __init__(self, language=None):
    """Initialize stoplist builder with option for language-specific parameters

    :type language: str
    :param language: text from which to build the stoplist
    """
    if language:
        self.language = language.lower()
    self.numpy_installed = True  # Write utility for common import traps?
    self.sklearn_installed = True

    try:
        import numpy as np
        self.np = np
    except ImportError:
        self.numpy_installed = False

    try:
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        # self.vectorizer = CountVectorizer(input='content')  # Set df?
        # self.tfidf_vectorizer = TfidfVectorizer()
    except ImportError:
        self.sklearn_installed = False
Example #21
Source File: text_preprocess.py From nlg-yongzhuo with MIT License | 6 votes |
def tfidf_fit(sentences):
    """
    tf-idf similarity
    :param sentences:
    :return:
    """
    # compute tf-idf
    model = TfidfVectorizer(ngram_range=(1, 2),  # could also be (3, 5)
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # keep single-character tokens
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,
                            smooth_idf=1,  # smoothing
                            sublinear_tf=1)
    matrix = model.fit_transform(sentences)
    return matrix
Example #22
Source File: _validateSchema.py From nyoka with Apache License 2.0 | 6 votes |
def test_validate_sklearn_sgd_with_text(self):
    categories = ['alt.atheism', 'talk.religion.misc']
    data = fetch_20newsgroups(subset='train', categories=categories)
    X = data.data[:4]
    Y = data.target[:4]
    features = ['input']
    target = 'output'
    model = SGDClassifier(loss="log")
    file_name = model.__class__.__name__ + '_TfIdfVec_.pmml'
    pipeline = Pipeline([
        ('vect', TfidfVectorizer()),
        ('clf', model)
    ])
    pipeline.fit(X, Y)
    skl_to_pmml(pipeline, features, target, file_name)
    self.assertEqual(self.schema.is_valid(file_name), True)
Example #23
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_tfidfvectorizer_export_idf():
    vect = TfidfVectorizer(use_idf=True)
    vect.fit(JUNK_FOOD_DOCS)
    assert_array_almost_equal(vect.idf_, vect._tfidf.idf_)
Example #24
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_vectorizer_vocab_clone():
    vect_vocab = TfidfVectorizer(vocabulary=["the"])
    vect_vocab_clone = clone(vect_vocab)
    vect_vocab.fit(ALL_FOOD_DOCS)
    vect_vocab_clone.fit(ALL_FOOD_DOCS)
    assert_equal(vect_vocab_clone.vocabulary_, vect_vocab.vocabulary_)
Example #25
Source File: svm_classifier.py From nlp-journey with Apache License 2.0 | 5 votes |
def __select_features(data_set):
    dataset = [clean_en_text(data) for data in data_set[0]]
    tf_idf_model = TfidfVectorizer(ngram_range=(1, 1),
                                   binary=True,
                                   sublinear_tf=True)
    tf_vectors = tf_idf_model.fit_transform(dataset)

    # keep roughly the top one-sixth of the terms as features
    k = int(tf_vectors.shape[1] / 6)
    chi_model = SelectKBest(chi2, k=k)
    chi_features = chi_model.fit_transform(tf_vectors, data_set[1])
    print('tf-idf:\t\t' + str(tf_vectors.shape[1]))
    print('chi:\t\t' + str(chi_features.shape[1]))

    return chi_features, tf_idf_model, chi_model
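At inference time, the two fitted objects returned above would plausibly be applied in the same order, vectorizer first and chi2 selector second (a sketch, assuming clean_en_text is available as in the original project):

new_docs = [clean_en_text(t) for t in ["some new document to classify"]]
new_features = chi_model.transform(tf_idf_model.transform(new_docs))
print(new_features.shape)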
Example #26
Source File: text_char_tfidf_count_transformers.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def fit_transform(self, X: dt.Frame, y: np.array = None):
    X = X.to_pandas().astype(str).iloc[:, 0].fillna("NA")
    # TFIDF Vectorizer
    self.tfidf_vec = TfidfVectorizer(analyzer="char", ngram_range=(1, self.max_ngram))
    X = self.tfidf_vec.fit_transform(X)
    # Truncated SVD
    if len(self.tfidf_vec.vocabulary_) <= self.n_svd_comp:
        self.n_svd_comp = len(self.tfidf_vec.vocabulary_) - 1
    self.truncated_svd = TruncatedSVD(n_components=self.n_svd_comp, random_state=2019)
    X = self.truncated_svd.fit_transform(X)
    return X
Example #27
Source File: text_tfidf_model.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def fit(self, X, y, sample_weight=None, eval_set=None, sample_weight_eval_set=None, **kwargs):
    orig_cols = list(X.names)
    if self.num_classes >= 2:
        lb = LabelEncoder()
        lb.fit(self.labels)
        y = lb.transform(y)
        model = LogisticRegression(random_state=2019)
    else:
        model = LinearRegression()
    self.tfidf_objs = []
    new_X = None
    for col in X.names:
        XX = X[:, col].to_pandas()
        XX = XX[col].astype(str).fillna("NA").values.tolist()
        tfidf_vec = TfidfVectorizer(**self.params)
        XX = tfidf_vec.fit_transform(XX)
        self.tfidf_objs.append(tfidf_vec)
        if new_X is None:
            new_X = XX
        else:
            new_X = sp.sparse.hstack([new_X, XX])
    model.fit(new_X, y)
    model = (model, self.tfidf_objs)
    self.tfidf_objs = []
    importances = [1] * len(orig_cols)
    self.set_model_properties(model=model,
                              features=orig_cols,
                              importances=importances,
                              iterations=0)
Example #28
Source File: learn.py From partisan-discourse with Apache License 2.0 | 5 votes |
def construct_pipeline(classifier):
    """
    This function creates a feature extraction pipeline that accepts data
    from a CorpusLoader and appends the classification model to the end of
    the pipeline, returning a newly constructed Pipeline object that is
    ready to be fit and trained!
    """
    return Pipeline([
        # Create a Feature Union of Text Stats and Bag of Words
        ('union', FeatureUnion(
            transformer_list=[
                # Pipeline for pulling document structure features
                ('stats', Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer()),
                ])),
                # Pipeline for creating a bag of words TF-IDF vector
                ('bow', Pipeline([
                    ('tokens', TextNormalizer()),
                    ('tfidf', TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                    ('best', TruncatedSVD(n_components=1000)),
                ])),
            ],
            # weight components in feature union
            transformer_weights={
                'stats': 0.15,
                'bow': 0.85,
            },
        )),
        # Append the estimator to the end of the pipeline
        ('classifier', classifier),
    ])
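A hypothetical usage of construct_pipeline; LogisticRegression stands in for any scikit-learn classifier, and the training documents must be in whatever form the project's TextStats and TextNormalizer transformers expect:

from sklearn.linear_model import LogisticRegression

model = construct_pipeline(LogisticRegression())
# model.fit(train_documents, train_labels)
# predictions = model.predict(new_documents)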
Example #29
Source File: cluster-by-tags.py From TGIF-Release with BSD 3-Clause "New" or "Revised" License | 5 votes |
def main():
    import sys
    fn, tags = load_tags(sys.argv[1], sys.argv[2])
    vectorizer = TfidfVectorizer(max_df=0.95, min_df=2)
    tfidf = vectorizer.fit_transform(tags)
    cls = KMeans(init='k-means++', n_clusters=20, n_init=10)
    cls.fit(tfidf)
    for gif, l in zip(fn, cls.labels_):
        print(gif, l)  # `print gif, l` in the original Python 2 code
    pass
Example #30
Source File: stops.py From cltk with MIT License | 5 votes |
def __init__(self):
    BaseCorpusStoplist.__init__(self)
    self.punctuation = punctuation
    if not self.numpy_installed or not self.sklearn_installed:
        print('\n\nThe Corpus-based Stoplist method requires numpy and scikit-learn for calculations. '
              'Try installing with `pip install numpy sklearn scipy`.\n\n')
        raise ImportError
    else:
        from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
        self.vectorizer = CountVectorizer(input='content')  # Set df?
        self.tfidf_vectorizer = TfidfVectorizer(input='content')