Python sklearn.feature_extraction.text.TfidfTransformer() Examples
The following are 30 code examples of sklearn.feature_extraction.text.TfidfTransformer().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.feature_extraction.text, or try the search function.
Example #1
Source File: get_topic.py From poem_generator with Apache License 2.0 | 7 votes |
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words):
    """Fit an NMF topic model on tf-idf weighted counts and write the results.

    Uses the ``count_vect`` vectorizer defined outside this function; it is
    (re)fitted on ``poem_words`` here.

    :param ftopics: path of the UTF-8 text file that receives one topic per
        line, each line being the top terms joined by single spaces.
    :param fwords: path of the pickle file that receives the vocabulary list.
    :param ftopics_words: path of the pickle file that receives
        ``nmf.components_``.
    :param poem_words: documents passed to ``count_vect.fit_transform``.
    :param n_topic: number of NMF components.
    :param n_topic_words: number of highest-weighted terms written per topic.
    """
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both and keep the pickled value a list.
    if hasattr(count_vect, 'get_feature_names_out'):
        feature_names = list(count_vect.get_feature_names_out())
    else:
        feature_names = count_vect.get_feature_names()

    # Context managers guarantee the files are closed even if a write fails.
    with codecs.open(ftopics, 'w', 'utf-8') as fw:
        for topic in nmf.components_:
            # argsort is ascending; the reversed tail slice yields the
            # n_topic_words highest-weighted term indices, best first.
            top_terms = [feature_names[i]
                         for i in topic.argsort()[:-n_topic_words - 1:-1]]
            fw.write(' '.join(top_terms) + '\n')
    print('Write topics done.')

    with codecs.open(fwords, 'wb') as fw:
        pickle.dump(feature_names, fw)
    print('Write words done.')

    with codecs.open(ftopics_words, 'wb') as fw:
        pickle.dump(nmf.components_, fw)
    print('Write topic_words done.')
Example #2
Source File: TermDocMatrix.py From scattertext with Apache License 2.0 | 6 votes |
def get_logistic_regression_coefs_l2(self, category, clf=RidgeClassifierCV()):
    '''
    Fits a classifier on tf-idf features and scores it for one category.

    Parameters
    ----------
    category : str
        category name to score
    clf : classifier, optional
        estimator fitted on the tf-idf weighted term-document matrix.
        The default RidgeClassifierCV instance is created once, when the
        function is defined, and is shared by every call that omits
        this argument.

    Returns
    -------
    (coefficient array, accuracy, majority class baseline accuracy)
    '''
    # Prefer the current location of cross_val_predict; the old
    # sklearn.cross_validation module is only a fallback. Catch ImportError
    # specifically so unrelated failures are not silently swallowed.
    try:
        from sklearn.model_selection import cross_val_predict
    except ImportError:
        from sklearn.cross_validation import cross_val_predict
    y = self._get_mask_from_category(category)
    X = TfidfTransformer().fit_transform(self._X)
    clf.fit(X, y)
    # Accuracy is measured on cross-validated predictions, while the
    # returned coefficients come from the fit on all of X above.
    y_hat = cross_val_predict(clf, X, y)
    acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
    return clf.coef_[0], acc, baseline
Example #3
Source File: textrank.py From nlg-yongzhuo with MIT License | 6 votes |
def tdidf_sim(sentences):
    """
    Build a tf-idf matrix for the given sentences.

    :param sentences: iterable of raw text sentences
    :return: the matrix obtained by applying TfidfTransformer to the
             TfidfVectorizer output
    """
    # tf-idf computation, tokenizing with jieba
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2),
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",
                            min_df=1,
                            max_df=0.9,
                            # Real booleans instead of 1: recent scikit-learn
                            # releases validate parameter types.
                            use_idf=True,
                            smooth_idf=True,
                            sublinear_tf=True)
    matrix = model.fit_transform(sentences)
    # NOTE(review): the vectorizer output is already tf-idf weighted, so this
    # second pass re-weights it. Kept as in the original; confirm intent.
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm
Example #4
Source File: textrank_sklearn.py From nlg-yongzhuo with MIT License | 6 votes |
def tdidf_sim(sentences):
    """
    Build a tf-idf matrix for the given sentences.

    :param sentences: iterable of raw text sentences
    :return: the matrix obtained by applying TfidfTransformer to the
             TfidfVectorizer output
    """
    # tf-idf computation, tokenizing with jieba
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2),
                            stop_words=[' ', '\t', '\n'],  # stop words
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",
                            min_df=1,
                            max_df=0.9,
                            # Real booleans instead of 1: recent scikit-learn
                            # releases validate parameter types.
                            use_idf=True,
                            smooth_idf=True,
                            sublinear_tf=True)
    matrix = model.fit_transform(sentences)
    # NOTE(review): the vectorizer output is already tf-idf weighted, so this
    # second pass re-weights it. Kept as in the original; confirm intent.
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm
Example #5
Source File: test_termDocMatrixFactory.py From scattertext with Apache License 2.0 | 6 votes |
def test_main(self):
    """Train a classifier on a factory-built term-document matrix and run it on a new document."""
    categories, documents = get_docs_categories()

    def clean_function(text):
        # Texts starting with '[' are replaced by the empty string.
        return '' if text.startswith('[') else text

    entity_types = {'GPE'}
    factory = TermDocMatrixFactory(
        category_text_iter=zip(categories, documents),
        clean_function=clean_function,
        nlp=_testing_nlp,
        feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types),
    )
    term_doc_mat = factory.build()

    clf = PassiveAggressiveClassifier()
    doc_featurizer = FeatsFromDoc(
        term_doc_mat._term_idx_store,
        clean_function=clean_function,
        feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types),
    )
    fdc = doc_featurizer.set_nlp(_testing_nlp)

    tfidf = TfidfTransformer(norm='l1')
    X = tfidf.fit_transform(term_doc_mat._X)
    clf.fit(X, term_doc_mat._y)

    X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
    # The test only checks that both calls run; their results are unused.
    # NOTE(review): predict() gets tf-idf transformed features while
    # decision_function() gets the raw ones -- confirm this is intended.
    clf.predict(tfidf.transform(X_to_predict))
    clf.decision_function(X_to_predict)
Example #6
Source File: centroid_word_embeddings.py From text-summarizer with GNU General Public License v3.0 | 6 votes |
def get_topic_idf(self, sentences):
    """Return the vocabulary words whose scaled summed tf-idf exceeds ``self.topic_threshold``.

    :param sentences: iterable of raw sentences fed to a CountVectorizer.
    :return: list of words selected by the threshold.
    """
    vectorizer = CountVectorizer()
    sent_word_matrix = vectorizer.fit_transform(sentences)

    transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
    tfidf = transformer.fit_transform(sent_word_matrix).toarray()

    # Sum the per-sentence weights of every word, then scale so that the
    # largest entry equals 1.
    centroid_vector = tfidf.sum(0)
    centroid_vector = np.divide(centroid_vector, centroid_vector.max())

    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both.
    if hasattr(vectorizer, 'get_feature_names_out'):
        feature_names = list(vectorizer.get_feature_names_out())
    else:
        feature_names = vectorizer.get_feature_names()

    relevant_vector_indices = np.where(centroid_vector > self.topic_threshold)[0]
    word_list = list(np.array(feature_names)[relevant_vector_indices])
    return word_list
Example #7
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_tf_idf_smoothing():
    """Smoothed, l2-normalized tf-idf must be non-negative with unit-norm rows."""
    counts = [[1, 1, 1],
              [1, 1, 0],
              [1, 0, 0]]
    weights = TfidfTransformer(smooth_idf=True, norm='l2').fit_transform(counts).toarray()
    assert (weights >= 0).all()

    # check normalization: the squared entries of each row sum to one
    squared_row_sums = (weights ** 2).sum(axis=1)
    assert_array_almost_equal(squared_row_sums, [1., 1., 1.])

    # a feature that is zero in every row must not break the transform
    counts = [[1, 1, 0],
              [1, 1, 0],
              [1, 0, 0]]
    weights = TfidfTransformer(smooth_idf=True, norm='l2').fit_transform(counts).toarray()
    assert (weights >= 0).all()
Example #8
Source File: preprocess.py From cn-text-classifier with GNU General Public License v3.0 | 6 votes |
def extract_characters(sentences: list, dimension: int):
    """
    Vectorize sentences with tf-idf and reduce the result with PCA.

    :param sentences: list of documents to vectorize
    :param dimension: number of PCA components to keep
    :return: (weight, training_data): the dense tf-idf matrix and its PCA
             projection
    """
    print("Vetorizier...")
    vertorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
    transformer = TfidfTransformer()
    # The inner fit_transform builds the document-term matrix, the outer one
    # applies the tf-idf weighting.
    # NOTE(review): TfidfVectorizer output is already tf-idf weighted, so the
    # outer transform weights it a second time. Kept as in the original;
    # confirm intent.
    tfidf = transformer.fit_transform(vertorizer.fit_transform(sentences))
    # Only the vocabulary size is needed; vocabulary_ holds one entry per
    # feature and avoids get_feature_names(), removed in scikit-learn 1.2.
    n_features = len(vertorizer.vocabulary_)
    # weight[i][j] is the weight of word j in document i
    weight = tfidf.toarray()
    print('Features length:' + str(n_features))
    pca = PCA(n_components=dimension)
    training_data = pca.fit_transform(weight)
    return weight, training_data
Example #9
Source File: field_types.py From lexpredict-contraxsuite with GNU Affero General Public License v3.0 | 6 votes |
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
    """
    Assemble the scikit-learn vectorization steps for this field.

    The steps are meant to be added to a Pipeline(): replace None values
    with an empty string, count words, then apply tf-idf weighting.

    :return: Tuple of:
        1. List of (name, transformer) vectorization steps.
        2. The value returned by ``self._wrap_get_feature_names`` for the
           count vectorizer, used to obtain the feature names.
    """
    count_vectorizer = CountVectorizer(strip_accents='unicode',
                                       analyzer='word',
                                       stop_words=self._build_stop_words())
    steps = [
        ('clean', vectorizers.ReplaceNoneTransformer('')),
        ('vect', count_vectorizer),
        ('tfidf', TfidfTransformer()),
    ]
    feature_names_getter = self._wrap_get_feature_names(count_vectorizer)
    return steps, feature_names_getter
Example #10
Source File: enhance_marko.py From nlp_xiaojiang with MIT License | 6 votes |
def get_keyword_from_tf(sentences, p):
    """
    Return the trailing fraction ``p`` of the vocabulary built from ``sentences``.

    Every sentence is segmented with jieba and re-joined with spaces before
    the vocabulary is built with a CountVectorizer.

    :param sentences: list, raw sentences
    :param p: float, rate, 0 < p < 1
    :return: list, the last ``int(len(vocabulary) * p)`` vocabulary words
             (empty when that count is 0)
    """
    sentence_cut_list = [" ".join(jieba.cut(text.strip(), cut_all=False, HMM=True))
                         for text in sentences]
    # token_pattern sets what counts as a token; the scikit-learn default
    # (designed for English) would drop single-character words.
    vectorizer = CountVectorizer(token_pattern=r'\b\w+\b')
    vectorizer.fit_transform(sentence_cut_list)
    # scikit-learn 1.0 added get_feature_names_out() and 1.2 removed
    # get_feature_names(); support both and always work on a list.
    if hasattr(vectorizer, 'get_feature_names_out'):
        word = list(vectorizer.get_feature_names_out())
    else:
        word = vectorizer.get_feature_names()
    keep = int(len(word) * p)
    if keep == 0:
        # word[-0:] is the whole list, not an empty one.
        return []
    return word[-keep:]
Example #11
Source File: utils.py From GGP with Apache License 2.0 | 6 votes |
def load_data_ssl(data_name):
    """Load a dataset via ``load_data`` and reshape it into index/label column vectors.

    :param data_name: dataset name forwarded to ``load_data``; tf-idf is
        applied to the node features unless it equals 'pubmed'
        (case-insensitive).
    :return: (adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test)
    """
    (adj_csr, features, y_train, y_val, y_test,
     train_mask, val_mask, test_mask) = load_data(data_name)
    adj_mat = np.asarray(adj_csr.toarray(), dtype=np_float_type)

    def selected_positions(mask):
        # Column vector holding the positions picked by the mask.
        return np.reshape(np.arange(len(mask))[mask], (-1, 1))

    def selected_labels(labels, mask):
        # Keep the masked rows, then collapse each row to
        # sum(column_index * value) -- presumably rows are one-hot, which
        # makes this the class index; verify against load_data.
        rows = np.asarray(labels[mask], dtype=np.int32)
        column_ids = np.tile(np.arange(rows.shape[1]), (np.sum(mask), 1))
        return np.reshape(np.sum(column_ids * rows, axis=1), (-1, 1))

    x_tr = selected_positions(train_mask)
    x_val = selected_positions(val_mask)
    x_test = selected_positions(test_mask)

    y_tr = selected_labels(y_train, train_mask)
    y_val = selected_labels(y_val, val_mask)
    y_test = selected_labels(y_test, test_mask)

    node_features = features.toarray()
    if data_name.lower() != 'pubmed':  # pubmed already comes with tf-idf
        node_features = TfidfTransformer(smooth_idf=True).fit_transform(node_features).toarray()
    return adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test
Example #12
Source File: test_explanation_model.py From cxplain with MIT License | 6 votes |
def test_nlp_not_padded_invalid(self):
    """Fitting the explainer on the dataset without padding must raise ValueError."""
    num_words = 1024
    (x_train, y_train), (_, _) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

    # Model to explain: counts -> tf-idf -> random forest.
    forest = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    explained_model = Pipeline([
        ('counts', CountVectoriser(num_words)),
        ('tfidf', TfidfTransformer()),
        ('model', forest),
    ])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(
        embedding_size=num_words, with_embedding=True,
        num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
        verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
        early_stopping_patience=128,
    )
    explainer = CXPlain(explained_model, model_builder, WordDropMasking(), binary_crossentropy)

    with self.assertRaises(ValueError):
        explainer.fit(x_train, y_train)
Example #13
Source File: datasets.py From ann-benchmarks with MIT License | 6 votes |
def transform_bag_of_words(filename, n_dimensions, out_fn):
    """Turn a gzipped bag-of-words file into a tf-idf weighted, randomly projected dataset.

    The first two lines of the file give the number of documents and the
    number of words, the third line is skipped, and every remaining line is
    a ``doc word count`` triple with 1-based ids.

    :param filename: path of the gzipped input file.
    :param n_dimensions: number of components of the Gaussian random projection.
    :param out_fn: output target forwarded to ``write_output``.
    """
    import gzip
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection

    with gzip.open(filename, 'rb') as f:
        lines = f.readlines()
    n_docs, n_words = int(lines[0]), int(lines[1])
    records = lines[3:]  # strip first three entries

    print("building matrix...")
    A = lil_matrix((n_docs, n_words))
    for record in records:
        doc, word, cnt = map(int, record.strip().split())
        # ids in the file are 1-based
        A[doc - 1, word - 1] = cnt

    print("normalizing matrix entries with tfidf...")
    B = TfidfTransformer().fit_transform(A)

    print("reducing dimensionality...")
    projector = random_projection.GaussianRandomProjection(n_components=n_dimensions)
    C = projector.fit_transform(B)

    X_train, X_test = train_test_split(C)
    write_output(numpy.array(X_train), numpy.array(X_test), out_fn, 'angular')
Example #14
Source File: relevance.py From PointerSQL with MIT License | 6 votes |
def question_classifier(data):
    """Train a text classifier that predicts the SQL type of a question.

    :param data: sequence of records where item 1 is the question text and
        item 2 is a string whose second space-separated token is used as the
        SQL type label.
    :return: (sql_type_to_indices, text_clf): a dict mapping each SQL type to
        the list of record positions having that type, and the fitted
        pipeline.
    """
    questions = [record[1] for record in data]
    sql_type = [record[2].split(' ')[1] for record in data]
    sql_type_set = set(sql_type)
    sql_classes = {name: class_id for class_id, name in enumerate(sql_type_set)}
    target = np.array([sql_classes[name] for name in sql_type])

    # Single pass over the labels instead of rescanning them once per type.
    sql_type_to_indices = {name: [] for name in sql_type_set}
    for idx, name in enumerate(sql_type):
        sql_type_to_indices[name].append(idx)

    # Build classifier
    # TODO better ones
    text_clf = Pipeline([('vect', CountVectorizer()),
                         ('tfidf', TfidfTransformer()),
                         ('clf', svm.LinearSVC())])
    text_clf.fit(questions, target)
    # Accuracy is measured on the training questions themselves.
    predicted = text_clf.predict(questions)
    print('Training Acc.: %f' % (np.mean(predicted == target)))
    return sql_type_to_indices, text_clf
Example #15
Source File: WGGraph.py From AbTextSumm with Mozilla Public License 2.0 | 6 votes |
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    """Drop generated sentences that are too similar to any original sentence.

    :param generatedSentences: sequence of (sentence, score) pairs.
    :param originalSentences: sequence of original sentence strings.
    :param stopwords: stop words forwarded to StemmedTfidfVectorizer.
    :param threshold: a generated sentence is removed when its highest
        linear-kernel similarity to an original sentence is >= this value.
    :return: the (sentence, score) pairs that were kept, in input order.
    """
    n_generated = len(generatedSentences)
    docs = [sent for sent, _ in generatedSentences]
    docs.extend(originalSentences)

    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)

    # Rows of the original sentences; identical for every comparison.
    original_rows = normalized[n_generated:]
    too_similar = set()
    # range works on both Python 2 and 3 (xrange is Python 2 only).
    for i in range(n_generated):
        scores = linear_kernel(normalized[i], original_rows).flatten()
        if max(scores) >= threshold:
            too_similar.add(i)

    return [sentence for k, sentence in enumerate(generatedSentences)
            if k not in too_similar]
Example #16
Source File: bow.py From broca with MIT License | 6 votes |
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
    """
    Build the vectorizer -> tf-idf -> normalizer pipeline.

    `min_df` filters out extremely rare words so they do not dominate the
    distance metric; `max_df` filters out extremely common words since they
    carry little information. Both are only passed to the CountVectorizer,
    i.e. when `hash` is false. `tokenizer` is a class that is instantiated
    and wrapped; `hash` selects a HashingVectorizer instead.
    """
    # Wrap the specified tokenizer
    wrapped_tokenizer = Tokenizer(tokenizer())
    shared_options = dict(input='content',
                          stop_words='english',
                          lowercase=True,
                          tokenizer=wrapped_tokenizer)
    if hash:
        vectr = HashingVectorizer(**shared_options)
    else:
        vectr = CountVectorizer(min_df=min_df, max_df=max_df, **shared_options)

    self.pipeline = Pipeline([
        ('vectorizer', vectr),
        ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
        ('normalizer', Normalizer(copy=False)),
    ])
    self.trained = False
Example #17
Source File: classifiers.py From fine-grained-sentiment with MIT License | 6 votes |
def __init__(self, model_file: str=None) -> None:
    """Create the count -> tf-idf -> logistic regression pipeline.

    ``model_file`` is accepted but not used by this constructor.
    """
    super().__init__()
    # requires scikit-learn
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import LogisticRegression
    from sklearn.pipeline import Pipeline

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
    ]
    steps.append(('clf', LogisticRegression(solver='newton-cg',
                                            multi_class='multinomial',
                                            random_state=42,
                                            max_iter=100)))
    self.pipeline = Pipeline(steps)
Example #18
Source File: classifiers.py From fine-grained-sentiment with MIT License | 6 votes |
def __init__(self, model_file: str=None) -> None:
    """Create the count -> tf-idf -> SGD (hinge loss) pipeline.

    ``model_file`` is accepted but not used by this constructor.
    """
    super().__init__()
    # requires scikit-learn
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier
    from sklearn.pipeline import Pipeline

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
    ]
    steps.append(('clf', SGDClassifier(loss='hinge',
                                       penalty='l2',
                                       alpha=1e-3,
                                       random_state=42,
                                       max_iter=100,
                                       learning_rate='optimal',
                                       tol=None)))
    self.pipeline = Pipeline(steps)
Example #19
Source File: explainer.py From fine-grained-sentiment with MIT License | 6 votes |
def train(self) -> sklearn.pipeline.Pipeline:
    """Fit a count -> tf-idf -> SGD (modified Huber loss) pipeline on ``self.train_df``.

    The 'text' column is the input and the 'truth' column is the target.
    Returns the fitted pipeline.
    """
    from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
    from sklearn.linear_model import SGDClassifier
    from sklearn.pipeline import Pipeline

    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
    ]
    steps.append(('clf', SGDClassifier(loss='modified_huber',
                                       penalty='l2',
                                       alpha=1e-3,
                                       random_state=42,
                                       max_iter=100,
                                       tol=None)))
    pipeline = Pipeline(steps)

    # Train model
    texts, labels = self.train_df['text'], self.train_df['truth']
    return pipeline.fit(texts, labels)
Example #20
Source File: svmclassifier.py From SQG with GNU General Public License v3.0 | 5 votes |
def __init__(self, model_file_path=None):
    """Set up the count -> tf-idf -> SGD pipeline and its parameter grid."""
    super(SVMClassifier, self).__init__(model_file_path)
    steps = [
        ('vect', CountVectorizer()),
        ('tf-idf', TfidfTransformer()),
    ]
    # NOTE(review): the n_iter argument and loss='log' are not accepted by
    # recent scikit-learn releases -- confirm the pinned version.
    steps.append(('svm', SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                                       n_iter=5, random_state=42)))
    self.pipeline = Pipeline(steps)
    # Candidate values keyed by "<step name>__<parameter>".
    self.parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tf-idf__use_idf': (True, False),
        'svm__alpha': (1e-2, 1e-3),
    }
Example #21
Source File: test_explanation_model.py From cxplain with MIT License | 5 votes |
def test_imdb_padded_valid(self):
    """Fit and evaluate the explainer on padded IMDB data; predictions match the input shape."""
    num_samples = 32
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                            num_subsamples=num_samples)

    # Model to explain: counts -> tf-idf -> random forest, fitted before padding.
    forest = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    explained_model = Pipeline([
        ('counts', CountVectoriser(num_words)),
        ('tfidf', TfidfTransformer()),
        ('model', forest),
    ])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(
        embedding_size=num_words, with_embedding=True,
        num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
        verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
        early_stopping_patience=128,
    )
    explainer = CXPlain(explained_model, model_builder, WordDropMasking(), binary_crossentropy)

    padding_options = dict(padding="post", truncating="post", dtype=int)
    x_train = pad_sequences(x_train, **padding_options)
    # Test sequences are padded to the training length.
    x_test = pad_sequences(x_test, maxlen=x_train.shape[1], **padding_options)

    explainer.fit(x_train, y_train)
    # Both calls are exercised for errors only; their results are not checked.
    explainer.score(x_test, y_test)
    explainer.get_last_fit_score()

    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
Example #22
Source File: test_explanation_model.py From cxplain with MIT License | 5 votes |
def test_nlp_padded_valid(self):
    """Fit and evaluate the explainer on padded random sequences; predictions match the input shape."""
    num_words = 1024
    (x_train, y_train), (x_test, y_test) = \
        TestUtil.get_random_variable_length_dataset(max_value=num_words)

    # Model to explain: counts -> tf-idf -> random forest, fitted before padding.
    forest = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)
    explained_model = Pipeline([
        ('counts', CountVectoriser(num_words)),
        ('tfidf', TfidfTransformer()),
        ('model', forest),
    ])
    explained_model.fit(x_train, y_train)

    model_builder = RNNModelBuilder(
        embedding_size=num_words, with_embedding=True,
        num_layers=2, num_units=32, activation="relu", p_dropout=0.2,
        verbose=0, batch_size=32, learning_rate=0.001, num_epochs=2,
        early_stopping_patience=128,
    )
    explainer = CXPlain(explained_model, model_builder, WordDropMasking(), binary_crossentropy)

    padding_options = dict(padding="post", truncating="post", dtype=int)
    x_train = pad_sequences(x_train, **padding_options)
    # Test sequences are padded to the training length.
    x_test = pad_sequences(x_test, maxlen=x_train.shape[1], **padding_options)

    explainer.fit(x_train, y_train)
    # Both calls are exercised for errors only; their results are not checked.
    explainer.score(x_test, y_test)
    explainer.get_last_fit_score()

    median = explainer.predict(x_test)
    self.assertTrue(median.shape == x_test.shape)
Example #23
Source File: test_text.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_countvectorizer_custom_vocabulary_pipeline():
    """A fixed vocabulary passed to CountVectorizer is kept when used inside a Pipeline."""
    what_we_like = ["pizza", "beer"]
    counter = CountVectorizer(vocabulary=what_we_like)
    pipe = Pipeline([('count', counter), ('tfidf', TfidfTransformer())])

    X = pipe.fit_transform(ALL_FOOD_DOCS)

    fitted_vocabulary = set(pipe.named_steps['count'].vocabulary_)
    assert_equal(fitted_vocabulary, set(what_we_like))
    # one output column per vocabulary entry
    assert_equal(X.shape[1], len(what_we_like))
Example #24
Source File: svm.py From opentc with MIT License | 5 votes |
def fit(self, dataset, filename):
    """Train the count -> tf-idf -> SGD pipeline and dump it to ``filename + ".pkl"``.

    ``dataset.get_dataset()`` must return a mapping with 'data' and
    'target' entries.
    """
    self.logger.debug("fit")
    steps = [
        ('vect', CountVectorizer()),
        ('tfidf', TfidfTransformer()),
    ]
    # NOTE(review): the n_iter argument and loss='log' are not accepted by
    # recent scikit-learn releases -- confirm the pinned version.
    steps.append(('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3,
                                       n_iter=5, random_state=42)))
    self.clf = Pipeline(steps)

    samples = dataset.get_dataset()['data']
    labels = dataset.get_dataset()['target']
    self.clf.fit(samples, labels)

    joblib.dump(self.clf, filename + ".pkl", compress=9)
Example #25
Source File: naivebayesclassifier.py From SQG with GNU General Public License v3.0 | 5 votes |
def __init__(self, model_file_path=None):
    """Set up the count -> tf-idf -> multinomial naive Bayes pipeline and its parameter grid."""
    super(NaiveBayesClassifier, self).__init__(model_file_path)
    steps = [
        ('vect', CountVectorizer()),
        ('tf-idf', TfidfTransformer()),
        ('naive-bayes', MultinomialNB()),
    ]
    self.pipeline = Pipeline(steps)
    # Candidate values keyed by "<step name>__<parameter>".
    self.parameters = {
        'vect__ngram_range': [(1, 1), (1, 2)],
        'tf-idf__use_idf': (True, False),
        'naive-bayes__alpha': (1e-2, 1e-3),
    }
Example #26
Source File: _preprocessor.py From sumpy with Apache License 2.0 | 5 votes |
def build(self):
    """Create ``self._tfidf_transformer`` if it is missing or None; keep an existing one."""
    if getattr(self, "_tfidf_transformer", None) is None:
        self._tfidf_transformer = TfidfTransformer()
Example #27
Source File: SentencesForTopicModeling.py From scattertext with Apache License 2.0 | 5 votes |
def get_topics_from_model(
        self,
        pipe=Pipeline([
            ('tfidf', TfidfTransformer(sublinear_tf=True)),
            ('nmf', (NMF(n_components=30, alpha=.1, l1_ratio=.5, random_state=0)))]),
        num_terms_per_topic=10):
    '''
    Fit a topic model on ``self.sentX`` and list the top terms of each topic.

    Parameters
    ----------
    pipe : Pipeline
        For example, `Pipeline([
            ('tfidf', TfidfTransformer(sublinear_tf=True)),
            ('nmf', (NMF(n_components=30, alpha=.1, l1_ratio=.5, random_state=0)))])`
        The last transformer must populate a `components_` attribute when finished.
        The default pipeline object is created once, when the function is
        defined, and is refitted by every call that omits this argument.
    num_terms_per_topic : int
        Maximum number of terms kept per topic.

    Returns
    -------
    dict: {'<topic index>. <top term>': [term1, ...], ...}
        Topics without any positively scored term are omitted and a warning
        is emitted for each of them.
    '''
    import warnings

    pipe.fit_transform(self.sentX)
    topic_model = {}
    for topic_idx, topic in enumerate(pipe._final_estimator.components_):
        # argsort is ascending; the reversed tail slice yields the indices
        # of the highest-scoring terms first. Only positive scores are kept.
        term_list = [self.termidxstore.getval(i)
                     for i in topic.argsort()[:-num_terms_per_topic - 1:-1]
                     if topic[i] > 0]
        if term_list:
            topic_model['%s. %s' % (topic_idx, term_list[0])] = term_list
        else:
            # Constructing Warning(...) alone is a no-op; actually emit it.
            warnings.warn("Topic %s has no terms with scores > 0. Omitting." % (topic_idx))
    return topic_model
Example #28
Source File: tfidf.py From asreview with Apache License 2.0 | 5 votes |
def __init__(self, *args, ngram_max=1, **kwargs):
    """Initialize tfidf class.

    Arguments
    ---------
    ngram_max: int
        Largest n-gram size to use. With ngram_max=2, for example, both
        monograms and bigrams can be used.
    """
    super(Tfidf, self).__init__(*args, **kwargs)
    self.ngram_max = ngram_max
    counter = CountVectorizer(ngram_range=(1, ngram_max))
    self._model = Pipeline([
        ('vect', counter),
        ('tfidf', TfidfTransformer()),
    ])
Example #29
Source File: create_data.py From active-learning with Apache License 2.0 | 5 votes |
def get_wikipedia_talk_data():
    """Get wikipedia talk dataset.

    See here for more information about the dataset:
    https://figshare.com/articles/Wikipedia_Detox_Data/4054689
    Downloads the annotated comments and the annotations, then returns a
    Dataset of tf-idf features and boolean attack labels.
    """
    ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634'
    ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637'

    def download_file(url):
        return urllib2.urlopen(urllib2.Request(url))

    # Process comments
    comments = pd.read_table(
        download_file(ANNOTATED_COMMENTS_URL), index_col=0, sep='\t')
    # remove newline and tab tokens
    for token in ('NEWLINE_TOKEN', 'TAB_TOKEN'):
        comments['comment'] = comments['comment'].apply(
            lambda text, token=token: text.replace(token, ' '))

    # Process labels
    annotations = pd.read_table(download_file(ANNOTATIONS_URL), sep='\t')
    # a comment is labelled as an attack if the majority of annotators said so
    mean_attack = annotations.groupby('rev_id')['attack'].mean()
    labels = mean_attack > 0.5

    # Perform data preprocessing, should probably tune these hyperparameters
    vect = CountVectorizer(max_features=30000, ngram_range=(1, 2))
    tfidf = TfidfTransformer(norm='l2')
    X = tfidf.fit_transform(vect.fit_transform(comments['comment']))
    # NOTE(review): labels follow the groupby order of rev_id while X follows
    # the row order of the comments table -- confirm the two are aligned.
    y = np.array(labels)
    return Dataset(X, y)
Example #30
Source File: TermDocMatrix.py From scattertext with Apache License 2.0 | 5 votes |
def _fit_tfidf_model(self, category, clf):
    """Fit ``clf`` in place on tf-idf weighted ``self._X`` against the continuous target of ``category``.

    Nothing is returned; the caller keeps using the ``clf`` it passed in.
    """
    category_mask = self._get_mask_from_category(category)
    continuous_target = self._get_continuous_version_boolean_y(category_mask)
    tfidf_features = TfidfTransformer().fit_transform(self._X)
    clf.fit(tfidf_features, continuous_target)