Python sklearn.feature_extraction.text.TfidfTransformer() Examples

The following are 30 code examples of sklearn.feature_extraction.text.TfidfTransformer(), collected from open-source projects. The source file, project, and license for each example are noted above it. TfidfTransformer re-weights a matrix of raw token counts (such as the output of CountVectorizer) into normalized tf-idf features; you may also want to check out the other functions and classes of the sklearn.feature_extraction.text module.
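Before the project examples, a minimal self-contained sketch of the usual CountVectorizer + TfidfTransformer pairing (the toy corpus and variable names are illustrative, not taken from the projects below):

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

corpus = ["the cat sat on the mat", "the dog sat on the log"]

# CountVectorizer produces a sparse matrix of raw token counts;
# TfidfTransformer re-weights those counts into tf-idf features.
counts = CountVectorizer().fit_transform(corpus)
tfidf = TfidfTransformer(smooth_idf=True, norm='l2').fit_transform(counts)
print(tfidf.shape)  # (2, n_features), sparse CSR matrix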
Example #1
Source File: get_topic.py    From poem_generator with Apache License 2.0
def write_topics(ftopics, fwords, ftopics_words, poem_words, n_topic, n_topic_words):
    # count_vect is defined outside this snippet in the original source;
    # shown here as a CountVectorizer so the example is self-contained
    count_vect = CountVectorizer()
    count_matrix = count_vect.fit_transform(poem_words)
    tfidf = TfidfTransformer().fit_transform(count_matrix)
    nmf = decomposition.NMF(n_components=n_topic).fit(tfidf)
    feature_names = count_vect.get_feature_names()
    fw = codecs.open(ftopics, 'w', 'utf-8')
    for topic in nmf.components_:
        fw.write(' '.join([feature_names[i] for i in topic.argsort()[:-n_topic_words - 1:-1]]) + '\n')
    fw.close()
    print('Write topics done.')
    fw = codecs.open(fwords, 'wb')
    pickle.dump(feature_names, fw)
    fw.close()
    print('Write words done.')
    fw = codecs.open(ftopics_words, 'wb')
    pickle.dump(nmf.components_, fw)
    fw.close()
    print('Write topic_words done.') 
Example #2
Source File: TermDocMatrix.py    From scattertext with Apache License 2.0
def get_logistic_regression_coefs_l2(self, category,
                                         clf=RidgeClassifierCV()):
        ''' Computes l2-penalized logistic regression score.
        Parameters
        ----------
        category : str
            category name to score
        Returns
        -------
            (coefficient array, accuracy, majority class baseline accuracy)
        '''
        try:
            from sklearn.cross_validation import cross_val_predict
        except ImportError:
            from sklearn.model_selection import cross_val_predict
        y = self._get_mask_from_category(category)
        X = TfidfTransformer().fit_transform(self._X)
        clf.fit(X, y)
        y_hat = cross_val_predict(clf, X, y)
        acc, baseline = self._get_accuracy_and_baseline_accuracy(y, y_hat)
        return clf.coef_[0], acc, baseline 
Example #3
Source File: textrank.py    From nlg-yongzhuo with MIT License
def tdidf_sim(sentences):
    """
       tfidf相似度
    :param sentences: 
    :return: 
    """
    # tfidf计算
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2), # 3,5
                            stop_words=[' ', '\t', '\n'],  # 停用词
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # 过滤停用词
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,  # 光滑
                            smooth_idf=1,  # 光滑
                            sublinear_tf=1, )  # 光滑
    matrix = model.fit_transform(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm 
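Because the returned rows are l2-normalized, cosine similarity between sentences reduces to a dot product. A short usage sketch (assumes the function and its imports above are in scope; the sentences are illustrative):

matrix_norm = tdidf_sim(["今天天气很好", "今天天气不错", "明天可能下雨"])
# rows are l2-normalized, so the product below is the cosine-similarity matrix
sim = (matrix_norm * matrix_norm.T).toarray()  # sim[i][j]: similarity of sentences i and j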
Example #4
Source File: textrank_sklearn.py    From nlg-yongzhuo with MIT License
def tdidf_sim(sentences):
    """
       tfidf相似度
    :param sentences: 
    :return: 
    """
    # tfidf计算
    model = TfidfVectorizer(tokenizer=jieba.cut,
                            ngram_range=(1, 2), # 3,5
                            stop_words=[' ', '\t', '\n'],  # 停用词
                            max_features=10000,
                            token_pattern=r"(?u)\b\w+\b",  # 过滤停用词
                            min_df=1,
                            max_df=0.9,
                            use_idf=1,  # 光滑
                            smooth_idf=1,  # 光滑
                            sublinear_tf=1, )  # 光滑
    matrix = model.fit_transform(sentences)
    matrix_norm = TfidfTransformer().fit_transform(matrix)
    return matrix_norm 
Example #5
Source File: test_termDocMatrixFactory.py    From scattertext with Apache License 2.0
def test_main(self):
		categories, documents = get_docs_categories()
		clean_function = lambda text: '' if text.startswith('[') else text
		entity_types = set(['GPE'])
		term_doc_mat = (
			TermDocMatrixFactory(
				category_text_iter=zip(categories, documents),
				clean_function=clean_function,
				nlp=_testing_nlp,
				feats_from_spacy_doc=FeatsFromSpacyDoc(entity_types_to_censor=entity_types)
			).build()
		)
		clf = PassiveAggressiveClassifier()
		fdc = FeatsFromDoc(term_doc_mat._term_idx_store,
		                   clean_function=clean_function,
		                   feats_from_spacy_doc=FeatsFromSpacyDoc(
			                   entity_types_to_censor=entity_types)).set_nlp(_testing_nlp)
		tfidf = TfidfTransformer(norm='l1')
		X = tfidf.fit_transform(term_doc_mat._X)
		clf.fit(X, term_doc_mat._y)
		X_to_predict = fdc.feats_from_doc('Did sometimes march UNKNOWNWORD')
		pred = clf.predict(tfidf.transform(X_to_predict))
		dec = clf.decision_function(X_to_predict) 
Example #6
Source File: centroid_word_embeddings.py    From text-summarizer with GNU General Public License v3.0
def get_topic_idf(self, sentences):
        vectorizer = CountVectorizer()
        sent_word_matrix = vectorizer.fit_transform(sentences)

        transformer = TfidfTransformer(norm=None, sublinear_tf=False, smooth_idf=False)
        tfidf = transformer.fit_transform(sent_word_matrix)
        tfidf = tfidf.toarray()

        centroid_vector = tfidf.sum(0)
        centroid_vector = np.divide(centroid_vector, centroid_vector.max())
        # print(centroid_vector.max())

        feature_names = vectorizer.get_feature_names()

        relevant_vector_indices = np.where(centroid_vector > self.topic_threshold)[0]

        word_list = list(np.array(feature_names)[relevant_vector_indices])
        return word_list 
Example #7
Source File: test_text.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_tf_idf_smoothing():
    X = [[1, 1, 1],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all()

    # check normalization
    assert_array_almost_equal((tfidf ** 2).sum(axis=1), [1., 1., 1.])

    # this is robust to features with only zeros
    X = [[1, 1, 0],
         [1, 1, 0],
         [1, 0, 0]]
    tr = TfidfTransformer(smooth_idf=True, norm='l2')
    tfidf = tr.fit_transform(X).toarray()
    assert (tfidf >= 0).all() 
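Smoothing behaves as if one extra document containing every term had been seen: sklearn uses idf = ln((1+n)/(1+df)) + 1 with smooth_idf=True and idf = ln(n/df) + 1 without. A small sketch of the effect on the learned idf values:

from sklearn.feature_extraction.text import TfidfTransformer

X = [[1, 1, 1],
     [1, 1, 0],
     [1, 0, 0]]  # per-column document frequencies: df = [3, 2, 1], n = 3

smooth_idf = TfidfTransformer(smooth_idf=True).fit(X).idf_   # ln((1+3)/(1+df)) + 1
raw_idf = TfidfTransformer(smooth_idf=False).fit(X).idf_     # ln(3/df) + 1
print(smooth_idf, raw_idf)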
Example #8
Source File: preprocess.py    From cn-text-classifier with GNU General Public License v3.0
def extract_characters(sentences: list, dimension: int):
    """
    Vectorize sentences and reduce dimensionality.
    :param sentences: list
    :param dimension: int
    :return: weight, training_data
    """
    print("Vectorizing...")
    # Build a frequency matrix a[i][j]: the weight of word j in document i
    vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.46)
    # vectorizer = CountVectorizer()
    # compute tf-idf weights
    transformer = TfidfTransformer()
    # the inner fit_transform builds the matrix; the outer TfidfTransformer re-weights it
    # (TfidfVectorizer already applies tf-idf, so idf weighting is effectively applied twice here)
    tfidf = transformer.fit_transform(vectorizer.fit_transform(sentences))
    # all words in the bag-of-words vocabulary
    words_bag = vectorizer.get_feature_names()
    # weight[i][j] is the weight of word j in document i
    weight = tfidf.toarray()
    print('Features length: ' + str(len(words_bag)))
    pca = PCA(n_components=dimension)
    training_data = pca.fit_transform(weight)
    return weight, training_data 
Example #9
Source File: field_types.py    From lexpredict-contraxsuite with GNU Affero General Public License v3.0
def build_vectorization_pipeline(self) -> Tuple[List[Tuple[str, Any]], Callable[[], List[str]]]:
        """
        Build SKLearn vectorization pipeline for this field.
        This is used in field-based machine learning when we calculate the value of one field based on the
        values of other fields of this document.

        We are able to detect only choice fields this way at the moment.

        To reach this we need to build a feature vector of all dependencies of the field being detected.
        This feature vector is built as a union of feature vectors of each dependency.

        See how the whole pipeline is built in FieldBasedMLOnlyFieldDetectionStrategy.build_pipeline(..)

        :return: Tuple of: 1. List of vectorization steps - to be added to a Pipeline()
                           2. List of str feature names or a function returning list of str feature names.
        """

        vect = CountVectorizer(strip_accents='unicode', analyzer='word',
                               stop_words=self._build_stop_words())
        return [('clean', vectorizers.ReplaceNoneTransformer('')),
                ('vect', vect),
                ('tfidf', TfidfTransformer())], self._wrap_get_feature_names(vect) 
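The docstring above says the overall feature vector is the union of per-dependency vectors; sklearn's FeatureUnion is the standard way to concatenate such sub-pipelines. A hedged sketch under that assumption (the step names and analyzers are illustrative, not from contraxsuite):

from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

# FeatureUnion horizontally concatenates the outputs of its sub-pipelines.
union = FeatureUnion([
    ('word_tfidf', Pipeline([('vect', CountVectorizer(analyzer='word')),
                             ('tfidf', TfidfTransformer())])),
    ('char_tfidf', Pipeline([('vect', CountVectorizer(analyzer='char_wb', ngram_range=(2, 3))),
                             ('tfidf', TfidfTransformer())])),
])
X = union.fit_transform(["first document", "second document"])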
Example #10
Source File: enhance_marko.py    From nlp_xiaojiang with MIT License
def get_keyword_from_tf(sentences, p):
    """
      Get the hot (high-frequency) words of the corpus for a given category
    :param sentences: list, raw sentences (word-segmented with jieba below)
    :param p: float, rate, 0 < p < 1
    :return: list, words
    """
    sentence_cut_list = [" ".join(list(jieba.cut(text.strip(), cut_all=False, HMM=True))) for text in sentences]
    # token_pattern sets the token regex; the default (as for English) would ignore single-character words
    vectorizer = CountVectorizer(token_pattern='\\b\\w+\\b')
    # norm=None: do not normalize the term-frequency results
    # use_idf=False: only term frequencies are wanted from the tf-idf machinery, so skip the idf step
    transformer = TfidfTransformer(norm=None, use_idf=False)
    vectorizer.fit_transform(sentence_cut_list)
    # tf = transformer.fit_transform(vectorizer.fit_transform(sentence_cut_list))
    word = vectorizer.get_feature_names()
    # weight = tf.toarray()
    # note: get_feature_names() is alphabetical, so this slice is not frequency-ranked
    return word[-int(len(word) * p):] 
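As the comments note, with norm=None and use_idf=False the transformer leaves the counts untouched (it only casts them to float), so it effectively computes plain term frequencies. A quick illustrative check:

from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer

counts = CountVectorizer().fit_transform(["aa aa bb", "bb cc"])
tf = TfidfTransformer(norm=None, use_idf=False).fit_transform(counts)
print(abs(tf - counts).nnz == 0)  # True: values equal the raw counts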
Example #11
Source File: utils.py    From GGP with Apache License 2.0
def load_data_ssl(data_name):
    adj_csr, features, y_train, y_val, y_test, train_mask, val_mask, test_mask = load_data(data_name)
    adj_mat = np.asarray(adj_csr.toarray(), dtype=np_float_type)
    x_tr = np.reshape(np.arange(len(train_mask))[train_mask], (-1, 1))
    x_val = np.reshape(np.arange(len(val_mask))[val_mask], (-1, 1))
    x_test = np.reshape(np.arange(len(test_mask))[test_mask], (-1, 1))
    y_tr = np.asarray(y_train[train_mask], dtype=np.int32)
    y_tr = np.reshape(np.sum(np.tile(np.arange(y_tr.shape[1]), (np.sum(train_mask), 1)) * y_tr, axis=1), (-1, 1))
    y_val = np.asarray(y_val[val_mask], dtype=np.int32)
    y_val = np.reshape(np.sum(np.tile(np.arange(y_val.shape[1]), (np.sum(val_mask), 1)) * y_val, axis=1), (-1, 1))
    y_test = np.asarray(y_test[test_mask], dtype=np.int32)
    y_test = np.reshape(np.sum(np.tile(np.arange(y_test.shape[1]), (np.sum(test_mask), 1)) * y_test, axis=1), (-1, 1))
    node_features = features.toarray()
    if data_name.lower() != 'pubmed': #pubmed already comes with tf-idf
        transformer = TfidfTransformer(smooth_idf=True)
        node_features = transformer.fit_transform(node_features).toarray()
    return adj_mat, node_features, x_tr, y_tr, x_val, y_val, x_test, y_test 
Example #12
Source File: test_explanation_model.py    From cxplain with MIT License
def test_nlp_not_padded_invalid(self):
        num_words = 1024
        (x_train, y_train), (_, _) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        with self.assertRaises(ValueError):
            explainer.fit(x_train, y_train) 
Example #13
Source File: datasets.py    From ann-benchmarks with MIT License
def transform_bag_of_words(filename, n_dimensions, out_fn):
    import gzip
    from scipy.sparse import lil_matrix
    from sklearn.feature_extraction.text import TfidfTransformer
    from sklearn import random_projection
    with gzip.open(filename, 'rb') as f:
        file_content = f.readlines()
        entries = int(file_content[0])
        words = int(file_content[1])
        file_content = file_content[3:]  # strip first three entries
        print("building matrix...")
        A = lil_matrix((entries, words))
        for e in file_content:
            doc, word, cnt = [int(v) for v in e.strip().split()]
            A[doc - 1, word - 1] = cnt
        print("normalizing matrix entries with tfidf...")
        B = TfidfTransformer().fit_transform(A)
        print("reducing dimensionality...")
        C = random_projection.GaussianRandomProjection(
            n_components=n_dimensions).fit_transform(B)
        X_train, X_test = train_test_split(C)
        write_output(numpy.array(X_train), numpy.array(X_test),
                     out_fn, 'angular') 
Example #14
Source File: relevance.py    From PointerSQL with MIT License
def question_classifier(data):
  questions = [i[1] for i in data]
  sql_type = [i[2].split(' ')[1] for i in data]
  sql_type_set = set(sql_type)
  sql_classes = dict([(type, i) for i, type in enumerate(sql_type_set)])

  target = np.array([sql_classes[i] for i in sql_type])
  sql_type_to_indices = {}
  for type in sql_type_set:
    sql_type_to_indices[type] = [idx for idx, i in enumerate(sql_type) if i == type]

  # Build classifier
  # TODO better ones 
  text_clf = Pipeline([('vect', CountVectorizer()),
                       ('tfidf', TfidfTransformer()),
                       ('clf', svm.LinearSVC())])
  text_clf.fit(questions, target)
  predicted = text_clf.predict(questions)
  print('Training Acc.: %f' %(np.mean(predicted == target)))

  return sql_type_to_indices, text_clf 
Example #15
Source File: WGGraph.py    From AbTextSumm with Mozilla Public License 2.0
def removeSimilarSentences(generatedSentences, originalSentences, stopwords, threshold=0.80):
    docs=[]
    for sent, sim in generatedSentences:
        docs.append(sent)
    docs.extend(originalSentences)
    
    bow_matrix = StemmedTfidfVectorizer(stop_words=stopwords).fit_transform(docs)
    normalized = TfidfTransformer().fit_transform(bow_matrix)
    #simMatrix = (normalized[0:] * normalized[0:].T).A
    simindices=[]
    #print 'Num original, ', len(originalSentences)
    for i in xrange(len(generatedSentences)):
        simGeneratedScores = linear_kernel(normalized[i], normalized[len(generatedSentences):]).flatten()
        if(max(simGeneratedScores) >= threshold):
            simindices.append(i)
    
    #print simindices
    finalGen=[sentence for k,sentence in enumerate(generatedSentences) if k not in simindices]
    #print len(generatedSentences), len(finalGen)
    return finalGen 
Example #16
Source File: bow.py    From broca with MIT License
def __init__(self, min_df=1, max_df=0.9, tokenizer=LemmaTokenizer, hash=False):
        """
        `min_df` is set to filter out extremely rare words,
        since we don't want those to dominate the distance metric.

        `max_df` is set to filter out extremely common words,
        since they don't convey much information.
        """

        # Wrap the specified tokenizer
        t = Tokenizer(tokenizer())

        if hash:
            vectr = HashingVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t)
        else:
            vectr = CountVectorizer(input='content', stop_words='english', lowercase=True, tokenizer=t, min_df=min_df, max_df=max_df)

        args = [
            ('vectorizer', vectr),
            ('tfidf', TfidfTransformer(norm=None, use_idf=True, smooth_idf=True)),
            ('normalizer', Normalizer(copy=False))
        ]

        self.pipeline = Pipeline(args)
        self.trained = False 
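Deferring normalization to a separate Normalizer step, as this pipeline does, is equivalent to letting TfidfTransformer normalize directly when the norms match; a small illustrative check:

import numpy as np
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.preprocessing import Normalizer

counts = CountVectorizer().fit_transform(["the cat sat", "the dog sat down"])
a = TfidfTransformer(norm='l2').fit_transform(counts).toarray()
b = Normalizer(norm='l2').fit_transform(
        TfidfTransformer(norm=None).fit_transform(counts)).toarray()
print(np.allclose(a, b))  # True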
Example #17
Source File: classifiers.py    From fine-grained-sentiment with MIT License
def __init__(self, model_file: str=None) -> None:
        super().__init__()
        # pip install scikit-learn
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.linear_model import LogisticRegression
        from sklearn.pipeline import Pipeline
        self.pipeline = Pipeline(
            [
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', LogisticRegression(
                    solver='newton-cg',
                    multi_class='multinomial',
                    random_state=42,
                    max_iter=100,
                )),
            ]
        ) 
Example #18
Source File: classifiers.py    From fine-grained-sentiment with MIT License
def __init__(self, model_file: str=None) -> None:
        super().__init__()
        # pip install scikit-learn
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.linear_model import SGDClassifier
        from sklearn.pipeline import Pipeline
        self.pipeline = Pipeline(
            [
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(
                    loss='hinge',
                    penalty='l2',
                    alpha=1e-3,
                    random_state=42,
                    max_iter=100,
                    learning_rate='optimal',
                    tol=None,
                )),
            ]
        ) 
Example #19
Source File: explainer.py    From fine-grained-sentiment with MIT License
def train(self) -> sklearn.pipeline.Pipeline:
        "Create sklearn logistic regression model pipeline"
        from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
        from sklearn.linear_model import SGDClassifier
        from sklearn.pipeline import Pipeline
        pipeline = Pipeline(
            [
                ('vect', CountVectorizer()),
                ('tfidf', TfidfTransformer()),
                ('clf', SGDClassifier(
                    loss='modified_huber',
                    penalty='l2',
                    alpha=1e-3,
                    random_state=42,
                    max_iter=100,
                    tol=None,
                )),
            ]
        )
        # Train model
        classifier = pipeline.fit(self.train_df['text'], self.train_df['truth'])
        return classifier 
Example #20
Source File: svmclassifier.py    From SQG with GNU General Public License v3.0
def __init__(self, model_file_path=None):
        super(SVMClassifier, self).__init__(model_file_path)
        self.pipeline = Pipeline([('vect', CountVectorizer()), ('tf-idf', TfidfTransformer()),
                                  ('svm',
                                   SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42))])
        self.parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tf-idf__use_idf': (True, False),
                           'svm__alpha': (1e-2, 1e-3)} 
Example #21
Source File: test_explanation_model.py    From cxplain with MIT License
def test_imdb_padded_valid(self):
        num_samples = 32
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_imdb(word_dictionary_size=num_words,
                                                                 num_subsamples=num_samples)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape) 
Example #22
Source File: test_explanation_model.py    From cxplain with MIT License
def test_nlp_padded_valid(self):
        num_words = 1024
        (x_train, y_train), (x_test, y_test) = TestUtil.get_random_variable_length_dataset(max_value=num_words)

        explained_model = RandomForestClassifier(n_estimators=64, max_depth=5, random_state=1)

        counter = CountVectoriser(num_words)
        tfidf_transformer = TfidfTransformer()

        explained_model = Pipeline([('counts', counter),
                                    ('tfidf', tfidf_transformer),
                                    ('model', explained_model)])
        explained_model.fit(x_train, y_train)

        model_builder = RNNModelBuilder(embedding_size=num_words, with_embedding=True,
                                        num_layers=2, num_units=32, activation="relu", p_dropout=0.2, verbose=0,
                                        batch_size=32, learning_rate=0.001, num_epochs=2, early_stopping_patience=128)
        masking_operation = WordDropMasking()
        loss = binary_crossentropy
        explainer = CXPlain(explained_model, model_builder, masking_operation, loss)

        x_train = pad_sequences(x_train, padding="post", truncating="post", dtype=int)
        x_test = pad_sequences(x_test, padding="post", truncating="post", dtype=int, maxlen=x_train.shape[1])

        explainer.fit(x_train, y_train)
        eval_score = explainer.score(x_test, y_test)
        train_score = explainer.get_last_fit_score()
        median = explainer.predict(x_test)
        self.assertTrue(median.shape == x_test.shape) 
Example #23
Source File: test_text.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_countvectorizer_custom_vocabulary_pipeline():
    what_we_like = ["pizza", "beer"]
    pipe = Pipeline([
        ('count', CountVectorizer(vocabulary=what_we_like)),
        ('tfidf', TfidfTransformer())])
    X = pipe.fit_transform(ALL_FOOD_DOCS)
    assert_equal(set(pipe.named_steps['count'].vocabulary_),
                 set(what_we_like))
    assert_equal(X.shape[1], len(what_we_like)) 
Example #24
Source File: svm.py    From opentc with MIT License
def fit(self, dataset, filename):
        self.logger.debug("fit")
        self.clf = Pipeline([('vect', CountVectorizer()),
                             ('tfidf', TfidfTransformer()),
                             ('clf', SGDClassifier(loss='log', penalty='l2', alpha=1e-3, n_iter=5, random_state=42)),
                             ])
        self.clf.fit(dataset.get_dataset()['data'], dataset.get_dataset()['target'])
        joblib.dump(self.clf, filename + ".pkl", compress=9) 
Example #25
Source File: naivebayesclassifier.py    From SQG with GNU General Public License v3.0
def __init__(self, model_file_path=None):
        super(NaiveBayesClassifier, self).__init__(model_file_path)
        self.pipeline = Pipeline(
            [('vect', CountVectorizer()), ('tf-idf', TfidfTransformer()), ('naive-bayes', MultinomialNB())])
        self.parameters = {'vect__ngram_range': [(1, 1), (1, 2)], 'tf-idf__use_idf': (True, False),
                           'naive-bayes__alpha': (1e-2, 1e-3)} 
Example #26
Source File: _preprocessor.py    From sumpy with Apache License 2.0
def build(self):
        if not hasattr(self, "_tfidf_transformer"):
            self._tfidf_transformer = None

        if self._tfidf_transformer is None:
            self._tfidf_transformer = TfidfTransformer()
                    #input=u"content", preprocessor=lambda x: x,
                    #tokenizer=lambda x: x) 
Example #27
Source File: SentencesForTopicModeling.py    From scattertext with Apache License 2.0
def get_topics_from_model(
			self,
			pipe=Pipeline([
				('tfidf', TfidfTransformer(sublinear_tf=True)),
				('nmf', (NMF(n_components=30, alpha=.1, l1_ratio=.5, random_state=0)))]),
			num_terms_per_topic=10):
		'''

		Parameters
		----------
		pipe : Pipeline
			For example, `Pipeline([
				('tfidf', TfidfTransformer(sublinear_tf=True)),
				('nmf', (NMF(n_components=30, alpha=.1, l1_ratio=.5, random_state=0)))])`
			The last transformer must populate a `components_` attribute when finished.
		num_terms_per_topic : int

		Returns
		-------
		dict: {term: [term1, ...], ...}
		'''
		pipe.fit_transform(self.sentX)

		topic_model = {}
		for topic_idx, topic in enumerate(pipe._final_estimator.components_):
			term_list = [self.termidxstore.getval(i)
			             for i
			             in topic.argsort()[:-num_terms_per_topic - 1:-1]
			             if topic[i] > 0]
			if len(term_list) > 0:
				topic_model['%s. %s' % (topic_idx, term_list[0])] = term_list
			else:
				Warning("Topic %s has no terms with scores > 0. Omitting." % (topic_idx))
		return topic_model 
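Per the docstring, any pipeline whose final estimator exposes components_ after fitting can be passed in. A hedged usage sketch (the TruncatedSVD swap-in is illustrative; the sentences_for_topic_modeling variable is hypothetical):

from sklearn.pipeline import Pipeline
from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfTransformer

# TruncatedSVD sets `components_` when fitted, satisfying the requirement above.
custom_pipe = Pipeline([('tfidf', TfidfTransformer(sublinear_tf=True)),
                        ('svd', TruncatedSVD(n_components=30, random_state=0))])
# topics = sentences_for_topic_modeling.get_topics_from_model(pipe=custom_pipe, num_terms_per_topic=10)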
Example #28
Source File: tfidf.py    From asreview with Apache License 2.0
def __init__(self, *args, ngram_max=1, **kwargs):
        """Initialize tfidf class.

        Arguments
        ---------
        ngram_max: int
            Use ngrams up to ngram_max. For example, with ngram_max=2,
            unigrams and bigrams are used.
        """
        super(Tfidf, self).__init__(*args, **kwargs)
        self.ngram_max = ngram_max
        self._model = Pipeline([
            ('vect', CountVectorizer(ngram_range=(1, ngram_max))),
            ('tfidf', TfidfTransformer())]
        ) 
Example #29
Source File: create_data.py    From active-learning with Apache License 2.0
def get_wikipedia_talk_data():
  """Get wikipedia talk dataset.

  See here for more information about the dataset:
  https://figshare.com/articles/Wikipedia_Detox_Data/4054689
  Downloads annotated comments and annotations.
  """

  ANNOTATED_COMMENTS_URL = 'https://ndownloader.figshare.com/files/7554634'
  ANNOTATIONS_URL = 'https://ndownloader.figshare.com/files/7554637'

  def download_file(url):
    req = urllib2.Request(url)
    response = urllib2.urlopen(req)
    return response

  # Process comments
  comments = pd.read_table(
      download_file(ANNOTATED_COMMENTS_URL), index_col=0, sep='\t')
  # remove newline and tab tokens
  comments['comment'] = comments['comment'].apply(
      lambda x: x.replace('NEWLINE_TOKEN', ' '))
  comments['comment'] = comments['comment'].apply(
      lambda x: x.replace('TAB_TOKEN', ' '))

  # Process labels
  annotations = pd.read_table(download_file(ANNOTATIONS_URL), sep='\t')
  # labels a comment as an attack if the majority of annotators did so
  labels = annotations.groupby('rev_id')['attack'].mean() > 0.5

  # Perform data preprocessing, should probably tune these hyperparameters
  vect = CountVectorizer(max_features=30000, ngram_range=(1, 2))
  tfidf = TfidfTransformer(norm='l2')
  X = tfidf.fit_transform(vect.fit_transform(comments['comment']))
  y = np.array(labels)
  data = Dataset(X, y)
  return data 
Example #30
Source File: TermDocMatrix.py    From scattertext with Apache License 2.0
def _fit_tfidf_model(self, category, clf):
        y = self._get_mask_from_category(category)
        y_continuous = self._get_continuous_version_boolean_y(y)
        X = TfidfTransformer().fit_transform(self._X)
        clf.fit(X, y_continuous)