Python sklearn.decomposition.LatentDirichletAllocation() Examples
The following are 30 code examples of sklearn.decomposition.LatentDirichletAllocation(), drawn from open-source projects. Each example is listed with its source file, project, and license. You may also want to check out the other available functions and classes of the sklearn.decomposition module.
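
Before the project examples below, here is a minimal, self-contained sketch of the estimator's basic usage; the toy count matrix and parameter values are illustrative and not taken from any of the listed projects:

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# toy document-term count matrix: 6 documents, 8 vocabulary terms
rng = np.random.RandomState(0)
X = rng.randint(5, size=(6, 8))

lda = LatentDirichletAllocation(n_components=3, random_state=0)
doc_topic = lda.fit_transform(X)   # shape (6, 3): per-document topic weights
print(doc_topic.shape)             # (6, 3)
print(lda.components_.shape)       # (3, 8): per-topic term weights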
Example #1
Source File: test_decomposition.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.decomposition.PCA, decomposition.PCA)
    self.assertIs(df.decomposition.IncrementalPCA,
                  decomposition.IncrementalPCA)
    self.assertIs(df.decomposition.KernelPCA, decomposition.KernelPCA)
    self.assertIs(df.decomposition.FactorAnalysis,
                  decomposition.FactorAnalysis)
    self.assertIs(df.decomposition.FastICA, decomposition.FastICA)
    self.assertIs(df.decomposition.TruncatedSVD, decomposition.TruncatedSVD)
    self.assertIs(df.decomposition.NMF, decomposition.NMF)
    self.assertIs(df.decomposition.SparsePCA, decomposition.SparsePCA)
    self.assertIs(df.decomposition.MiniBatchSparsePCA,
                  decomposition.MiniBatchSparsePCA)
    self.assertIs(df.decomposition.SparseCoder, decomposition.SparseCoder)
    self.assertIs(df.decomposition.DictionaryLearning,
                  decomposition.DictionaryLearning)
    self.assertIs(df.decomposition.MiniBatchDictionaryLearning,
                  decomposition.MiniBatchDictionaryLearning)
    self.assertIs(df.decomposition.LatentDirichletAllocation,
                  decomposition.LatentDirichletAllocation)
Example #2
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def check_verbosity(verbose, evaluate_every, expected_lines,
                    expected_perplexities):
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
                                    learning_method='batch',
                                    verbose=verbose,
                                    evaluate_every=evaluate_every,
                                    random_state=0)
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        sys.stdout = old_out

    n_lines = out.getvalue().count('\n')
    n_perplexity = out.getvalue().count('perplexity')
    assert_equal(expected_lines, n_lines)
    assert_equal(expected_perplexities, n_perplexity)
Example #3
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2)
Example #4
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_score(method):
    # Test LDA score for batch and online training;
    # score should be higher after more iterations
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit_transform(X)
    score_1 = lda_1.score(X)

    lda_2.fit_transform(X)
    score_2 = lda_2.score(X)
    assert_greater_equal(score_2, score_1)
Example #5
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_perplexity(method):
    # Test LDA perplexity for batch and online training;
    # perplexity should be lower after more iterations
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit(X)
    perp_1 = lda_1.perplexity(X, sub_sampling=False)

    lda_2.fit(X)
    perp_2 = lda_2.perplexity(X, sub_sampling=False)
    assert_greater_equal(perp_1, perp_2)

    perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
    perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
    assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #6
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = np.random.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.fit(X)

    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda._perplexity_precomp_distr, X, invalid_n_samples)

    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda._perplexity_precomp_distr, X,
                         invalid_n_components)
Example #7
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def check_verbosity(verbose, evaluate_every, expected_lines,
                    expected_perplexities):
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
                                    learning_method='batch',
                                    verbose=verbose,
                                    evaluate_every=evaluate_every,
                                    random_state=0)
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        sys.stdout = old_out

    n_lines = out.getvalue().count('\n')
    n_perplexity = out.getvalue().count('perplexity')
    assert_equal(expected_lines, n_lines)
    assert_equal(expected_perplexities, n_perplexity)
Example #8
Source File: topic.py From Python-DevOps with MIT License

def train_lda(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tf_vectorizer = CountVectorizer(
        max_df=max_df, min_df=min_df, stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names()
    # note: the n_topics parameter was renamed to n_components in
    # scikit-learn 0.19, so this call requires an older scikit-learn
    lda = LatentDirichletAllocation(
        n_topics=n_topics, max_iter=5, learning_method='online',
        learning_offset=50., random_state=0).fit(tf)
    return TOPIC(tf_features, lda)
Example #9
Source File: topics.py From atap with Apache License 2.0

def __init__(self, n_topics=50, estimator='LDA'):
    """
    n_topics is the desired number of topics

    To use Latent Semantic Analysis, set estimator to 'LSA';
    to use Non-Negative Matrix Factorization, set estimator to 'NMF';
    otherwise, defaults to Latent Dirichlet Allocation ('LDA').
    """
    self.n_topics = n_topics

    if estimator == 'LSA':
        self.estimator = TruncatedSVD(n_components=self.n_topics)
    elif estimator == 'NMF':
        self.estimator = NMF(n_components=self.n_topics)
    else:
        # note: n_topics was renamed to n_components in scikit-learn 0.19
        self.estimator = LatentDirichletAllocation(n_topics=self.n_topics)

    self.model = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None, lowercase=False)),
        ('model', self.estimator)
    ])
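
A short usage sketch of the class above; the enclosing class name does not appear in the snippet, so SklearnTopicModels is assumed here:

# hypothetical usage; 'SklearnTopicModels' is an assumed class name
lsa_model = SklearnTopicModels(n_topics=25, estimator='LSA')
nmf_model = SklearnTopicModels(n_topics=25, estimator='NMF')
lda_model = SklearnTopicModels()  # defaults to LDA with 50 topics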
Example #10
Source File: LDA_Analysis.py From Spider with MIT License

def word2vec(word_list, n_features=1000, topics=5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    # stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)

    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    # for a small corpus used only for
                                    # learning, 'batch' is preferable since
                                    # it leaves fewer parameters to tune
                                    learning_method='batch')
    # train the model with variational Bayes
    lda.fit(tf)

    # return what is needed to list the keywords of each topic in turn
    tf_feature_names = tf_vectorizer.get_feature_names()
    return lda, tf, tf_feature_names, tf_vectorizer

# visualize the topics
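
The function above returns the fitted model and vectorizer but leaves the per-topic keyword listing to the caller. A minimal sketch of that step, assuming a list of documents named word_list:

lda, tf, tf_feature_names, tf_vectorizer = word2vec(word_list, topics=5)
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    # indices of the n_top_words largest weights, in descending order
    top_words = [tf_feature_names[i]
                 for i in topic.argsort()[:-n_top_words - 1:-1]]
    print('Topic #%d: %s' % (topic_idx, ' '.join(top_words)))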
Example #11
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = np.random.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.fit(X)

    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda._perplexity_precomp_distr, X, invalid_n_samples)

    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda._perplexity_precomp_distr, X,
                         invalid_n_components)
Example #12
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_perplexity():
    # Test LDA perplexity for batch and online training;
    # perplexity should be lower after more iterations
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit(X)
        perp_1 = lda_1.perplexity(X, sub_sampling=False)

        lda_2.fit(X)
        perp_2 = lda_2.perplexity(X, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #13
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_score():
    # Test LDA score for batch and online training;
    # score should be higher after more iterations
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)

        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1)
Example #14
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2)
Example #15
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_empty_docs():
    """Test LDA on empty document (all-zero rows)."""
    Z = np.zeros((5, 4))
    for X in [Z, csr_matrix(Z)]:
        lda = LatentDirichletAllocation(max_iter=750).fit(X)
        assert_almost_equal(lda.components_.sum(axis=0),
                            np.ones(lda.components_.shape[1]))
Example #16
Source File: test_text2mat.py From hypertools with MIT License

def test_LDA_class_instance():
    user_model = LatentDirichletAllocation(n_components=15)
    assert text2mat(data, semantic=user_model, corpus=data)[0].shape[1] == 15
Example #17
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_doc_topic_distr_deprecation():
    # Test that the appropriate warning message is displayed when a user
    # attempts to pass the doc_topic_distr argument to the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    distr1 = lda.fit_transform(X)
    distr2 = None
    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
Example #18
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_negative_input():
    # fitting on a dense matrix with negative values should raise
    X = -np.ones((5, 10))
    lda = LatentDirichletAllocation()
    regex = r"^Negative values in data passed"
    assert_raises_regexp(ValueError, regex, lda.fit, X)
Example #19
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                    learning_offset=5., total_samples=30,
                                    random_state=rng)
    for i in range(2):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example #20
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_multi_jobs():
    n_components, X = _build_sparse_mtx()
    # Test LDA batch training with multi CPU
    for method in ('online', 'batch'):
        rng = np.random.RandomState(0)
        lda = LatentDirichletAllocation(n_components=n_components,
                                        n_jobs=2, learning_method=method,
                                        evaluate_every=1, random_state=rng)
        lda.fit(X)

        correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
        for c in lda.components_:
            top_idx = set(c.argsort()[-3:][::-1])
            assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example #21
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_no_component_error():
    # test `transform` and `perplexity` before `fit`
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    lda = LatentDirichletAllocation()
    regex = r"^no 'components_' attribute"
    assert_raises_regexp(NotFittedError, regex, lda.transform, X)
    assert_raises_regexp(NotFittedError, regex, lda.perplexity, X)
Example #22
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
Example #23
Source File: test_text2mat.py From hypertools with MIT License

def test_tfidf_LDA():
    assert isinstance(text2mat(data, vectorizer='TfidfVectorizer',
                               semantic='LatentDirichletAllocation',
                               corpus=data)[0], np.ndarray)
Example #24
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

def transformer_factory(self):
    return LatentDirichletAllocation(n_components=self.width,
                                     learning_method='online',
                                     random_state=71)
Example #25
Source File: test_text2mat.py From hypertools with MIT License

def test_text_model_params():
    assert isinstance(text2mat(data,
                               semantic={'model': 'LatentDirichletAllocation',
                                         'params': {
                                             'learning_method': 'batch'}},
                               corpus=data)[0], np.ndarray)
Example #26
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_default_prior_params():
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_components, X = _build_sparse_mtx()
    prior = 1. / n_components
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      doc_topic_prior=prior,
                                      topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      random_state=0)
    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)
Example #27
Source File: test_text2mat.py From hypertools with MIT License

def test_LDA_class():
    assert text2mat(data, semantic=LatentDirichletAllocation,
                    corpus=data)[0].shape[1] == 10
Example #28
Source File: topic.py From Python-DevOps with MIT License

def train_lda(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                    stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names()
    # note: n_topics was renamed to n_components in scikit-learn 0.19
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit(tf)
    return TOPIC(tf_features, lda)
Example #29
Source File: LatentDirichletAllocation.py From mltk-algo-contrib with Apache License 2.0

def __init__(self, options):
    self.handle_options(options)

    out_params = convert_params(
        options.get('params', {}),
        floats=['doc_topic_prior', 'learning_decay', 'learning_offset',
                'perp_tol', 'mean_change_tol'],
        strs=['learning_method'],
        ints=['k', 'max_iter', 'batch_size', 'evaluate_every',
              'total_samples', 'max_doc_update_iter', 'n_jobs', 'verbose',
              'random_state'],
        aliases={'k': 'n_topics'}
    )

    self.estimator = _LatentDirichletAllocation(**out_params)
Example #30
Source File: build_lda_model.py From altair with Apache License 2.0

def build_lda_model(code_scripts_list, topics, vocab,
                    use_binary=False, n_jobs=1):
    # Vectorize the python scripts with bag of words
    bow_model = CountVectorizer(analyzer="word", vocabulary=vocab,
                                binary=use_binary)
    bow_vector_values = bow_model.transform(code_scripts_list).toarray()

    # Train/Fit LDA
    # note: n_topics was renamed to n_components in scikit-learn 0.19
    lda_model = LatentDirichletAllocation(n_topics=topics,
                                          learning_method="online",
                                          random_state=0, n_jobs=n_jobs)
    lda_model.fit(bow_vector_values)

    return lda_model