Python sklearn.decomposition.LatentDirichletAllocation() Examples
The following are 30 code examples of sklearn.decomposition.LatentDirichletAllocation(), drawn from open-source projects. Each example is listed with its source file, project, and license. You may also want to check out the other available functions and classes of the sklearn.decomposition module.
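
Before the project examples below, here is a minimal, self-contained sketch of the estimator's basic usage; the toy count matrix and parameter values are illustrative and not taken from any of the listed projects:

import numpy as np
from sklearn.decomposition import LatentDirichletAllocation

# toy document-term count matrix: 6 documents, 8 vocabulary terms
rng = np.random.RandomState(0)
X = rng.randint(5, size=(6, 8))

lda = LatentDirichletAllocation(n_components=3, random_state=0)
doc_topic = lda.fit_transform(X)   # shape (6, 3): per-document topic weights
print(doc_topic.shape)             # (6, 3)
print(lda.components_.shape)       # (3, 8): per-topic term weights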
Example #1
Source File: test_decomposition.py From pandas-ml with BSD 3-Clause "New" or "Revised" License

def test_objectmapper(self):
    df = pdml.ModelFrame([])
    self.assertIs(df.decomposition.PCA, decomposition.PCA)
    self.assertIs(df.decomposition.IncrementalPCA,
                  decomposition.IncrementalPCA)
    self.assertIs(df.decomposition.KernelPCA, decomposition.KernelPCA)
    self.assertIs(df.decomposition.FactorAnalysis,
                  decomposition.FactorAnalysis)
    self.assertIs(df.decomposition.FastICA, decomposition.FastICA)
    self.assertIs(df.decomposition.TruncatedSVD, decomposition.TruncatedSVD)
    self.assertIs(df.decomposition.NMF, decomposition.NMF)
    self.assertIs(df.decomposition.SparsePCA, decomposition.SparsePCA)
    self.assertIs(df.decomposition.MiniBatchSparsePCA,
                  decomposition.MiniBatchSparsePCA)
    self.assertIs(df.decomposition.SparseCoder, decomposition.SparseCoder)
    self.assertIs(df.decomposition.DictionaryLearning,
                  decomposition.DictionaryLearning)
    self.assertIs(df.decomposition.MiniBatchDictionaryLearning,
                  decomposition.MiniBatchDictionaryLearning)
    self.assertIs(df.decomposition.LatentDirichletAllocation,
                  decomposition.LatentDirichletAllocation)
Example #2
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def check_verbosity(verbose, evaluate_every, expected_lines,
                    expected_perplexities):
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
                                    learning_method='batch',
                                    verbose=verbose,
                                    evaluate_every=evaluate_every,
                                    random_state=0)
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        sys.stdout = old_out

    n_lines = out.getvalue().count('\n')
    n_perplexity = out.getvalue().count('perplexity')
    assert_equal(expected_lines, n_lines)
    assert_equal(expected_perplexities, n_perplexity)
Example #3
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2)
Example #4
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_score(method):
    # Test LDA score for batch and online training;
    # score should be higher after more iterations
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit_transform(X)
    score_1 = lda_1.score(X)

    lda_2.fit_transform(X)
    score_2 = lda_2.score(X)
    assert_greater_equal(score_2, score_1)
Example #5
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_perplexity(method):
    # Test LDA perplexity for batch and online training;
    # perplexity should be lower after more iterations
    n_components, X = _build_sparse_mtx()
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=1, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      max_iter=10, learning_method=method,
                                      total_samples=100, random_state=0)
    lda_1.fit(X)
    perp_1 = lda_1.perplexity(X, sub_sampling=False)

    lda_2.fit(X)
    perp_2 = lda_2.perplexity(X, sub_sampling=False)
    assert_greater_equal(perp_1, perp_2)

    perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
    perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
    assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #6
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = np.random.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.fit(X)

    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda._perplexity_precomp_distr, X, invalid_n_samples)

    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda._perplexity_precomp_distr, X,
                         invalid_n_components)
Example #7
Source File: test_online_lda.py From Mastering-Elasticsearch-7.0 with MIT License

def check_verbosity(verbose, evaluate_every, expected_lines,
                    expected_perplexities):
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=3,
                                    learning_method='batch',
                                    verbose=verbose,
                                    evaluate_every=evaluate_every,
                                    random_state=0)
    out = StringIO()
    old_out, sys.stdout = sys.stdout, out
    try:
        lda.fit(X)
    finally:
        sys.stdout = old_out

    n_lines = out.getvalue().count('\n')
    n_perplexity = out.getvalue().count('perplexity')
    assert_equal(expected_lines, n_lines)
    assert_equal(expected_perplexities, n_perplexity)
Example #8
Source File: topic.py From Python-DevOps with MIT License

def train_lda(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tf_vectorizer = CountVectorizer(
        max_df=max_df, min_df=min_df, stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names()
    # note: the n_topics parameter was renamed to n_components in
    # scikit-learn 0.19, so this call requires an older scikit-learn
    lda = LatentDirichletAllocation(
        n_topics=n_topics, max_iter=5, learning_method='online',
        learning_offset=50., random_state=0).fit(tf)
    return TOPIC(tf_features, lda)
Example #9
Source File: topics.py From atap with Apache License 2.0

def __init__(self, n_topics=50, estimator='LDA'):
    """
    n_topics is the desired number of topics

    To use Latent Semantic Analysis, set estimator to 'LSA';
    to use Non-Negative Matrix Factorization, set estimator to 'NMF';
    otherwise, defaults to Latent Dirichlet Allocation ('LDA').
    """
    self.n_topics = n_topics

    if estimator == 'LSA':
        self.estimator = TruncatedSVD(n_components=self.n_topics)
    elif estimator == 'NMF':
        self.estimator = NMF(n_components=self.n_topics)
    else:
        # note: n_topics was renamed to n_components in scikit-learn 0.19
        self.estimator = LatentDirichletAllocation(n_topics=self.n_topics)

    self.model = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None, lowercase=False)),
        ('model', self.estimator)
    ])
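
A short usage sketch of the class above; the enclosing class name does not appear in the snippet, so SklearnTopicModels is assumed here:

# hypothetical usage; 'SklearnTopicModels' is an assumed class name
lsa_model = SklearnTopicModels(n_topics=25, estimator='LSA')
nmf_model = SklearnTopicModels(n_topics=25, estimator='NMF')
lda_model = SklearnTopicModels()  # defaults to LDA with 50 topics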
Example #10
Source File: LDA_Analysis.py From Spider with MIT License

def word2vec(word_list, n_features=1000, topics=5):
    tf_vectorizer = CountVectorizer(strip_accents='unicode',
                                    max_features=n_features,
                                    # stop_words='english',
                                    max_df=0.5,
                                    min_df=10)
    tf = tf_vectorizer.fit_transform(word_list)

    lda = LatentDirichletAllocation(n_components=topics,  # number of topics
                                    # for a small corpus used only for
                                    # learning, 'batch' is preferable since
                                    # it leaves fewer parameters to tune
                                    learning_method='batch')
    # train the model with variational Bayes
    lda.fit(tf)

    # return what is needed to list the keywords of each topic in turn
    tf_feature_names = tf_vectorizer.get_feature_names()
    return lda, tf, tf_feature_names, tf_vectorizer

# visualize the topics
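
The function above returns the fitted model and vectorizer but leaves the per-topic keyword listing to the caller. A minimal sketch of that step, assuming a list of documents named word_list:

lda, tf, tf_feature_names, tf_vectorizer = word2vec(word_list, topics=5)
n_top_words = 10
for topic_idx, topic in enumerate(lda.components_):
    # indices of the n_top_words largest weights, in descending order
    top_words = [tf_feature_names[i]
                 for i in topic.argsort()[:-n_top_words - 1:-1]]
    print('Topic #%d: %s' % (topic_idx, ' '.join(top_words)))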
Example #11
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_preplexity_mismatch():
    # test dimension mismatch in `perplexity` method
    rng = np.random.RandomState(0)
    n_components = rng.randint(3, 6)
    n_samples = rng.randint(6, 10)
    X = np.random.randint(4, size=(n_samples, 10))
    lda = LatentDirichletAllocation(n_components=n_components,
                                    learning_offset=5., total_samples=20,
                                    random_state=rng)
    lda.fit(X)

    # invalid samples
    invalid_n_samples = rng.randint(4, size=(n_samples + 1, n_components))
    assert_raises_regexp(ValueError, r'Number of samples',
                         lda._perplexity_precomp_distr, X, invalid_n_samples)

    # invalid topic number
    invalid_n_components = rng.randint(4, size=(n_samples, n_components + 1))
    assert_raises_regexp(ValueError, r'Number of topics',
                         lda._perplexity_precomp_distr, X,
                         invalid_n_components)
Example #12
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_perplexity():
    # Test LDA perplexity for batch and online training;
    # perplexity should be lower after more iterations
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit(X)
        perp_1 = lda_1.perplexity(X, sub_sampling=False)

        lda_2.fit(X)
        perp_2 = lda_2.perplexity(X, sub_sampling=False)
        assert_greater_equal(perp_1, perp_2)

        perp_1_subsampling = lda_1.perplexity(X, sub_sampling=True)
        perp_2_subsampling = lda_2.perplexity(X, sub_sampling=True)
        assert_greater_equal(perp_1_subsampling, perp_2_subsampling)
Example #13
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_score():
    # Test LDA score for batch and online training;
    # score should be higher after more iterations
    n_components, X = _build_sparse_mtx()
    for method in ('online', 'batch'):
        lda_1 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=1, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_2 = LatentDirichletAllocation(n_components=n_components,
                                          max_iter=10, learning_method=method,
                                          total_samples=100, random_state=0)
        lda_1.fit_transform(X)
        score_1 = lda_1.score(X)

        lda_2.fit_transform(X)
        score_2 = lda_2.score(X)
        assert_greater_equal(score_2, score_1)
Example #14
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_fit_perplexity():
    # Test that the perplexity computed during fit is consistent with what is
    # returned by the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch', random_state=0,
                                    evaluate_every=1)
    lda.fit(X)

    # Perplexity computed at end of fit method
    perplexity1 = lda.bound_

    # Result of perplexity method on the train set
    perplexity2 = lda.perplexity(X)

    assert_almost_equal(perplexity1, perplexity2)
Example #15
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_empty_docs():
    """Test LDA on empty document (all-zero rows)."""
    Z = np.zeros((5, 4))
    for X in [Z, csr_matrix(Z)]:
        lda = LatentDirichletAllocation(max_iter=750).fit(X)
        assert_almost_equal(lda.components_.sum(axis=0),
                            np.ones(lda.components_.shape[1]))
Example #16
Source File: test_text2mat.py From hypertools with MIT License

def test_LDA_class_instance():
    user_model = LatentDirichletAllocation(n_components=15)
    assert text2mat(data, semantic=user_model, corpus=data)[0].shape[1] == 15
Example #17
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_doc_topic_distr_deprecation():
    # Test that the appropriate warning message is displayed when a user
    # attempts to pass the doc_topic_distr argument to the perplexity method
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=1,
                                    learning_method='batch',
                                    total_samples=100, random_state=0)
    distr1 = lda.fit_transform(X)
    distr2 = None
    assert_warns(DeprecationWarning, lda.perplexity, X, distr1)
    assert_warns(DeprecationWarning, lda.perplexity, X, distr2)
Example #18
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_negative_input():
    # fitting on a dense matrix with negative values should raise
    X = -np.ones((5, 10))
    lda = LatentDirichletAllocation()
    regex = r"^Negative values in data passed"
    assert_raises_regexp(ValueError, regex, lda.fit, X)
Example #19
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_partial_fit_multi_jobs():
    # Test LDA online training with multi CPU
    rng = np.random.RandomState(0)
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, n_jobs=2,
                                    learning_offset=5., total_samples=30,
                                    random_state=rng)
    for i in range(2):
        lda.partial_fit(X)

    correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
    for c in lda.components_:
        top_idx = set(c.argsort()[-3:][::-1])
        assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example #20
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_multi_jobs():
    n_components, X = _build_sparse_mtx()
    # Test LDA batch training with multi CPU
    for method in ('online', 'batch'):
        rng = np.random.RandomState(0)
        lda = LatentDirichletAllocation(n_components=n_components,
                                        n_jobs=2, learning_method=method,
                                        evaluate_every=1, random_state=rng)
        lda.fit(X)

        correct_idx_grps = [(0, 1, 2), (3, 4, 5), (6, 7, 8)]
        for c in lda.components_:
            top_idx = set(c.argsort()[-3:][::-1])
            assert_true(tuple(sorted(top_idx)) in correct_idx_grps)
Example #21
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_no_component_error():
    # test `transform` and `perplexity` before `fit`
    rng = np.random.RandomState(0)
    X = rng.randint(4, size=(20, 10))
    lda = LatentDirichletAllocation()
    regex = r"^no 'components_' attribute"
    assert_raises_regexp(NotFittedError, regex, lda.transform, X)
    assert_raises_regexp(NotFittedError, regex, lda.perplexity, X)
Example #22
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_score_perplexity():
    # Test the relationship between LDA score and perplexity
    n_components, X = _build_sparse_mtx()
    lda = LatentDirichletAllocation(n_components=n_components, max_iter=10,
                                    random_state=0)
    lda.fit(X)
    perplexity_1 = lda.perplexity(X, sub_sampling=False)

    score = lda.score(X)
    perplexity_2 = np.exp(-1. * (score / np.sum(X.data)))
    assert_almost_equal(perplexity_1, perplexity_2)
Example #23
Source File: test_text2mat.py From hypertools with MIT License

def test_tfidf_LDA():
    assert isinstance(text2mat(data, vectorizer='TfidfVectorizer',
                               semantic='LatentDirichletAllocation',
                               corpus=data)[0], np.ndarray)
Example #24
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License

def transformer_factory(self):
    return LatentDirichletAllocation(n_components=self.width,
                                     learning_method='online',
                                     random_state=71)
Example #25
Source File: test_text2mat.py From hypertools with MIT License

def test_text_model_params():
    assert isinstance(text2mat(data,
                               semantic={'model': 'LatentDirichletAllocation',
                                         'params': {
                                             'learning_method': 'batch'}},
                               corpus=data)[0], np.ndarray)
Example #26
Source File: test_online_lda.py From twitter-stock-recommendation with MIT License

def test_lda_default_prior_params():
    # default prior parameter should be `1 / topics`
    # and verbose params should not affect result
    n_components, X = _build_sparse_mtx()
    prior = 1. / n_components
    lda_1 = LatentDirichletAllocation(n_components=n_components,
                                      doc_topic_prior=prior,
                                      topic_word_prior=prior, random_state=0)
    lda_2 = LatentDirichletAllocation(n_components=n_components,
                                      random_state=0)
    topic_distr_1 = lda_1.fit_transform(X)
    topic_distr_2 = lda_2.fit_transform(X)
    assert_almost_equal(topic_distr_1, topic_distr_2)
Example #27
Source File: test_text2mat.py From hypertools with MIT License

def test_LDA_class():
    assert text2mat(data, semantic=LatentDirichletAllocation,
                    corpus=data)[0].shape[1] == 10
Example #28
Source File: topic.py From Python-DevOps with MIT License

def train_lda(corpus, n_topics=10, max_df=0.95, min_df=2,
              cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tf_vectorizer = CountVectorizer(max_df=max_df, min_df=min_df,
                                    stop_words=stop_words)
    tf = tf_vectorizer.fit_transform(corpus)
    tf_features = tf_vectorizer.get_feature_names()
    # note: n_topics was renamed to n_components in scikit-learn 0.19
    lda = LatentDirichletAllocation(n_topics=n_topics, max_iter=5,
                                    learning_method='online',
                                    learning_offset=50.,
                                    random_state=0).fit(tf)
    return TOPIC(tf_features, lda)
Example #29
Source File: LatentDirichletAllocation.py From mltk-algo-contrib with Apache License 2.0

def __init__(self, options):
    self.handle_options(options)

    out_params = convert_params(
        options.get('params', {}),
        floats=['doc_topic_prior', 'learning_decay', 'learning_offset',
                'perp_tol', 'mean_change_tol'],
        strs=['learning_method'],
        ints=['k', 'max_iter', 'batch_size', 'evaluate_every',
              'total_samples', 'max_doc_update_iter', 'n_jobs', 'verbose',
              'random_state'],
        aliases={'k': 'n_topics'}
    )

    self.estimator = _LatentDirichletAllocation(**out_params)
Example #30
Source File: build_lda_model.py From altair with Apache License 2.0

def build_lda_model(code_scripts_list, topics, vocab,
                    use_binary=False, n_jobs=1):
    # Vectorize the python scripts with bag of words
    bow_model = CountVectorizer(analyzer="word", vocabulary=vocab,
                                binary=use_binary)
    bow_vector_values = bow_model.transform(code_scripts_list).toarray()

    # Train/Fit LDA
    # note: n_topics was renamed to n_components in scikit-learn 0.19
    lda_model = LatentDirichletAllocation(n_topics=topics,
                                          learning_method="online",
                                          random_state=0, n_jobs=n_jobs)
    lda_model.fit(bow_vector_values)

    return lda_model