Python sklearn.decomposition.TruncatedSVD() Examples
The following are 30 code examples of sklearn.decomposition.TruncatedSVD(). You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may also want to check out all available functions/classes of the module sklearn.decomposition, or try the search function.
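Before the project-specific examples, here is a minimal, self-contained sketch of the typical TruncatedSVD workflow: fit the estimator on a sparse matrix (no centering or densification required) and inspect the explained variance. The toy data and parameter values below are illustrative assumptions, not taken from any of the projects that follow.

from scipy.sparse import random as sparse_random
from sklearn.decomposition import TruncatedSVD

# Toy sparse matrix standing in for a TF-IDF or co-occurrence matrix.
X = sparse_random(100, 50, density=0.05, random_state=42)

# Reduce to 5 latent dimensions; TruncatedSVD works directly on sparse input.
svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
X_reduced = svd.fit_transform(X)

print(X_reduced.shape)                      # (100, 5)
print(svd.explained_variance_ratio_.sum())  # fraction of variance retained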
Example #1
Source File: recommender.py From atap with Apache License 2.0 | 7 votes |
def fit_transform(self, documents):
    # Vectorizer will be False if pipeline hasn't been fit yet;
    # trigger fit_transform and save the vectorizer and lexicon.
    if self.vectorizer == False:
        self.lexicon = self.pipeline.fit_transform(documents)
        self.vect = self.pipeline.named_steps['tfidf']
        self.knn = self.pipeline.named_steps['knn']
        self.save()
    # If there's a stored vectorizer and prefitted lexicon,
    # use them instead.
    else:
        self.vect = self.vectorizer
        self.knn = Pipeline([
            ('svd', TruncatedSVD(n_components=100)),
            ('knn', KNNTransformer(k=self.k, algorithm='ball_tree'))
        ])
        self.knn.fit_transform(self.lexicon)
Example #2
Source File: test_decompose.py From skutil with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_selective_tsvd():
    original = X
    cols = [original.columns[0], original.columns[1]]  # Only perform on first two columns...

    # should be the same as the transformed cols
    compare_cols = np.array(
        original[['petal length (cm)', 'petal width (cm)']].as_matrix())

    transformer = SelectiveTruncatedSVD(cols=cols, n_components=1).fit(original)
    transformed = transformer.transform(original)

    untouched_cols = np.array(transformed[['petal length (cm)', 'petal width (cm)']].as_matrix())
    assert_array_almost_equal(compare_cols, untouched_cols)

    assert 'Concept1' in transformed.columns
    assert transformed.shape[1] == 3
    assert isinstance(transformer.get_decomposition(), TruncatedSVD)
    assert SelectiveTruncatedSVD().get_decomposition() is None  # default None

    # test the selective mixin
    assert isinstance(transformer.cols, list)
Example #3
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def transform(self):
    ## get common vocabulary
    tfidf = self._init_word_ngram_tfidf(self.ngram)
    tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
    vocabulary = tfidf.vocabulary_
    ## obs tfidf
    tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
    X_obs = tfidf.fit_transform(self.obs_corpus)
    ## target tfidf
    tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
    X_target = tfidf.fit_transform(self.target_corpus)
    ## svd
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter,
                       random_state=config.RANDOM_SEED)
    svd.fit(scipy.sparse.vstack((X_obs, X_target)))
    X_obs = svd.transform(X_obs)
    X_target = svd.transform(X_target)
    ## cosine similarity
    sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
    sim = np.asarray(sim).squeeze()
    return sim
Example #4
Source File: test_forest.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_random_hasher():
    # test random forest hashing on circles dataset
    # make sure that it is linearly separable.
    # even after projected to two SVD dimensions
    # Note: Not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.)
Example #5
Source File: feature_vector_space.py From kaggle-HomeDepot with MIT License | 6 votes |
def transform(self):
    # ngrams
    obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
    target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
    # cooccurrence ngrams
    cooc_terms = list(map(lambda lst1, lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
    ## tfidf
    tfidf = self._init_word_ngram_tfidf(ngram=1)
    X = tfidf.fit_transform(cooc_terms)
    ## svd
    svd = TruncatedSVD(n_components=self.svd_dim,
                       n_iter=self.svd_n_iter,
                       random_state=config.RANDOM_SEED)
    return svd.fit_transform(X)

# 2nd in CrowdFlower (preprocessing_mikhail.py)
Example #6
Source File: feathernode.py From karateclub with GNU General Public License v3.0 | 6 votes |
def _reduce_dimensions(self, X):
    """
    Using Truncated SVD.

    Arg types:
        * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

    Return types:
        * **X** *(Numpy array)* - The reduced feature matrix of nodes.
    """
    svd = TruncatedSVD(n_components=self.reduction_dimensions,
                       n_iter=self.svd_iterations,
                       random_state=self.seed)
    svd.fit(X)
    X = svd.transform(X)
    return X
Example #7
Source File: recommender.py From atap with Apache License 2.0 | 6 votes |
def __init__(self, k=3, **kwargs):
    self.k = k
    self.pipeline = Pipeline([
        ('norm', TextNormalizer(minimum=10, maximum=100)),
        ('tfidf', TfidfVectorizer()),
        ('knn', Pipeline([
            ('svd', TruncatedSVD(n_components=100)),
            ('model', KNNTransformer(k=self.k, algorithm='ball_tree'))
        ]))
    ])

    self.lex_path = "lexicon.pkl"
    self.vect_path = "vect.pkl"
    self.vectorizer = False
    self.lexicon = None
    self.load()
Example #8
Source File: AE_ts_model.py From AE_ts with MIT License | 6 votes |
def plot_z_run(z_run, label):
    f1, ax1 = plt.subplots(2, 1)

    # First fit a PCA
    PCA_model = TruncatedSVD(n_components=3).fit(z_run)
    z_run_reduced = PCA_model.transform(z_run)
    ax1[0].scatter(z_run_reduced[:, 0], z_run_reduced[:, 1], c=label, marker='*', linewidths=0)
    ax1[0].set_title('PCA on z_run')

    # Then fit a tSNE
    tSNE_model = TSNE(verbose=2, perplexity=80, min_grad_norm=1E-12, n_iter=3000)
    z_run_tsne = tSNE_model.fit_transform(z_run)
    ax1[1].scatter(z_run_tsne[:, 0], z_run_tsne[:, 1], c=label, marker='*', linewidths=0)
    ax1[1].set_title('tSNE on z_run')

    plt.show()
    return
Example #9
Source File: topics.py From atap with Apache License 2.0 | 6 votes |
def __init__(self, n_topics=50, estimator='LDA'):
    """
    n_topics is the desired number of topics.
    To use Latent Semantic Analysis, set estimator to 'LSA';
    to use Non-Negative Matrix Factorization, set estimator to 'NMF';
    otherwise, defaults to Latent Dirichlet Allocation ('LDA').
    """
    self.n_topics = n_topics

    if estimator == 'LSA':
        self.estimator = TruncatedSVD(n_components=self.n_topics)
    elif estimator == 'NMF':
        self.estimator = NMF(n_components=self.n_topics)
    else:
        self.estimator = LatentDirichletAllocation(n_topics=self.n_topics)

    self.model = Pipeline([
        ('norm', TextNormalizer()),
        ('tfidf', CountVectorizer(tokenizer=identity,
                                  preprocessor=None, lowercase=False)),
        ('model', self.estimator)
    ])
Example #10
Source File: build.py From atap with Apache License 2.0 | 6 votes |
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps)
Example #11
Source File: run.py From themarketingtechnologist with Apache License 2.0 | 6 votes |
def reduce_dimensionality(X, n_features):
    """
    Apply PCA or SVD to reduce dimension to n_features.
    :param X:
    :param n_features:
    :return:
    """
    # Initialize reduction method: PCA or SVD
    # reducer = PCA(n_components=n_features)
    reducer = TruncatedSVD(n_components=n_features)
    # Fit and transform data to n_features-dimensional space
    reducer.fit(X)
    X = reducer.transform(X)
    logging.debug("Reduced number of features to {0}".format(n_features))
    logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
    return X
Example #12
Source File: tadw.py From karateclub with GNU General Public License v3.0 | 6 votes |
def _create_reduced_features(self, X):
    """
    Creating a dense reduced node feature matrix.

    Arg types:
        * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

    Return types:
        * **T** *(Numpy array)* - The reduced feature matrix of nodes.
    """
    svd = TruncatedSVD(n_components=self.reduction_dimensions,
                       n_iter=self.svd_iterations,
                       random_state=self.seed)
    svd.fit(X)
    T = svd.transform(X)
    return T.T
Example #13
Source File: reduce_kNN.py From practicalDataAnalysisCookbook with GNU General Public License v2.0 | 6 votes |
def fit_truncatedSVD(data):
    '''
        Fit the model with truncated SVD
        principal components
    '''
    # keyword parameters for the PCA
    kwrd_params = {
        'algorithm': 'randomized',
        'n_components': 5,
        'n_iter': 5,
        'random_state': 42,
        'tol': 0.0
    }

    # reduce the data
    reduced = reduceDimensions(cd.TruncatedSVD, data, **kwrd_params)

    # prepare the data for the classifier
    data_l = prepare_data(data, reduced, kwrd_params['n_components'])

    # fit the model
    class_fit_predict_print(data_l)

# the file name of the dataset
Example #14
Source File: test_embeddingsResolver.py From scattertext with Apache License 2.0 | 6 votes |
def test_resolve_embeddings(self):
    tdm = self.corpus.get_unigram_corpus().select(ClassPercentageCompactor(term_count=1))
    embeddings_resolver = EmbeddingsResolver(tdm)
    # embeddings = TruncatedSVD(n_components=20).fit_transform(tdm.get_term_doc_mat().T).T
    # embeddings_resolver.set_embeddings(embeddings)
    embeddings_resolver = embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
    if self.assertRaisesRegex:
        with self.assertRaisesRegex(Exception,
                                    "You have already set embeddings by running set_embeddings or set_embeddings_model."):
            embeddings_resolver.set_embeddings_model(None)
    embeddings_resolver = EmbeddingsResolver(tdm)
    embeddings_resolver = embeddings_resolver.set_embeddings_model(MockWord2Vec(tdm.get_terms()))
    if self.assertRaisesRegex:
        with self.assertRaisesRegex(Exception,
                                    "You have already set embeddings by running set_embeddings or set_embeddings_model."):
            embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
    c, axes = embeddings_resolver.project_embeddings(projection_model=TruncatedSVD(3))
    self.assertIsInstance(c, ParsedCorpus)
    self.assertEqual(axes.to_dict(),
                     pd.DataFrame(index=['speak'], data={'x': [0., ], 'y': [0., ]}).to_dict())
Example #15
Source File: test_truncated_svd.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_truncated_svd_eq_pca():
    # TruncatedSVD should be equal to PCA on centered data
    X_c = X - X.mean(axis=0)

    params = dict(n_components=10, random_state=42)

    svd = TruncatedSVD(algorithm='arpack', **params)
    pca = PCA(svd_solver='arpack', **params)

    Xt_svd = svd.fit_transform(X_c)
    Xt_pca = pca.fit_transform(X_c)

    assert_allclose(Xt_svd, Xt_pca, rtol=1e-9)
    assert_allclose(pca.mean_, 0, atol=1e-9)
    assert_allclose(svd.components_, pca.components_)
Example #16
Source File: grarep.py From GraRep with GNU General Public License v3.0 | 6 votes |
def optimize(self):
    """
    Learning an embedding.
    """
    print("\nOptimization started.\n")
    self.embeddings = []
    for step in tqdm(range(self.args.order)):
        target_matrix = self._create_target_matrix()
        svd = TruncatedSVD(n_components=self.args.dimensions,
                           n_iter=self.args.iterations,
                           random_state=self.args.seed)
        svd.fit(target_matrix)
        embedding = svd.transform(target_matrix)
        self.embeddings.append(embedding)
Example #17
Source File: processing.py From CAIL2019 with MIT License | 6 votes |
def do_tfidf_feature(df, tfidf):
    n_components = 30
    svd = TruncatedSVD(
        n_components=n_components, algorithm="arpack", random_state=2019
    )
    col_tfidf = tfidf.transform(df["col"])
    feature_names = tfidf.get_feature_names()
    ret_df = pd.DataFrame(col_tfidf.toarray(), columns=feature_names)
    return ret_df
    # NOTE: the early return above leaves the SVD branch below unreachable;
    # it is kept here as written in the original source.
    col_svd = svd.fit_transform(col_tfidf)
    best_features = [
        feature_names[i] + "i" for i in svd.components_[0].argsort()[::-1]
    ]
    ret_df = pd.DataFrame(col_svd, columns=best_features[:n_components])
    return ret_df
Example #18
Source File: ml_tune.py From ml-parameter-optimization with MIT License | 6 votes |
def dim_reduction_method(self):
    """
    select dimensionality reduction method
    """
    if self.dim_reduction == 'pca':
        return PCA()
    elif self.dim_reduction == 'factor-analysis':
        return FactorAnalysis()
    elif self.dim_reduction == 'fast-ica':
        return FastICA()
    elif self.dim_reduction == 'kernel-pca':
        return KernelPCA()
    elif self.dim_reduction == 'sparse-pca':
        return SparsePCA()
    elif self.dim_reduction == 'truncated-svd':
        return TruncatedSVD()
    elif self.dim_reduction is not None:
        raise ValueError('%s is not a supported dimensionality reduction method. Valid inputs are: '
                         '"pca", "factor-analysis", "fast-ica", "kernel-pca", "sparse-pca", "truncated-svd".'
                         % self.dim_reduction)
Example #19
Source File: topic.py From Python-DevOps with MIT License | 5 votes |
def train_lsa(corpus, n_topics, max_df=0.95, min_df=2, cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    tfidf = Normalizer().fit_transform(tfidf)
    lsa = TruncatedSVD(n_topics).fit(tfidf)
    return TOPIC(tfidf_features, lsa)
Example #20
Source File: word_utils.py From embedding with MIT License | 5 votes |
def latent_semantic_analysis(corpus_fname, output_fname):
    make_save_path(output_fname)
    corpus = [sent.replace('\n', '').strip() for sent in open(corpus_fname, 'r').readlines()]
    # construct co-occurrence matrix (=word_context)
    # dynamic weight if True. co-occurrence weight = [1, (w-1)/w, (w-2)/w, ..., 1/w]
    input_matrix, idx2vocab = sent_to_word_contexts_matrix(
        corpus, windows=3, min_tf=10,
        dynamic_weight=True, verbose=True)
    # compute truncated SVD on the co-occurrence matrix
    cooc_svd = TruncatedSVD(n_components=100)
    cooc_vecs = cooc_svd.fit_transform(input_matrix)
    with open(output_fname + "-cooc.vecs", 'w') as f1:
        for word, vec in zip(idx2vocab, cooc_vecs):
            str_vec = [str(el) for el in vec]
            f1.writelines(word + ' ' + ' '.join(str_vec) + "\n")
    # Shifted PPMI at k=0 (equal to PPMI)
    # pmi(word, contexts)
    # px: probability of rows (items)
    # py: probability of columns (features)
    pmi_matrix, _, _ = pmi(input_matrix, min_pmi=math.log(5))
    # compute truncated SVD
    # NOTE: as in the original source, fit_transform is called on input_matrix,
    # so the pmi_matrix computed above is not actually used here.
    pmi_svd = TruncatedSVD(n_components=100)
    pmi_vecs = pmi_svd.fit_transform(input_matrix)
    with open(output_fname + "-pmi.vecs", 'w') as f2:
        for word, vec in zip(idx2vocab, pmi_vecs):
            str_vec = [str(el) for el in vec]
            f2.writelines(word + ' ' + ' '.join(str_vec) + "\n")
Example #21
Source File: TruncatedSVD.py From mltk-algo-contrib with Apache License 2.0 | 5 votes |
def __init__(self, options):
    self.handle_options(options)
    out_params = convert_params(
        options.get('params', {}),
        floats=['tol'],
        strs=['algorithm'],
        ints=['k', 'n_iter', 'random_state'],
        aliases={'k': 'n_components'}
    )

    self.estimator = _TruncatedSVD(**out_params)
Example #22
Source File: firmware_clustering.py From Firmware_Slap with GNU General Public License v3.0 | 5 votes |
def single_cluster(all_functions, centroid_count=2):
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)
    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
    # func_sparse = svd.fit_transform(func_sparse)

    labels = []
    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)

    labels.append(result.labels_)
    print("Clusters {:<3} | Silhouette Score : {}".format(centroid_count, score))

    return result.labels_
Example #23
Source File: sent_utils.py From embedding with MIT License | 5 votes |
def latent_semantic_analysis(corpus_fname, output_fname, tokenizer_name="mecab"):
    make_save_path(output_fname)
    tokenizer = get_tokenizer(tokenizer_name)
    titles, raw_corpus, noun_corpus = [], [], []
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                title, document = line.strip().split("\u241E")
                titles.append(title)
                raw_corpus.append(document)
                nouns = tokenizer.nouns(document)
                noun_corpus.append(' '.join(nouns))
            except:
                continue
    # construct tf-idf matrix
    vectorizer = TfidfVectorizer(
        min_df=1,
        ngram_range=(1, 1),
        lowercase=True,
        tokenizer=lambda x: x.split())
    input_matrix = vectorizer.fit_transform(noun_corpus)
    # compute truncated SVD
    svd = TruncatedSVD(n_components=100)
    vecs = svd.fit_transform(input_matrix)
    with open(output_fname, 'w') as f:
        for doc_idx, vec in enumerate(vecs):
            str_vec = [str(el) for el in vec]
            f.writelines(titles[doc_idx] + "\u241E" + raw_corpus[doc_idx] + '\u241E' + ' '.join(str_vec) + "\n")
Example #24
Source File: cluster.py From text-classifier with Apache License 2.0 | 5 votes |
def show_plt(feature_matrix, labels):
    from sklearn.decomposition import TruncatedSVD
    import matplotlib.pyplot as plt
    svd = TruncatedSVD()
    plot_columns = svd.fit_transform(feature_matrix)
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels)
    plt.show()
Example #25
Source File: learn.py From partisan-discourse with Apache License 2.0 | 5 votes |
def construct_pipeline(classifier):
    """
    This function creates a feature extraction pipeline that accepts data
    from a CorpusLoader and appends the classification model to the end of
    the pipeline, returning a newly constructed Pipeline object that is
    ready to be fit and trained!
    """
    return Pipeline([
        # Create a Feature Union of Text Stats and Bag of Words
        ('union', FeatureUnion(
            transformer_list=[

                # Pipeline for pulling document structure features
                ('stats', Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer()),
                ])),

                # Pipeline for creating a bag of words TF-IDF vector
                ('bow', Pipeline([
                    ('tokens', TextNormalizer()),
                    ('tfidf', TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                    ('best', TruncatedSVD(n_components=1000)),
                ])),

            ],

            # weight components in feature union
            transformer_weights={
                'stats': 0.15,
                'bow': 0.85,
            },
        )),

        # Append the estimator to the end of the pipeline
        ('classifier', classifier),
    ])
Example #26
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License | 5 votes |
def create_features_from_dataframe(self, df_train: pd.DataFrame, df_test: pd.DataFrame):
    train_length = len(df_train)
    n_components = 30
    df_data: pd.DataFrame = pd.concat([df_train, df_test])
    pipeline = make_pipeline(
        OneHotEncoder(),
        TfidfTransformer(),
        TruncatedSVD(n_components=30, random_state=71)
    )
    features = pipeline.fit_transform(
        df_data[['ip', 'app', 'os', 'device', 'channel']].values).astype(np.float32)
    feature_columns = []
    for i in range(n_components):
        feature_columns.append(self.name + '_{}'.format(i))
    return pd.DataFrame(data=features[:train_length], columns=feature_columns), \
           pd.DataFrame(data=features[train_length:], columns=feature_columns)
Example #27
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License | 5 votes |
def create_features_from_dataframe(self, df_train: pd.DataFrame, df_test: pd.DataFrame):
    train_length = len(df_train)
    n_components = 30
    df_data: pd.DataFrame = pd.concat([df_train, df_test])
    pipeline = make_pipeline(
        OneHotEncoder(),
        TruncatedSVD(n_components=n_components, random_state=71)
    )
    features = pipeline.fit_transform(
        df_data[['ip', 'app', 'os', 'device', 'channel']].values).astype(np.float32)
    feature_columns = []
    for i in range(n_components):
        feature_columns.append(self.name + '_{}'.format(i))
    return pd.DataFrame(data=features[:train_length], columns=feature_columns), \
           pd.DataFrame(data=features[train_length:], columns=feature_columns)
Example #28
Source File: category_vector.py From talkingdata-adtracking-fraud-detection with MIT License | 5 votes |
def transformer_factory(self) -> TransformerMixin:
    return TruncatedSVD(n_components=self.width, random_state=71)
Example #29
Source File: firmware_clustering.py From Firmware_Slap with GNU General Public License v3.0 | 5 votes |
def get_single_cluster(all_functions, centroid_count=2):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)
    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)
    # func_sparse = svd.fit_transform(func_sparse)

    labels = []
    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)

    labels.append(result.labels_)
    # print("Clusters {:<3} | Silhouette Score : {}".format(centroid_count, score))

    return_dict['count'] = centroid_count
    return_dict['score'] = score
    return_dict['labels'] = result.labels_

    return return_dict
Example #30
Source File: text_char_tfidf_count_transformers.py From driverlessai-recipes with Apache License 2.0 | 5 votes |
def fit_transform(self, X: dt.Frame, y: np.array = None):
    X = X.to_pandas().astype(str).iloc[:, 0].fillna("NA")

    # Count Vectorizer
    self.cnt_vec = CountVectorizer(analyzer="char", ngram_range=(1, self.max_ngram))
    X = self.cnt_vec.fit_transform(X)

    # Truncated SVD
    if len(self.cnt_vec.vocabulary_) <= self.n_svd_comp:
        self.n_svd_comp = len(self.cnt_vec.vocabulary_) - 1
    self.truncated_svd = TruncatedSVD(n_components=self.n_svd_comp, random_state=2019)
    X = self.truncated_svd.fit_transform(X)

    return X