Python sklearn.decomposition.TruncatedSVD() Examples

The following are 30 code examples of sklearn.decomposition.TruncatedSVD(), drawn from open-source projects. You may also want to check out all available functions/classes of the module sklearn.decomposition, or try the search function.
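Before the project-specific examples, here is a minimal, self-contained sketch of the pattern most of them share: vectorize text into a sparse matrix, then reduce it with TruncatedSVD (latent semantic analysis). The toy corpus and parameter choices below are illustrative assumptions, not code taken from any of the projects.

from sklearn.decomposition import TruncatedSVD
from sklearn.feature_extraction.text import TfidfVectorizer

# Toy corpus; any list of strings works.
corpus = [
    "the cat sat on the mat",
    "the dog sat on the log",
    "cats and dogs make good pets",
]

# Build a sparse document-term matrix, then reduce it to 2 dense components.
tfidf = TfidfVectorizer()
X = tfidf.fit_transform(corpus)

svd = TruncatedSVD(n_components=2, random_state=42)
X_reduced = svd.fit_transform(X)

print(X_reduced.shape)                      # (3, 2)
print(svd.explained_variance_ratio_.sum())  # fraction of variance retained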
Example #1
Source File: recommender.py    From atap with Apache License 2.0
def fit_transform(self, documents):
        # Vectorizer will be False if the pipeline hasn't been fit yet;
        # trigger fit_transform and save the vectorizer and lexicon.
        if self.vectorizer is False:
            self.lexicon = self.pipeline.fit_transform(documents)
            self.vect = self.pipeline.named_steps['tfidf']
            self.knn = self.pipeline.named_steps['knn']
            self.save()
        # If there's a stored vectorizer and prefitted lexicon,
        # use them instead.
        else:
            self.vect = self.vectorizer
            self.knn = Pipeline([
                ('svd', TruncatedSVD(n_components=100)),
                ('knn', KNNTransformer(k=self.k, algorithm='ball_tree'))
            ])
            self.knn.fit_transform(self.lexicon) 
Example #2
Source File: test_decompose.py    From skutil with BSD 3-Clause "New" or "Revised" License
def test_selective_tsvd():
    original = X
    cols = [original.columns[0], original.columns[1]]  # Only perform on first two columns...
    compare_cols = np.array(
        original[['petal length (cm)', 'petal width (cm)']].as_matrix())  # should be the same as the trans cols

    transformer = SelectiveTruncatedSVD(cols=cols, n_components=1).fit(original)
    transformed = transformer.transform(original)

    untouched_cols = np.array(transformed[['petal length (cm)', 'petal width (cm)']].as_matrix())
    assert_array_almost_equal(compare_cols, untouched_cols)
    assert 'Concept1' in transformed.columns
    assert transformed.shape[1] == 3
    assert isinstance(transformer.get_decomposition(), TruncatedSVD)
    assert SelectiveTruncatedSVD().get_decomposition() is None  # default None

    # test the selective mixin
    assert isinstance(transformer.cols, list) 
Example #3
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def transform(self):
        ## get common vocabulary
        tfidf = self._init_word_ngram_tfidf(self.ngram)
        tfidf.fit(list(self.obs_corpus) + list(self.target_corpus))
        vocabulary = tfidf.vocabulary_
        ## obs tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_obs = tfidf.fit_transform(self.obs_corpus)
        ## target tfidf
        tfidf = self._init_word_ngram_tfidf(self.ngram, vocabulary)
        X_target = tfidf.fit_transform(self.target_corpus)
        ## svd
        svd = TruncatedSVD(n_components = self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        svd.fit(scipy.sparse.vstack((X_obs, X_target)))
        X_obs = svd.transform(X_obs)
        X_target = svd.transform(X_target)
        ## cosine similarity
        sim = list(map(dist_utils._cosine_sim, X_obs, X_target))
        sim = np.asarray(sim).squeeze()
        return sim 
Example #4
Source File: test_forest.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_random_hasher():
    # Test random forest hashing on the circles dataset and make sure
    # that it is linearly separable even after being projected to two
    # SVD dimensions.
    # Note: Not all random_states produce perfect results.
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    X, y = datasets.make_circles(factor=0.5)
    X_transformed = hasher.fit_transform(X)

    # test fit and transform:
    hasher = RandomTreesEmbedding(n_estimators=30, random_state=1)
    assert_array_equal(hasher.fit(X).transform(X).toarray(),
                       X_transformed.toarray())

    # one leaf active per data point per forest
    assert_equal(X_transformed.shape[0], X.shape[0])
    assert_array_equal(X_transformed.sum(axis=1), hasher.n_estimators)
    svd = TruncatedSVD(n_components=2)
    X_reduced = svd.fit_transform(X_transformed)
    linear_clf = LinearSVC()
    linear_clf.fit(X_reduced, y)
    assert_equal(linear_clf.score(X_reduced, y), 1.) 
Example #5
Source File: feature_vector_space.py    From kaggle-HomeDepot with MIT License
def transform(self):
        # ngrams
        obs_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.obs_ngram, "_"), self.obs_corpus))
        target_ngrams = list(map(lambda x: ngram_utils._ngrams(x.split(" "), self.target_ngram, "_"), self.target_corpus))
        # cooccurrence ngrams
        cooc_terms = list(map(lambda lst1,lst2: self._get_cooc_terms(lst1, lst2, "X"), obs_ngrams, target_ngrams))
        ## tfidf
        tfidf = self._init_word_ngram_tfidf(ngram=1)
        X = tfidf.fit_transform(cooc_terms)
        ## svd
        svd = TruncatedSVD(n_components=self.svd_dim, 
                n_iter=self.svd_n_iter, random_state=config.RANDOM_SEED)
        return svd.fit_transform(X)


# 2nd in CrowdFlower (preprocessing_mikhail.py) 
Example #6
Source File: feathernode.py    From karateclub with GNU General Public License v3.0
def _reduce_dimensions(self, X):
        """
        Using Truncated SVD.

        Arg types:
            * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

        Return types:
            * **X** *(Numpy array)* - The reduced feature matrix of nodes.
        """
        svd = TruncatedSVD(n_components=self.reduction_dimensions,
                           n_iter=self.svd_iterations,
                           random_state=self.seed)
        svd.fit(X)
        X = svd.transform(X)
        return X 
Example #7
Source File: recommender.py    From atap with Apache License 2.0
def __init__(self, k=3, **kwargs):
        self.k = k
        self.pipeline = Pipeline([
            ('norm', TextNormalizer(minimum=10, maximum=100)),
            ('tfidf', TfidfVectorizer()),
            ('knn', Pipeline([
                ('svd', TruncatedSVD(n_components=100)),
                ('model', KNNTransformer(k=self.k, algorithm='ball_tree'))
            ]))
        ])

        self.lex_path = "lexicon.pkl"
        self.vect_path = "vect.pkl"
        self.vectorizer = False
        self.lexicon = None
        self.load() 
Example #8
Source File: AE_ts_model.py    From AE_ts with MIT License
def plot_z_run(z_run, label):
    f1, ax1 = plt.subplots(2, 1)

    # First fit a PCA
    PCA_model = TruncatedSVD(n_components=3).fit(z_run)
    z_run_reduced = PCA_model.transform(z_run)
    ax1[0].scatter(z_run_reduced[:, 0], z_run_reduced[:, 1], c=label, marker='*', linewidths=0)
    ax1[0].set_title('PCA on z_run')

    # Then fit a t-SNE
    tSNE_model = TSNE(verbose=2, perplexity=80, min_grad_norm=1E-12, n_iter=3000)
    z_run_tsne = tSNE_model.fit_transform(z_run)
    ax1[1].scatter(z_run_tsne[:, 0], z_run_tsne[:, 1], c=label, marker='*', linewidths=0)
    ax1[1].set_title('tSNE on z_run')

    plt.show()
    return 
Example #9
Source File: topics.py    From atap with Apache License 2.0
def __init__(self, n_topics=50, estimator='LDA'):
        """
        n_topics is the desired number of topics.
        To use Latent Semantic Analysis, set estimator to 'LSA';
        to use Non-Negative Matrix Factorization, set estimator to 'NMF';
        otherwise, the model defaults to Latent Dirichlet Allocation ('LDA').
        """
        self.n_topics = n_topics

        if estimator == 'LSA':
            self.estimator = TruncatedSVD(n_components=self.n_topics)
        elif estimator == 'NMF':
            self.estimator = NMF(n_components=self.n_topics)
        else:
            self.estimator = LatentDirichletAllocation(n_topics=self.n_topics)

        self.model = Pipeline([
            ('norm', TextNormalizer()),
            ('tfidf', CountVectorizer(tokenizer=identity,
                                      preprocessor=None, lowercase=False)),
            ('model', self.estimator)
        ]) 
Example #10
Source File: build.py    From atap with Apache License 2.0
def create_pipeline(estimator, reduction=False):

    steps = [
        ('normalize', TextNormalizer()),
        ('vectorize', TfidfVectorizer(
            tokenizer=identity, preprocessor=None, lowercase=False
        ))
    ]

    if reduction:
        steps.append((
            'reduction', TruncatedSVD(n_components=10000)
        ))

    # Add the estimator
    steps.append(('classifier', estimator))
    return Pipeline(steps) 
Example #11
Source File: run.py    From themarketingtechnologist with Apache License 2.0
def reduce_dimensionality(X, n_features):
        """
        Apply PCA or SVD to reduce the dimensionality of X to n_features.
        :param X: the feature matrix to reduce
        :param n_features: the target number of dimensions
        :return: the reduced feature matrix
        """
        # Initialize reduction method: PCA or SVD
        # reducer = PCA(n_components=n_features)
        reducer = TruncatedSVD(n_components=n_features)
        # Fit and transform data to n_features-dimensional space
        reducer.fit(X)
        X = reducer.transform(X)
        logging.debug("Reduced number of features to {0}".format(n_features))
        logging.debug("Percentage explained: %s\n" % reducer.explained_variance_ratio_.sum())
        return X 
Example #12
Source File: tadw.py    From karateclub with GNU General Public License v3.0
def _create_reduced_features(self, X):
        """
        Creating a dense reduced node feature matrix.

        Arg types:
            * **X** *(Scipy COO or Numpy array)* - The wide feature matrix.

        Return types:
            * **T** *(Numpy array)* - The reduced feature matrix of nodes.
        """
        svd = TruncatedSVD(n_components=self.reduction_dimensions,
                           n_iter=self.svd_iterations,
                           random_state=self.seed)
        svd.fit(X)
        T = svd.transform(X)
        return T.T 
Example #13
Source File: reduce_kNN.py    From practicalDataAnalysisCookbook with GNU General Public License v2.0
def fit_truncatedSVD(data):
    '''
        Fit the model with truncated SVD principal components
    '''
    # keyword parameters for the truncated SVD
    kwrd_params = {
        'algorithm': 'randomized', 
        'n_components': 5, 
        'n_iter': 5,
        'random_state': 42, 
        'tol': 0.0
    }

    # reduce the data
    reduced = reduceDimensions(cd.TruncatedSVD, 
        data, **kwrd_params)

    # prepare the data for the classifier
    data_l = prepare_data(data, reduced, 
        kwrd_params['n_components'])

    # fit the model
    class_fit_predict_print(data_l)

# the file name of the dataset 
Example #14
Source File: test_embeddingsResolver.py    From scattertext with Apache License 2.0
def test_resolve_embeddings(self):
        tdm = self.corpus.get_unigram_corpus().select(ClassPercentageCompactor(term_count=1))
        embeddings_resolver = EmbeddingsResolver(tdm)
        # embeddings = TruncatedSVD(n_components=20).fit_transform(tdm.get_term_doc_mat().T).T
        # embeddings_resolver.set_embeddings(embeddings)
        embeddings_resolver = embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
        if self.assertRaisesRegex:
            with self.assertRaisesRegex(Exception,
                                        "You have already set embeddings by running set_embeddings or set_embeddings_model."):
                embeddings_resolver.set_embeddings_model(None)
        embeddings_resolver = EmbeddingsResolver(tdm)

        embeddings_resolver = embeddings_resolver.set_embeddings_model(MockWord2Vec(tdm.get_terms()))
        if self.assertRaisesRegex:
            with self.assertRaisesRegex(Exception,
                                        "You have already set embeddings by running set_embeddings or set_embeddings_model."):
                embeddings_resolver.set_embeddings(tdm.get_term_doc_mat())
        c, axes = embeddings_resolver.project_embeddings(projection_model=TruncatedSVD(3))
        self.assertIsInstance(c, ParsedCorpus)
        self.assertEqual(axes.to_dict(), pd.DataFrame(index=['speak'], data={'x': [0.,], 'y':[0.,]}).to_dict()) 
Example #15
Source File: test_truncated_svd.py    From Mastering-Elasticsearch-7.0 with MIT License
def test_truncated_svd_eq_pca():
    # TruncatedSVD should be equal to PCA on centered data

    X_c = X - X.mean(axis=0)

    params = dict(n_components=10, random_state=42)

    svd = TruncatedSVD(algorithm='arpack', **params)
    pca = PCA(svd_solver='arpack', **params)

    Xt_svd = svd.fit_transform(X_c)
    Xt_pca = pca.fit_transform(X_c)

    assert_allclose(Xt_svd, Xt_pca, rtol=1e-9)
    assert_allclose(pca.mean_, 0, atol=1e-9)
    assert_allclose(svd.components_, pca.components_) 
Example #16
Source File: grarep.py    From GraRep with GNU General Public License v3.0
def optimize(self):
        """
        Learning an embedding.
        """
        print("\nOptimization started.\n")
        self.embeddings = []
        for step in tqdm(range(self.args.order)):
            target_matrix = self._create_target_matrix()

            svd = TruncatedSVD(n_components=self.args.dimensions,
                               n_iter=self.args.iterations,
                               random_state=self.args.seed)

            svd.fit(target_matrix)
            embedding = svd.transform(target_matrix)
            self.embeddings.append(embedding) 
Example #17
Source File: processing.py    From CAIL2019 with MIT License
def do_tfidf_feature(df, tfidf):
    n_components = 30
    svd = TruncatedSVD(
        n_components=n_components, algorithm="arpack", random_state=2019
    )

    col_tfidf = tfidf.transform(df["col"])

    feature_names = tfidf.get_feature_names()
    ret_df = pd.DataFrame(col_tfidf.toarray(), columns=feature_names)
    return ret_df

    col_svd = svd.fit_transform(col_tfidf)

    best_fearures = [
        feature_names[i] + "i" for i in svd.components_[0].argsort()[::-1]
    ]
    ret_df = pd.DataFrame(col_svd, columns=best_fearures[:n_components])
    return ret_df 
Example #18
Source File: ml_tune.py    From ml-parameter-optimization with MIT License
def dim_reduction_method(self):
        """
        select dimensionality reduction method
        """
        if self.dim_reduction == 'pca':
            return PCA()
        elif self.dim_reduction == 'factor-analysis':
            return FactorAnalysis()
        elif self.dim_reduction == 'fast-ica':
            return FastICA()
        elif self.dim_reduction == 'kernel-pca':
            return KernelPCA()
        elif self.dim_reduction == 'sparse-pca':
            return SparsePCA()
        elif self.dim_reduction == 'truncated-svd':
            return TruncatedSVD()
        elif self.dim_reduction is not None:
            raise ValueError('%s is not a supported dimensionality reduction method. Valid inputs are: '
                             '"pca", "factor-analysis", "fast-ica", "kernel-pca", "sparse-pca", "truncated-svd".'
                             % self.dim_reduction) 
Example #19
Source File: topic.py    From Python-DevOps with MIT License
def train_lsa(corpus, n_topics, max_df=0.95, min_df=2, cleaning=clearstring, stop_words='english'):
    if cleaning is not None:
        for i in range(len(corpus)):
            corpus[i] = cleaning(corpus[i])
    tfidf_vectorizer = TfidfVectorizer(max_df=max_df, min_df=min_df, stop_words=stop_words)
    tfidf = tfidf_vectorizer.fit_transform(corpus)
    tfidf_features = tfidf_vectorizer.get_feature_names()
    tfidf = Normalizer().fit_transform(tfidf)
    lsa = TruncatedSVD(n_topics).fit(tfidf)
    return TOPIC(tfidf_features, lsa) 
Example #20
Source File: word_utils.py    From embedding with MIT License
def latent_semantic_analysis(corpus_fname, output_fname):
    make_save_path(output_fname)
    corpus = [sent.replace('\n', '').strip() for sent in open(corpus_fname, 'r').readlines()]
    # construct co-occurrence matrix (=word_context)
    # dynamic weight if True. co-occurrence weight = [1, (w-1)/w, (w-2)/w, ... 1/w]
    input_matrix, idx2vocab = sent_to_word_contexts_matrix(
        corpus,
        windows=3,
        min_tf=10,
        dynamic_weight=True,
        verbose=True)
    # compute truncated SVD
    cooc_svd = TruncatedSVD(n_components=100)
    cooc_vecs = cooc_svd.fit_transform(input_matrix)
    with open(output_fname + "-cooc.vecs", 'w') as f1:
        for word, vec in zip(idx2vocab, cooc_vecs):
            str_vec = [str(el) for el in vec]
            f1.writelines(word + ' ' + ' '.join(str_vec) + "\n")
    # Shift PPMI at k=0, (equal PPMI)
    # pmi(word, contexts)
    # px: Probability of rows(items)
    # py: Probability of columns(features)
    pmi_matrix, _, _ = pmi(input_matrix, min_pmi=math.log(5))
    # compute truncated SVD
    pmi_svd = TruncatedSVD(n_components=100)
    pmi_vecs = pmi_svd.fit_transform(pmi_matrix)
    with open(output_fname + "-pmi.vecs", 'w') as f2:
        for word, vec in zip(idx2vocab, pmi_vecs):
            str_vec = [str(el) for el in vec]
            f2.writelines(word + ' ' + ' '.join(str_vec) + "\n") 
Example #21
Source File: TruncatedSVD.py    From mltk-algo-contrib with Apache License 2.0
def __init__(self, options):
        self.handle_options(options)
        out_params = convert_params(
            options.get('params', {}),
            floats=['tol'],
            strs=['algorithm'],
            ints=['k','n_iter','random_state'],
            aliases={'k': 'n_components'}
        )

        self.estimator = _TruncatedSVD(**out_params) 
Example #22
Source File: firmware_clustering.py    From Firmware_Slap with GNU General Public License v3.0
def single_cluster(all_functions, centroid_count=2):
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    print("Clusters {:<3} | Silhoette Score : {}".format(
        centroid_count, score))

    return result.labels_ 
Example #23
Source File: sent_utils.py    From embedding with MIT License
def latent_semantic_analysis(corpus_fname, output_fname, tokenizer_name="mecab"):
    make_save_path(output_fname)
    tokenizer = get_tokenizer(tokenizer_name)
    titles, raw_corpus, noun_corpus = [], [], []
    with open(corpus_fname, 'r', encoding='utf-8') as f:
        for line in f:
            try:
                title, document = line.strip().split("\u241E")
                titles.append(title)
                raw_corpus.append(document)
                nouns = tokenizer.nouns(document)
                noun_corpus.append(' '.join(nouns))
            except:
                continue
    # construct tf-idf matrix
    vectorizer = TfidfVectorizer(
        min_df=1,
        ngram_range=(1, 1),
        lowercase=True,
        tokenizer=lambda x: x.split())
    input_matrix = vectorizer.fit_transform(noun_corpus)
    # compute truncated SVD
    svd = TruncatedSVD(n_components=100)
    vecs = svd.fit_transform(input_matrix)
    with open(output_fname, 'w') as f:
        for doc_idx, vec in enumerate(vecs):
            str_vec = [str(el) for el in vec]
            f.writelines(titles[doc_idx] + "\u241E" + raw_corpus[doc_idx] + '\u241E' + ' '.join(str_vec) + "\n") 
Example #24
Source File: cluster.py    From text-classifier with Apache License 2.0
def show_plt(feature_matrix, labels):
    from sklearn.decomposition import TruncatedSVD
    import matplotlib.pyplot as plt
    svd = TruncatedSVD()
    plot_columns = svd.fit_transform(feature_matrix)
    plt.scatter(x=plot_columns[:, 0], y=plot_columns[:, 1], c=labels)
    plt.show() 
Example #25
Source File: learn.py    From partisan-discourse with Apache License 2.0
def construct_pipeline(classifier):
    """
    This function creates a feature extraction pipeline that accepts data
    from a CorpusLoader and appends the classification model to the end of
    the pipeline, returning a newly constructed Pipeline object that is
    ready to be fit and trained!
    """

    return Pipeline([
        # Create a Feature Union of Text Stats and Bag of Words
        ('union', FeatureUnion(
            transformer_list = [

                # Pipeline for pulling document structure features
                ('stats', Pipeline([
                    ('stats', TextStats()),
                    ('vect', DictVectorizer()),
                ])),

                # Pipeline for creating a bag of words TF-IDF vector
                ('bow', Pipeline([
                    ('tokens', TextNormalizer()),
                    ('tfidf',  TfidfVectorizer(
                        tokenizer=identity, preprocessor=None, lowercase=False
                    )),
                    ('best', TruncatedSVD(n_components=1000)),
                ])),

            ],

            # weight components in feature union
            transformer_weights = {
                'stats': 0.15,
                'bow': 0.85,
            },
        )),

        # Append the estimator to the end of the pipeline
        ('classifier', classifier),
    ]) 
Example #26
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def create_features_from_dataframe(self, df_train: pd.DataFrame, df_test: pd.DataFrame):
        train_length = len(df_train)
        n_components = 30
        df_data: pd.DataFrame = pd.concat([df_train, df_test])
        pipeline = make_pipeline(
            OneHotEncoder(),
            TfidfTransformer(),
            TruncatedSVD(n_components=n_components, random_state=71)
        )
        features = pipeline.fit_transform(df_data[['ip', 'app', 'os', 'device', 'channel']].values).astype(np.float32)
        feature_columns = []
        for i in range(n_components):
            feature_columns.append(self.name + '_{}'.format(i))
        return pd.DataFrame(data=features[:train_length], columns=feature_columns), \
               pd.DataFrame(data=features[train_length:], columns=feature_columns) 
Example #27
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def create_features_from_dataframe(self, df_train: pd.DataFrame, df_test: pd.DataFrame):
        train_length = len(df_train)
        n_components = 30
        df_data: pd.DataFrame = pd.concat([df_train, df_test])
        pipeline = make_pipeline(
            OneHotEncoder(),
            TruncatedSVD(n_components=n_components, random_state=71)
        )
        features = pipeline.fit_transform(df_data[['ip', 'app', 'os', 'device', 'channel']].values).astype(np.float32)
        feature_columns = []
        for i in range(n_components):
            feature_columns.append(self.name + '_{}'.format(i))
        return pd.DataFrame(data=features[:train_length], columns=feature_columns), \
               pd.DataFrame(data=features[train_length:], columns=feature_columns) 
Example #28
Source File: category_vector.py    From talkingdata-adtracking-fraud-detection with MIT License
def transformer_factory(self) -> TransformerMixin:
        return TruncatedSVD(n_components=self.width, random_state=71) 
Example #29
Source File: firmware_clustering.py    From Firmware_Slap with GNU General Public License v3.0
def get_single_cluster(all_functions, centroid_count=2):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)

    func_sparse = transformer.transform(func_sparse)

    # svd = TruncatedSVD(random_state=2)
    # svd = TruncatedSVD(n_components=5, n_iter=7, random_state=42)

    # func_sparse = svd.fit_transform(func_sparse)

    labels = []

    result = KMeans(n_clusters=centroid_count, random_state=2).fit(func_sparse)

    score = silhouette_score(func_sparse,
                             result.labels_,
                             metric="cosine",
                             random_state=2,
                             sample_size=5000)
    labels.append(result.labels_)

    #print("Clusters {:<3} | Silhoette Score : {}".format(centroid_count, score))
    return_dict['count'] = centroid_count
    return_dict['score'] = score
    return_dict['labels'] = result.labels_

    return return_dict 
Example #30
Source File: text_char_tfidf_count_transformers.py    From driverlessai-recipes with Apache License 2.0
def fit_transform(self, X: dt.Frame, y: np.array = None):
        X = X.to_pandas().astype(str).iloc[:, 0].fillna("NA")
        # Count Vectorizer
        self.cnt_vec = CountVectorizer(analyzer="char", ngram_range=(1, self.max_ngram))
        X = self.cnt_vec.fit_transform(X)
        # Truncated SVD
        if len(self.cnt_vec.vocabulary_) <= self.n_svd_comp:
            self.n_svd_comp = len(self.cnt_vec.vocabulary_) - 1
        self.truncated_svd = TruncatedSVD(n_components=self.n_svd_comp, random_state=2019)
        X = self.truncated_svd.fit_transform(X)
        return X