Python sklearn.metrics.pairwise.cosine_distances() Examples
The following are 17 code examples of sklearn.metrics.pairwise.cosine_distances(), drawn from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all the available functions and classes of the module sklearn.metrics.pairwise.
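Before the project examples, here is a minimal sketch of what the function computes (the arrays below are illustrative, not from any of the projects): cosine_distances(X, Y) returns a matrix D with D[i, j] = 1 - cosine_similarity(X[i], Y[j]), so every value lies in [0.0, 2.0].

import numpy as np
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

X = np.array([[1.0, 0.0], [1.0, 1.0]])
Y = np.array([[0.0, 1.0]])

# D[i, j] = 1 - cosine_similarity(X[i], Y[j]); orthogonal rows give 1.0
D = cosine_distances(X, Y)
print(D)  # [[1.        ]
          #  [0.29289322]]
np.testing.assert_allclose(D, 1.0 - cosine_similarity(X, Y))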
Example #1
Source File: test_t_sne.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_tsne_with_different_distance_metrics():
    """Make sure that TSNE works for different distance metrics"""
    random_state = check_random_state(0)
    n_components_original = 3
    n_components_embedding = 2
    X = random_state.randn(50, n_components_original).astype(np.float32)
    metrics = ['manhattan', 'cosine']
    dist_funcs = [manhattan_distances, cosine_distances]
    for metric, dist_func in zip(metrics, dist_funcs):
        X_transformed_tsne = TSNE(
            metric=metric, n_components=n_components_embedding,
            random_state=0).fit_transform(X)
        X_transformed_tsne_precomputed = TSNE(
            metric='precomputed', n_components=n_components_embedding,
            random_state=0).fit_transform(dist_func(X))
        assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
Example #2
Source File: test_t_sne.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_tsne_with_different_distance_metrics():
    """Make sure that TSNE works for different distance metrics"""
    random_state = check_random_state(0)
    n_components_original = 3
    n_components_embedding = 2
    X = random_state.randn(50, n_components_original).astype(np.float32)
    metrics = ['manhattan', 'cosine']
    dist_funcs = [manhattan_distances, cosine_distances]
    for metric, dist_func in zip(metrics, dist_funcs):
        X_transformed_tsne = TSNE(
            metric=metric, n_components=n_components_embedding,
            random_state=0).fit_transform(X)
        X_transformed_tsne_precomputed = TSNE(
            metric='precomputed', n_components=n_components_embedding,
            random_state=0).fit_transform(dist_func(X))
        assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
Example #3
Source File: construct_hypergraph.py From DHGNN with MIT License | 6 votes |
def construct_H_with_KNN(X, K_neigs=[10], is_probH=False, m_prob=1):
    """
    init multi-scale hypergraph Vertex-Edge matrix from original node feature matrix
    :param X: N_object x feature_number
    :param K_neigs: the number of neighbor expansion
    :param is_probH: prob Vertex-Edge matrix or binary
    :param m_prob: prob
    :return: N_object x N_hyperedge
    """
    if len(X.shape) != 2:
        X = X.reshape(-1, X.shape[-1])

    if type(K_neigs) == int:
        K_neigs = [K_neigs]

    dis_mat = cos_dis(X)
    H = None
    for k_neig in K_neigs:
        H_tmp = construct_H_with_KNN_from_distance(dis_mat, k_neig, is_probH, m_prob)
        H = hyperedge_concat(H, H_tmp)
    return H
Example #4
Source File: construct_hypergraph.py From DHGNN with MIT License | 5 votes |
def _construct_edge_list_from_distance(X, k_neigh):
    """
    construct edge_list (numpy array) from kNN distance for single modality
    :param X -> numpy array: feature
    :param k_neigh -> int: # of neighbors
    :return: N * k_neigh numpy array
    """
    dis = cos_dis(X)
    dis = torch.Tensor(dis)
    _, k_idx = dis.topk(k_neigh, dim=-1, largest=False)
    return k_idx.numpy()
Example #5
Source File: test_pairwise.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cosine_distances():
    # Check the pairwise Cosine distances computation
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))
    XA = np.vstack([x, x])
    D = cosine_distances(XA)
    assert_array_almost_equal(D, [[0., 0.], [0., 0.]])
    # check that all elements are in [0, 2]
    assert_true(np.all(D >= 0.))
    assert_true(np.all(D <= 2.))
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.])

    XB = np.vstack([x, -x])
    D2 = cosine_distances(XB)
    # check that all elements are in [0, 2]
    assert_true(np.all(D2 >= 0.))
    assert_true(np.all(D2 <= 2.))
    # check that diagonal elements are equal to 0 and non diagonal to 2
    assert_array_almost_equal(D2, [[0., 2.], [2., 0.]])

    # check large random matrix
    X = np.abs(rng.rand(1000, 5000))
    D = cosine_distances(X)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0])
    assert_true(np.all(D >= 0.))
    assert_true(np.all(D <= 2.))
Example #6
Source File: ABuStatsUtil.py From abu with GNU General Public License v3.0 | 5 votes |
def cosine_distances_xy(x, y, to_similar=False):
    """
    Compute the cosine distance between two sequences. Note that you need to
    understand the purpose of measuring distances on your data to decide
    whether scale_start is needed; the results with and without scale_start
    will be completely different, so choose whether to apply scale_start
    based on the functional requirements and your understanding of the data.
    :param x: an iterable sequence
    :param y: an iterable sequence
    :param to_similar: whether to convert the output to a similarity value afterwards
    :return: float value
    """
    distance = _distance_xy(cosine_distances, x, y)
    if to_similar:
        # To convert cosine distance to cosine similarity, just subtract from 1
        distance = 1.0 - distance
    return distance
Example #7
Source File: baseline_tfidf.py From tg2019task with MIT License | 5 votes |
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nearest', type=int, default=10)
    parser.add_argument('tables')
    parser.add_argument('questions', type=argparse.FileType('r', encoding='UTF-8'))
    args = parser.parse_args()

    explanations = []
    for path, _, files in os.walk(args.tables):
        for file in files:
            explanations += read_explanations(os.path.join(path, file))

    if not explanations:
        warnings.warn('Empty explanations')

    df_q = pd.read_csv(args.questions, sep='\t', dtype=str)
    df_e = pd.DataFrame(explanations, columns=('uid', 'text'))

    # note: the second fit() replaces the first, so the vocabulary is
    # effectively learned from df_e['text'] only
    vectorizer = TfidfVectorizer().fit(df_q['Question']).fit(df_e['text'])
    X_q = vectorizer.transform(df_q['Question'])
    X_e = vectorizer.transform(df_e['text'])
    X_dist = cosine_distances(X_q, X_e)

    for i_question, distances in enumerate(X_dist):
        for i_explanation in np.argsort(distances)[:args.nearest]:
            print('{}\t{}'.format(df_q.loc[i_question]['questionID'],
                                  df_e.loc[i_explanation]['uid']))
Example #8
Source File: utils.py From GDAN with MIT License | 5 votes |
def kNN_classify(*, x, y):
    """
    return the index of y that is closest to each x
    :param x: n*d matrix
    :param y: m*d matrix
    :return: n-dim vector
    """
    ds = cosine_distances(x, y)
    # note: np.argmin yields the index of the closest row of y for each row
    # of x, but indexing y with it returns the closest rows themselves rather
    # than the indices the docstring describes
    idx = y[np.argmin(ds, axis=1)]
    return idx
Example #9
Source File: face_recognizer.py From celeb-detection-oss with Mozilla Public License 2.0 | 5 votes |
def _distance(x1, x2):
    return cosine_distances(x1, x2)
Example #10
Source File: test_cosine_distances.py From mars with Apache License 2.0 | 5 votes |
def testCosineDistancesExecution(self):
    raw_dense_x = np.random.rand(25, 10)
    raw_dense_y = np.random.rand(17, 10)

    raw_sparse_x = sps.random(25, 10, density=0.5, format='csr', random_state=0)
    raw_sparse_y = sps.random(17, 10, density=0.4, format='csr', random_state=1)

    for raw_x, raw_y in [
            (raw_dense_x, raw_dense_y),
            (raw_sparse_x, raw_sparse_y)]:
        for chunk_size in (25, 6):
            x = mt.tensor(raw_x, chunk_size=chunk_size)
            y = mt.tensor(raw_y, chunk_size=chunk_size)

            d = cosine_distances(x, y)
            result = self.executor.execute_tensor(d, concat=True)[0]
            expected = sk_cosine_distances(raw_x, raw_y)
            np.testing.assert_almost_equal(np.asarray(result), expected)

            d = cosine_distances(x)
            result = self.executor.execute_tensor(d, concat=True)[0]
            expected = sk_cosine_distances(raw_x)
            np.testing.assert_almost_equal(np.asarray(result), expected)
Example #11
Source File: metrics.py From chameleon_recsys with MIT License | 5 votes |
def cosine_distance(v1, v2):
    # As the cosine similarity interval is [-1.0, 1.0], the cosine distance
    # interval is [0.0, 2.0].
    # This normalizes the cosine distance to the interval [0.0, 1.0]
    return pairwise.cosine_distances(v1, v2) / 2.0
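A quick check of the normalization above (a minimal sketch; the opposite vectors are illustrative): two exactly opposite vectors have cosine similarity -1.0 and hence cosine distance 2.0, which the division by 2.0 maps to 1.0.

import numpy as np
from sklearn.metrics import pairwise

v1 = np.array([[1.0, 0.0]])
v2 = np.array([[-1.0, 0.0]])  # exactly opposite direction

print(pairwise.cosine_distances(v1, v2))        # [[2.]] -> the maximum cosine distance
print(pairwise.cosine_distances(v1, v2) / 2.0)  # [[1.]] -> normalized to [0.0, 1.0]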
Example #12
Source File: firmware_clustering.py From Firmware_Slap with GNU General Public License v3.0 | 5 votes |
def get_cosine_dist(all_functions):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)
    func_sparse = transformer.transform(func_sparse)

    return cosine_distances(func_sparse, func_sparse)
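One property worth noting about this example (a minimal sketch, assuming Normalizer is sklearn's L2 row normalizer, as the sklearn-style usage suggests): cosine distance depends only on the direction of each row, so L2-normalizing the rows beforehand does not change the resulting distance matrix.

import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import cosine_distances

rng = np.random.RandomState(0)
X = rng.rand(5, 8)

# cosine distance is invariant to per-row scaling, so normalizing first
# yields the same distance matrix (up to floating-point error)
np.testing.assert_allclose(cosine_distances(X),
                           cosine_distances(Normalizer().transform(X)))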
Example #13
Source File: test_pairwise.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_cosine_distances():
    # Check the pairwise Cosine distances computation
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))
    XA = np.vstack([x, x])
    D = cosine_distances(XA)
    assert_array_almost_equal(D, [[0., 0.], [0., 0.]])
    # check that all elements are in [0, 2]
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.])

    XB = np.vstack([x, -x])
    D2 = cosine_distances(XB)
    # check that all elements are in [0, 2]
    assert np.all(D2 >= 0.)
    assert np.all(D2 <= 2.)
    # check that diagonal elements are equal to 0 and non diagonal to 2
    assert_array_almost_equal(D2, [[0., 2.], [2., 0.]])

    # check large random matrix
    X = np.abs(rng.rand(1000, 5000))
    D = cosine_distances(X)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0])
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
Example #14
Source File: vector_opr.py From tokenquery with GNU General Public License v3.0 | 5 votes |
def vec_cos_dist(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print('len of vectors does not match')
                return False
            if operation_string == "=" or operation_string == "==":
                return cosine_distances(token_vector, ref_vector) == cond_value
            elif operation_string == "<":
                return cosine_distances(token_vector, ref_vector) < cond_value
            elif operation_string == ">":
                return cosine_distances(token_vector, ref_vector) > cond_value
            elif operation_string == ">=":
                return cosine_distances(token_vector, ref_vector) >= cond_value
            elif operation_string == "<=":
                return cosine_distances(token_vector, ref_vector) <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return cosine_distances(token_vector, ref_vector) != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False
    else:
        # TODO raise tokenregex error
        print('Problem with the operation input')
Example #15
Source File: feature_engineering.py From coling2018_fake-news-challenge with Apache License 2.0 | 4 votes |
def latent_dirichlet_allocation(headlines, bodies):
    # https://pypi.python.org/pypi/lda on bottom see suggestions like MALLET, hca
    # https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    # https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    def print_top_words(model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

    def combine_head_and_body(headlines, bodies):
        head_and_body = [headline + " " + body
                         for i, (headline, body) in enumerate(zip(headlines, bodies))]
        return head_and_body

    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # calculates n most important topics of the bodies. Each topic contains
        # all words but ordered by importance. The more important topic words a
        # body contains of a certain topic, the higher its value for this topic
        lda_body = LatentDirichletAllocation(n_topics=25, learning_method='online',
                                             random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation: transform head")
        # use the lda trained for body topics on the headlines => if the
        # headlines and bodies share topics, their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        # print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X

    vocab = create_word_ngram_vocabulary(ngram_range=(1, 1), max_features=5000,
                                         lemmatize=False, term_freq=True, norm='l2')
    X = get_features(vocab)
    return X
Example #16
Source File: ABuStatsUtil.py From abu with GNU General Public License v3.0 | 4 votes |
def cosine_distance_matrix(df, scale_end=True, to_similar=False):
    """
    Cosine distance: the main difference from cosine_distances_xy is that this
    is not a pairwise x-vs-y distance computation; it takes a single matrix as
    input, which must be a pd.DataFrame, an np.array, or a nested iterable
    sequence [[], []]. Note that you need to understand the purpose of
    measuring distances on your data to decide whether scale_start is needed;
    the results with and without scale_start will be completely different, so
    choose whether to apply scale_start based on the functional requirements
    and your understanding of the data.

    eg: input:

                  tsla    bidu   noah    sfun    goog    vips   aapl
    2014-07-25  223.57  226.50  15.32  12.110  589.02  21.349  97.67
    2014-07-28  224.82  225.80  16.13  12.450  590.60  21.548  99.02
    2014-07-29  225.01  220.00  16.75  12.220  585.61  21.190  98.38
    ...            ...     ...    ...     ...     ...     ...    ...
    2016-07-22  222.27  160.88  25.50   4.850  742.74  13.510  98.66
    2016-07-25  230.01  160.25  25.57   4.790  739.77  13.390  97.34
    2016-07-26  225.93  163.09  24.75   4.945  740.92  13.655  97.76

    ABuStatsUtil.cosine_distance_matrix(cc, scale_start=True)

    output:

            tsla    bidu    noah    sfun    goog    vips    aapl
    tsla  0.0000  0.1743  0.4434  0.2945  0.2394  0.4763  0.1266
    bidu  0.1743  0.0000  0.5808  0.2385  0.3986  0.3034  0.1470
    noah  0.4434  0.5808  0.0000  1.0000  0.3411  0.7626  0.2632
    sfun  0.2945  0.2385  1.0000  0.0000  0.7494  0.4448  0.4590
    goog  0.2394  0.3986  0.3411  0.7494  0.0000  0.9717  0.2806
    vips  0.4763  0.3034  0.7626  0.4448  0.9717  0.0000  0.2669
    aapl  0.1266  0.1470  0.2632  0.4590  0.2806  0.2669  0.0000

    ABuStatsUtil.cosine_distance_matrix(cc, scale_start=False)

    output:

            tsla    bidu    noah    sfun    goog    vips    aapl
    tsla  0.0000  0.1743  0.4434  0.2945  0.2394  0.4763  0.1266
    bidu  0.1743  0.0000  0.5808  0.2385  0.3986  0.3034  0.1470
    noah  0.4434  0.5808  0.0000  1.0000  0.3411  0.7626  0.2632
    sfun  0.2945  0.2385  1.0000  0.0000  0.7494  0.4448  0.4590
    goog  0.2394  0.3986  0.3411  0.7494  0.0000  0.9717  0.2806
    vips  0.4763  0.3034  0.7626  0.4448  0.9717  0.0000  0.2669
    aapl  0.1266  0.1470  0.2632  0.4590  0.2806  0.2669  0.0000

    :param df: pd.DataFrame or np.array or nested iterable sequence [[], []];
               it is named df because it is uniformly converted to a pd.DataFrame internally
    :param scale_end: whether to standardize the resulting matrix
    :param to_similar: whether to convert the output to a similarity value afterwards
    :return: distance_df, a pd.DataFrame object
    """
    return _distance_matrix(cosine_distances, df, scale_end, to_similar)
Example #17
Source File: pairplot.py From scattertext with Apache License 2.0 | 4 votes |
def produce_category_focused_pairplot(corpus,
                                      category,
                                      category_projector=CategoryProjector(projector=TruncatedSVD(20)),
                                      category_projection=None,
                                      **kwargs):
    '''
    Produces a pair-plot which is focused on a single category.

    :param corpus: TermDocMatrix
    :param category: str, name of a category in the corpus
    :param category_projector: CategoryProjector, a factor analysis of the category/feature vector
    :param category_projection: CategoryProjection, None by default. If present, overrides category projector
    :param kwargs: remaining kwargs for produce_pairplot
    :return: str, HTML
    '''
    category_num = corpus.get_categories().index(category)

    uncorrelated_components_projection = (category_projector.project(corpus)
                                          if category_projection is None
                                          else category_projection)

    distances = cosine_distances(uncorrelated_components_projection.get_category_embeddings().T)

    similarity_to_category_scores = -2 * (rankdata(distances[category_num]) - 0.5)

    uncorrelated_components = uncorrelated_components_projection.get_projection()

    least_correlated_dimension = min([(np.abs(pearsonr(similarity_to_category_scores,
                                                       uncorrelated_components.T[i])[0]), i)]
                                     for i in range(uncorrelated_components.shape[1]))[0][1]

    projection_to_plot = np.array([uncorrelated_components.T[least_correlated_dimension],
                                   similarity_to_category_scores]).T

    return produce_pairplot(
        corpus,
        initial_category=category,
        category_projection=uncorrelated_components_projection.use_alternate_projection(projection_to_plot),
        category_focused=True,
        **kwargs)