Python sklearn.metrics.pairwise.cosine_distances() Examples
The following are 17 code examples of sklearn.metrics.pairwise.cosine_distances(), drawn from open source projects. You can go to the original project or source file by following the links above each example. You may also want to check out all the available functions and classes of the module sklearn.metrics.pairwise.
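Before the project examples, here is a minimal sketch of what the function computes (the arrays below are illustrative, not from any of the projects): cosine_distances(X, Y) returns a matrix D with D[i, j] = 1 - cosine_similarity(X[i], Y[j]), so every value lies in [0.0, 2.0].

import numpy as np
from sklearn.metrics.pairwise import cosine_distances, cosine_similarity

X = np.array([[1.0, 0.0], [1.0, 1.0]])
Y = np.array([[0.0, 1.0]])

# D[i, j] = 1 - cosine_similarity(X[i], Y[j]); orthogonal rows give 1.0
D = cosine_distances(X, Y)
print(D)  # [[1.        ]
          #  [0.29289322]]
np.testing.assert_allclose(D, 1.0 - cosine_similarity(X, Y))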
Example #1
Source File: test_t_sne.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_tsne_with_different_distance_metrics():
    """Make sure that TSNE works for different distance metrics"""
    random_state = check_random_state(0)
    n_components_original = 3
    n_components_embedding = 2
    X = random_state.randn(50, n_components_original).astype(np.float32)
    metrics = ['manhattan', 'cosine']
    dist_funcs = [manhattan_distances, cosine_distances]
    for metric, dist_func in zip(metrics, dist_funcs):
        X_transformed_tsne = TSNE(
            metric=metric, n_components=n_components_embedding,
            random_state=0).fit_transform(X)
        X_transformed_tsne_precomputed = TSNE(
            metric='precomputed', n_components=n_components_embedding,
            random_state=0).fit_transform(dist_func(X))
        assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
Example #2
Source File: test_t_sne.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_tsne_with_different_distance_metrics():
    """Make sure that TSNE works for different distance metrics"""
    random_state = check_random_state(0)
    n_components_original = 3
    n_components_embedding = 2
    X = random_state.randn(50, n_components_original).astype(np.float32)
    metrics = ['manhattan', 'cosine']
    dist_funcs = [manhattan_distances, cosine_distances]
    for metric, dist_func in zip(metrics, dist_funcs):
        X_transformed_tsne = TSNE(
            metric=metric, n_components=n_components_embedding,
            random_state=0).fit_transform(X)
        X_transformed_tsne_precomputed = TSNE(
            metric='precomputed', n_components=n_components_embedding,
            random_state=0).fit_transform(dist_func(X))
        assert_array_equal(X_transformed_tsne, X_transformed_tsne_precomputed)
Example #3
Source File: construct_hypergraph.py From DHGNN with MIT License | 6 votes |
def construct_H_with_KNN(X, K_neigs=[10], is_probH=False, m_prob=1):
    """
    init multi-scale hypergraph Vertex-Edge matrix from original node feature matrix
    :param X: N_object x feature_number
    :param K_neigs: the number of neighbor expansion
    :param is_probH: prob Vertex-Edge matrix or binary
    :param m_prob: prob
    :return: N_object x N_hyperedge
    """
    if len(X.shape) != 2:
        X = X.reshape(-1, X.shape[-1])

    if type(K_neigs) == int:
        K_neigs = [K_neigs]

    dis_mat = cos_dis(X)
    H = None
    for k_neig in K_neigs:
        H_tmp = construct_H_with_KNN_from_distance(dis_mat, k_neig, is_probH, m_prob)
        H = hyperedge_concat(H, H_tmp)
    return H
Example #4
Source File: construct_hypergraph.py From DHGNN with MIT License | 5 votes |
def _construct_edge_list_from_distance(X, k_neigh):
    """
    construct edge_list (numpy array) from kNN distance for single modality
    :param X -> numpy array: feature
    :param k_neigh -> int: # of neighbors
    :return: N * k_neigh numpy array
    """
    dis = cos_dis(X)
    dis = torch.Tensor(dis)
    _, k_idx = dis.topk(k_neigh, dim=-1, largest=False)
    return k_idx.numpy()
Example #5
Source File: test_pairwise.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_cosine_distances():
    # Check the pairwise Cosine distances computation
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))
    XA = np.vstack([x, x])
    D = cosine_distances(XA)
    assert_array_almost_equal(D, [[0., 0.], [0., 0.]])
    # check that all elements are in [0, 2]
    assert_true(np.all(D >= 0.))
    assert_true(np.all(D <= 2.))
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.])

    XB = np.vstack([x, -x])
    D2 = cosine_distances(XB)
    # check that all elements are in [0, 2]
    assert_true(np.all(D2 >= 0.))
    assert_true(np.all(D2 <= 2.))
    # check that diagonal elements are equal to 0 and non diagonal to 2
    assert_array_almost_equal(D2, [[0., 2.], [2., 0.]])

    # check large random matrix
    X = np.abs(rng.rand(1000, 5000))
    D = cosine_distances(X)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0])
    assert_true(np.all(D >= 0.))
    assert_true(np.all(D <= 2.))
Example #6
Source File: ABuStatsUtil.py From abu with GNU General Public License v3.0 | 5 votes |
def cosine_distances_xy(x, y, to_similar=False):
    """
    Compute the cosine distance between two sequences. Note that you need to
    understand the purpose of measuring distances on your data to decide
    whether scale_start is needed; the results with and without scale_start
    will be completely different, so choose whether to apply scale_start
    based on the functional requirements and your understanding of the data.
    :param x: an iterable sequence
    :param y: an iterable sequence
    :param to_similar: whether to convert the output to a similarity value afterwards
    :return: float value
    """
    distance = _distance_xy(cosine_distances, x, y)
    if to_similar:
        # To convert cosine distance to cosine similarity, just subtract from 1
        distance = 1.0 - distance
    return distance
Example #7
Source File: baseline_tfidf.py From tg2019task with MIT License | 5 votes |
def main():
    import argparse

    parser = argparse.ArgumentParser()
    parser.add_argument('-n', '--nearest', type=int, default=10)
    parser.add_argument('tables')
    parser.add_argument('questions', type=argparse.FileType('r', encoding='UTF-8'))
    args = parser.parse_args()

    explanations = []
    for path, _, files in os.walk(args.tables):
        for file in files:
            explanations += read_explanations(os.path.join(path, file))

    if not explanations:
        warnings.warn('Empty explanations')

    df_q = pd.read_csv(args.questions, sep='\t', dtype=str)
    df_e = pd.DataFrame(explanations, columns=('uid', 'text'))

    # note: the second fit() replaces the first, so the vocabulary is
    # effectively learned from df_e['text'] only
    vectorizer = TfidfVectorizer().fit(df_q['Question']).fit(df_e['text'])
    X_q = vectorizer.transform(df_q['Question'])
    X_e = vectorizer.transform(df_e['text'])
    X_dist = cosine_distances(X_q, X_e)

    for i_question, distances in enumerate(X_dist):
        for i_explanation in np.argsort(distances)[:args.nearest]:
            print('{}\t{}'.format(df_q.loc[i_question]['questionID'],
                                  df_e.loc[i_explanation]['uid']))
Example #8
Source File: utils.py From GDAN with MIT License | 5 votes |
def kNN_classify(*, x, y):
    """
    return the index of y that is closest to each x
    :param x: n*d matrix
    :param y: m*d matrix
    :return: n-dim vector
    """
    ds = cosine_distances(x, y)
    # note: np.argmin yields the index of the closest row of y for each row
    # of x, but indexing y with it returns the closest rows themselves rather
    # than the indices the docstring describes
    idx = y[np.argmin(ds, axis=1)]
    return idx
Example #9
Source File: face_recognizer.py From celeb-detection-oss with Mozilla Public License 2.0 | 5 votes |
def _distance(x1, x2):
    return cosine_distances(x1, x2)
Example #10
Source File: test_cosine_distances.py From mars with Apache License 2.0 | 5 votes |
def testCosineDistancesExecution(self):
    raw_dense_x = np.random.rand(25, 10)
    raw_dense_y = np.random.rand(17, 10)

    raw_sparse_x = sps.random(25, 10, density=0.5, format='csr', random_state=0)
    raw_sparse_y = sps.random(17, 10, density=0.4, format='csr', random_state=1)

    for raw_x, raw_y in [
            (raw_dense_x, raw_dense_y),
            (raw_sparse_x, raw_sparse_y)]:
        for chunk_size in (25, 6):
            x = mt.tensor(raw_x, chunk_size=chunk_size)
            y = mt.tensor(raw_y, chunk_size=chunk_size)

            d = cosine_distances(x, y)
            result = self.executor.execute_tensor(d, concat=True)[0]
            expected = sk_cosine_distances(raw_x, raw_y)
            np.testing.assert_almost_equal(np.asarray(result), expected)

            d = cosine_distances(x)
            result = self.executor.execute_tensor(d, concat=True)[0]
            expected = sk_cosine_distances(raw_x)
            np.testing.assert_almost_equal(np.asarray(result), expected)
Example #11
Source File: metrics.py From chameleon_recsys with MIT License | 5 votes |
def cosine_distance(v1, v2):
    # As the cosine similarity interval is [-1.0, 1.0], the cosine distance
    # interval is [0.0, 2.0].
    # This normalizes the cosine distance to the interval [0.0, 1.0]
    return pairwise.cosine_distances(v1, v2) / 2.0
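A quick check of the normalization above (a minimal sketch; the opposite vectors are illustrative): two exactly opposite vectors have cosine similarity -1.0 and hence cosine distance 2.0, which the division by 2.0 maps to 1.0.

import numpy as np
from sklearn.metrics import pairwise

v1 = np.array([[1.0, 0.0]])
v2 = np.array([[-1.0, 0.0]])  # exactly opposite direction

print(pairwise.cosine_distances(v1, v2))        # [[2.]] -> the maximum cosine distance
print(pairwise.cosine_distances(v1, v2) / 2.0)  # [[1.]] -> normalized to [0.0, 1.0]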
Example #12
Source File: firmware_clustering.py From Firmware_Slap with GNU General Public License v3.0 | 5 votes |
def get_cosine_dist(all_functions):
    return_dict = {}
    vect, func_sparse = funcs_to_sparse(all_functions)

    transformer = Normalizer().fit(func_sparse)
    func_sparse = transformer.transform(func_sparse)

    return cosine_distances(func_sparse, func_sparse)
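One property worth noting about this example (a minimal sketch, assuming Normalizer is sklearn's L2 row normalizer, as the sklearn-style usage suggests): cosine distance depends only on the direction of each row, so L2-normalizing the rows beforehand does not change the resulting distance matrix.

import numpy as np
from sklearn.preprocessing import Normalizer
from sklearn.metrics.pairwise import cosine_distances

rng = np.random.RandomState(0)
X = rng.rand(5, 8)

# cosine distance is invariant to per-row scaling, so normalizing first
# yields the same distance matrix (up to floating-point error)
np.testing.assert_allclose(cosine_distances(X),
                           cosine_distances(Normalizer().transform(X)))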
Example #13
Source File: test_pairwise.py From Mastering-Elasticsearch-7.0 with MIT License | 5 votes |
def test_cosine_distances():
    # Check the pairwise Cosine distances computation
    rng = np.random.RandomState(1337)
    x = np.abs(rng.rand(910))
    XA = np.vstack([x, x])
    D = cosine_distances(XA)
    assert_array_almost_equal(D, [[0., 0.], [0., 0.]])
    # check that all elements are in [0, 2]
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0., 0.])

    XB = np.vstack([x, -x])
    D2 = cosine_distances(XB)
    # check that all elements are in [0, 2]
    assert np.all(D2 >= 0.)
    assert np.all(D2 <= 2.)
    # check that diagonal elements are equal to 0 and non diagonal to 2
    assert_array_almost_equal(D2, [[0., 2.], [2., 0.]])

    # check large random matrix
    X = np.abs(rng.rand(1000, 5000))
    D = cosine_distances(X)
    # check that diagonal elements are equal to 0
    assert_array_almost_equal(D[np.diag_indices_from(D)], [0.] * D.shape[0])
    assert np.all(D >= 0.)
    assert np.all(D <= 2.)
Example #14
Source File: vector_opr.py From tokenquery with GNU General Public License v3.0 | 5 votes |
def vec_cos_dist(token_input, operation_input):
    operation_string = None
    ref_vector_string = None
    cond_value_string = None
    for opr_sign in ['==', '>=', '<=', '!=', '<>', '<', '>', '=']:
        if opr_sign in operation_input:
            ref_vector_string = operation_input.split(opr_sign)[0]
            operation_string = opr_sign
            cond_value_string = operation_input.split(opr_sign)[1]
            break

    if ref_vector_string and cond_value_string and operation_string:
        try:
            cond_value = float(cond_value_string)
            ref_vector = change_string_to_vector(ref_vector_string)
            token_vector = change_string_to_vector(token_input)
            if len(ref_vector) != len(token_vector):
                print('len of vectors does not match')
                return False
            if operation_string == "=" or operation_string == "==":
                return cosine_distances(token_vector, ref_vector) == cond_value
            elif operation_string == "<":
                return cosine_distances(token_vector, ref_vector) < cond_value
            elif operation_string == ">":
                return cosine_distances(token_vector, ref_vector) > cond_value
            elif operation_string == ">=":
                return cosine_distances(token_vector, ref_vector) >= cond_value
            elif operation_string == "<=":
                return cosine_distances(token_vector, ref_vector) <= cond_value
            elif operation_string == "!=" or operation_string == "<>":
                return cosine_distances(token_vector, ref_vector) != cond_value
            else:
                return False
        except ValueError:
            # TODO raise tokenregex error
            return False
    else:
        # TODO raise tokenregex error
        print('Problem with the operation input')
Example #15
Source File: feature_engineering.py From coling2018_fake-news-challenge with Apache License 2.0 | 4 votes |
def latent_dirichlet_allocation(headlines, bodies):
    # https://pypi.python.org/pypi/lda on bottom see suggestions like MALLET, hca
    # https://medium.com/@aneesha/topic-modeling-with-scikit-learn-e80d33668730
    # https://www.quora.com/What-are-the-best-features-to-put-into-Latent-Dirichlet-Allocation-LDA-for-topic-modeling-of-short-text

    def print_top_words(model, feature_names, n_top_words):
        for topic_idx, topic in enumerate(model.components_):
            print("Topic #%d:" % topic_idx)
            print(", ".join([feature_names[i]
                             for i in topic.argsort()[:-n_top_words - 1:-1]]))
        print()

    def combine_head_and_body(headlines, bodies):
        head_and_body = [headline + " " + body
                         for i, (headline, body) in enumerate(zip(headlines, bodies))]
        return head_and_body

    def get_features(vocab):
        vectorizer_head = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_head = vectorizer_head.fit_transform(headlines)

        vectorizer_body = TfidfVectorizer(vocabulary=vocab, use_idf=False, norm='l2')
        X_train_body = vectorizer_body.fit_transform(bodies)

        # calculates n most important topics of the bodies. Each topic contains
        # all words but ordered by importance. The more important topic words a
        # body contains of a certain topic, the higher its value for this topic
        lda_body = LatentDirichletAllocation(n_topics=25, learning_method='online',
                                             random_state=0, n_jobs=3)

        print("latent_dirichlet_allocation: fit and transform body")
        t0 = time()
        lda_body_matrix = lda_body.fit_transform(X_train_body)
        print("done in %0.3fs." % (time() - t0))

        print("latent_dirichlet_allocation: transform head")
        # use the lda trained for body topics on the headlines => if the
        # headlines and bodies share topics, their vectors should be similar
        lda_head_matrix = lda_body.transform(X_train_head)

        # print_top_words(lda_body, vectorizer_body.get_feature_names(), 100)

        print('latent_dirichlet_allocation: calculating cosine distance between head and body')
        # calculate cosine distance between the body and head
        X = []
        for i in range(len(lda_head_matrix)):
            X_head_vector = np.array(lda_head_matrix[i]).reshape((1, -1))  # 1d array is deprecated
            X_body_vector = np.array(lda_body_matrix[i]).reshape((1, -1))
            cos_dist = cosine_distances(X_head_vector, X_body_vector).flatten()
            X.append(cos_dist.tolist())
        return X

    vocab = create_word_ngram_vocabulary(ngram_range=(1, 1), max_features=5000,
                                         lemmatize=False, term_freq=True, norm='l2')
    X = get_features(vocab)
    return X
Example #16
Source File: ABuStatsUtil.py From abu with GNU General Public License v3.0 | 4 votes |
def cosine_distance_matrix(df, scale_end=True, to_similar=False):
    """
    Cosine distance: the main difference from cosine_distances_xy is that this
    is not a pairwise x-vs-y distance computation; it takes a single matrix as
    input, which must be a pd.DataFrame, an np.array, or a nested iterable
    sequence [[], []]. Note that you need to understand the purpose of
    measuring distances on your data to decide whether scale_start is needed;
    the results with and without scale_start will be completely different, so
    choose whether to apply scale_start based on the functional requirements
    and your understanding of the data.

    eg: input:

                  tsla    bidu   noah    sfun    goog    vips   aapl
    2014-07-25  223.57  226.50  15.32  12.110  589.02  21.349  97.67
    2014-07-28  224.82  225.80  16.13  12.450  590.60  21.548  99.02
    2014-07-29  225.01  220.00  16.75  12.220  585.61  21.190  98.38
    ...            ...     ...    ...     ...     ...     ...    ...
    2016-07-22  222.27  160.88  25.50   4.850  742.74  13.510  98.66
    2016-07-25  230.01  160.25  25.57   4.790  739.77  13.390  97.34
    2016-07-26  225.93  163.09  24.75   4.945  740.92  13.655  97.76

    ABuStatsUtil.cosine_distance_matrix(cc, scale_start=True)

    output:

            tsla    bidu    noah    sfun    goog    vips    aapl
    tsla  0.0000  0.1743  0.4434  0.2945  0.2394  0.4763  0.1266
    bidu  0.1743  0.0000  0.5808  0.2385  0.3986  0.3034  0.1470
    noah  0.4434  0.5808  0.0000  1.0000  0.3411  0.7626  0.2632
    sfun  0.2945  0.2385  1.0000  0.0000  0.7494  0.4448  0.4590
    goog  0.2394  0.3986  0.3411  0.7494  0.0000  0.9717  0.2806
    vips  0.4763  0.3034  0.7626  0.4448  0.9717  0.0000  0.2669
    aapl  0.1266  0.1470  0.2632  0.4590  0.2806  0.2669  0.0000

    ABuStatsUtil.cosine_distance_matrix(cc, scale_start=False)

    output:

            tsla    bidu    noah    sfun    goog    vips    aapl
    tsla  0.0000  0.1743  0.4434  0.2945  0.2394  0.4763  0.1266
    bidu  0.1743  0.0000  0.5808  0.2385  0.3986  0.3034  0.1470
    noah  0.4434  0.5808  0.0000  1.0000  0.3411  0.7626  0.2632
    sfun  0.2945  0.2385  1.0000  0.0000  0.7494  0.4448  0.4590
    goog  0.2394  0.3986  0.3411  0.7494  0.0000  0.9717  0.2806
    vips  0.4763  0.3034  0.7626  0.4448  0.9717  0.0000  0.2669
    aapl  0.1266  0.1470  0.2632  0.4590  0.2806  0.2669  0.0000

    :param df: pd.DataFrame or np.array or nested iterable sequence [[], []];
               it is named df because it is uniformly converted to a pd.DataFrame internally
    :param scale_end: whether to standardize the resulting matrix
    :param to_similar: whether to convert the output to a similarity value afterwards
    :return: distance_df, a pd.DataFrame object
    """
    return _distance_matrix(cosine_distances, df, scale_end, to_similar)
Example #17
Source File: pairplot.py From scattertext with Apache License 2.0 | 4 votes |
def produce_category_focused_pairplot(corpus,
                                      category,
                                      category_projector=CategoryProjector(projector=TruncatedSVD(20)),
                                      category_projection=None,
                                      **kwargs):
    '''
    Produces a pair-plot which is focused on a single category.

    :param corpus: TermDocMatrix
    :param category: str, name of a category in the corpus
    :param category_projector: CategoryProjector, a factor analysis of the category/feature vector
    :param category_projection: CategoryProjection, None by default. If present, overrides category projector
    :param kwargs: remaining kwargs for produce_pairplot
    :return: str, HTML
    '''
    category_num = corpus.get_categories().index(category)

    uncorrelated_components_projection = (category_projector.project(corpus)
                                          if category_projection is None
                                          else category_projection)

    distances = cosine_distances(uncorrelated_components_projection.get_category_embeddings().T)

    similarity_to_category_scores = -2 * (rankdata(distances[category_num]) - 0.5)

    uncorrelated_components = uncorrelated_components_projection.get_projection()

    least_correlated_dimension = min([(np.abs(pearsonr(similarity_to_category_scores,
                                                       uncorrelated_components.T[i])[0]), i)]
                                     for i in range(uncorrelated_components.shape[1]))[0][1]

    projection_to_plot = np.array([uncorrelated_components.T[least_correlated_dimension],
                                   similarity_to_category_scores]).T

    return produce_pairplot(
        corpus,
        initial_category=category,
        category_projection=uncorrelated_components_projection.use_alternate_projection(projection_to_plot),
        category_focused=True,
        **kwargs)