Python sklearn.metrics.pairwise.cosine_similarity() Examples
The following are 30 code examples of sklearn.metrics.pairwise.cosine_similarity().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
You may also want to check out all available functions/classes of the module sklearn.metrics.pairwise, or try the search function.
Example #1
Source File: test_pairwise.py From twitter-stock-recommendation with MIT License | 6 votes |
def test_cosine_similarity():
    """Check that the cosine kernel matches a linear kernel on normalized data."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    # Dense and sparse inputs, each with and without an explicit Y.
    input_pairs = ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr))
    for left, right in input_pairs:
        cosine_gram = pairwise_kernels(left, Y=right, metric="cosine")

        # After normalizing the rows, a plain linear kernel must give the
        # same values as the cosine kernel on the raw data.
        left_unit = normalize(left)
        right_unit = normalize(right) if right is not None else None
        linear_gram = pairwise_kernels(left_unit, Y=right_unit, metric="linear")

        assert_array_almost_equal(cosine_gram, linear_gram)
Example #2
Source File: utility.py From DeepLearn with MIT License | 6 votes |
def cos_sim(ind1, ind2=1999):
    """Print a MAP-style retrieval score between two saved views.

    Loads ``test_v1.npy`` and ``test_v2.npy``, ranks every ``view2`` item by
    cosine similarity against each ``view1`` item, and scores the 7 best
    matches: a match counts when its index lies in ``range(i, i + 5)`` for
    query index ``i`` and contributes ``1 / rank``.

    :param ind1: number of leading ``view1`` items used as queries.
    :param ind2: number of leading ``view2`` items to rank against.
    """
    queries = np.load("test_v1.npy")[0:ind1]
    candidates = np.load("test_v2.npy")[0:ind2]

    MAP = 0
    for query_idx, query in enumerate(queries):
        sims = [cosine_similarity(query, cand)[0].tolist() for cand in candidates]

        # Pair each similarity with its candidate index and order best-first.
        ranked = [(sim, cand_idx) for cand_idx, sim in enumerate(sims)]
        ranked.sort()
        ranked.reverse()
        top_indices = [pair[1] for pair in ranked[0:7]]

        AP = 0
        relevant = range(query_idx, query_idx + 5)
        for rank, cand_idx in enumerate(top_indices):
            if cand_idx in relevant:
                AP += 1 / (rank + 1)
        print(top_indices)
        print(AP)
        MAP += AP
    print('MAP is : ', MAP / ind1)
Example #3
Source File: entity_discoverer.py From HarvestText with MIT License | 6 votes |
def clustering(self, threshold):
    """Cluster entities separately for each part-of-speech type.

    :param threshold: cosine-similarity cut-off; pairs strictly above it are
        connected in the graph that is partitioned.
    :return: partition: dict {word_id: cluster_id}
    """
    print("Louvain clustering")
    partition = {}
    cluster_id_base = 0
    for entities in self.type_entity_dict.values():
        word_ids = [self.word2id[ent] for ent in entities if ent in self.word2id]
        if not word_ids:
            continue

        embeddings = self.emb_mat[word_ids, :]
        # Subtract the identity so each entity's similarity to itself is
        # lowered by one before thresholding.
        similarity = cosine_similarity(embeddings) - np.eye(len(embeddings))
        adjacency = (similarity > threshold).astype(int)
        graph = nx.from_numpy_array(adjacency)
        local_partition = community.best_partition(graph)

        # Shift the per-type cluster ids so they stay unique across types.
        for local_id, word_id in enumerate(word_ids):
            partition[word_id] = local_partition[local_id] + cluster_id_base
        cluster_id_base += max(local_partition.values()) + 1
    return partition
Example #4
Source File: test_pairwise.py From Mastering-Elasticsearch-7.0 with MIT License | 6 votes |
def test_cosine_similarity():
    """Verify the cosine kernel equals a linear kernel once rows are normalized."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr, Ycsr = csr_matrix(X), csr_matrix(Y)

    def _normalized_or_none(matrix):
        # Y may be omitted (None); only normalize when it is present.
        return None if matrix is None else normalize(matrix)

    for first, second in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        expected = pairwise_kernels(first, Y=second, metric="cosine")
        first_unit = normalize(first)
        second_unit = _normalized_or_none(second)
        actual = pairwise_kernels(first_unit, Y=second_unit, metric="linear")
        assert_array_almost_equal(expected, actual)
Example #5
Source File: text_embedding_similarity_transformers.py From driverlessai-recipes with Apache License 2.0 | 6 votes |
def transform(self, X: dt.Frame):
    """Score the cosine similarity of the two text columns, row by row.

    Missing and infinite values in ``X`` are first replaced by
    ``self._repl_val``. Each pair of texts (column 0 and column 1) is
    lower-cased, embedded with a pooled document embedding, and compared.

    :param X: frame whose first two columns hold the texts to compare.
    :return: 1-D numpy array with one score per row; ``-99`` marks rows
        whose embedding or comparison raised an error.
    """
    X.replace([None, math.inf, -math.inf], self._repl_val)
    from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
    if self.embedding_name in ["glove", "en"]:
        self.embedding = WordEmbeddings(self.embedding_name)
    elif self.embedding_name in ["bert"]:
        self.embedding = BertEmbeddings()
    self.doc_embedding = DocumentPoolEmbeddings([self.embedding])

    output = []
    X = X.to_pandas()
    text1_arr = X.iloc[:, 0].values
    text2_arr = X.iloc[:, 1].values
    for raw1, raw2 in zip(text1_arr, text2_arr):
        try:
            text1 = Sentence(str(raw1).lower())
            self.doc_embedding.embed(text1)
            text2 = Sentence(str(raw2).lower())
            self.doc_embedding.embed(text2)
            score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                      text2.get_embedding().reshape(1, -1))[0, 0]
            output.append(score)
        except Exception:
            # Best-effort scoring: a failing row gets the sentinel value.
            # ``Exception`` (not a bare except) lets KeyboardInterrupt and
            # SystemExit propagate.
            output.append(-99)
    return np.array(output)
Example #6
Source File: faceApi.py From FaceRecognition-RestApi with MIT License | 6 votes |
def compared(request):
    """Compare two uploaded face images and return their cosine similarity.

    Expects a POST request carrying exactly two files, ``face1`` and
    ``face2``. Both are stored under ``RestServer/upload/`` with a random
    name, turned into feature vectors via ``get_feature``, compared, and
    then removed again.

    :param request: the incoming HTTP request.
    :return: ``HttpResponse`` whose body is a JSON-formatted string with
        ``status``, ``data``, ``msg`` and, on success, ``runtime``.
    """
    if request.method != 'POST':
        return HttpResponse('{"status":false,"data":"","msg":"请求不合法"}')
    if len(request.FILES) != 2:
        return HttpResponse('{"status":false,"data":"","msg":"图片参数错误!"}')

    starttime = time.time()
    # Random file names: a random 5-digit number followed by a timestamp.
    name1 = str(random.randint(10000, 99999)) + str(time.time())
    name2 = str(random.randint(10000, 99999)) + str(time.time())
    path1 = root + "RestServer/upload/" + name1
    path2 = root + "RestServer/upload/" + name2
    try:
        handle_uploaded_file(request.FILES['face1'], name1)
        handle_uploaded_file(request.FILES['face2'], name2)
        tz1 = get_feature(path1)
        tz2 = get_feature(path2)
        comparedValue = pw.cosine_similarity(tz1, tz2)[0][0]
    finally:
        # Always clean up the uploads, even when feature extraction or the
        # comparison raises; otherwise the files would pile up on disk.
        for path in (path1, path2):
            if os.path.exists(path):
                os.remove(path)
    endtime = time.time()
    Runtime = endtime - starttime
    return HttpResponse('{"status":true,"data":"' + str(comparedValue) + '","msg":"成功","runtime": ' + str(Runtime) + ' }')
Example #7
Source File: app.py From altair with Apache License 2.0 | 6 votes |
def get_closest_docs(uri):
    """Return the five stored documents most similar to the document at ``uri``.

    The document is fetched, its code part is normalized and turned into a
    vector with ``model.infer_vector``, and that vector is compared by
    cosine similarity against every entry of ``vectors``.

    :param uri: address of the document to fetch.
    :return: list of ``(url, similarity)`` tuples, best match first, with the
        similarity rounded to two decimals.
    :raises ValueError: if the request does not return status code 200.
    """
    response = requests.get(uri)
    if response.status_code != 200:
        print("URL returned status code", response.status_code)
        raise ValueError('URL error')

    user_doc = response.text
    print("URI content length", len(user_doc))
    code, _ = separate_code_and_comments(user_doc, "user doc")
    normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)

    # Fixed seed before inferring the vector.
    model.random.seed(0)
    user_vector = model.infer_vector(normalized_code)
    print("finding similar...")
    sys.stdout.flush()

    stored_urls = list(vectors)
    stored_vectors = [vectors[url] for url in stored_urls]
    sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)[0]

    # Negate so that argsort yields the highest similarities first.
    best = (-sims).argsort()[:5]
    return [(stored_urls[index], round(float(sims[index]), 2)) for index in best]
Example #8
Source File: helpers.py From fnc-1 with Apache License 2.0 | 6 votes |
def cosine_sim(x, y):
    """Return the cosine similarity of ``x`` and ``y`` as a scalar.

    Plain numpy arrays are reshaped to a single row before the comparison.
    If the comparison fails for any reason, both inputs are printed and
    ``0.`` is returned.

    :param x: first vector.
    :param y: second vector.
    :return: the similarity value, or ``0.`` on failure.
    """
    try:
        # Exact-type check kept as in the original so ndarray subclasses
        # are still passed through untouched.
        if type(x) is np.ndarray:
            x = x.reshape(1, -1)  # get rid of the warning
        if type(y) is np.ndarray:
            y = y.reshape(1, -1)
        d = cosine_similarity(x, y)
        d = d[0][0]
    except Exception:
        # print() with one argument behaves the same on Python 2 and 3;
        # the former print statements were a SyntaxError on Python 3.
        print(x)
        print(y)
        d = 0.
    return d


# Copyright 2017 Cisco Systems, Inc.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
Example #9
Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0 | 6 votes |
def _get_similarity_values(self, q1_csc, q2_csc):
    """Compute five row-wise similarity/distance measures for two matrices.

    The rows of ``q1_csc`` and ``q2_csc`` are walked in lockstep and each
    pair is passed to ``cs``, ``md``, ``ed``, ``jsc`` and
    ``minkowski_dis.pairwise``.

    :param q1_csc: first sparse matrix, iterated row by row.
    :param q2_csc: second sparse matrix, iterated row by row.
    :return: five lists (cosine, manhattan, euclidean, jaccard, minkowski),
        each with one value per row pair.
    """
    cosine_sim = []
    manhattan_dis = []
    eucledian_dis = []
    jaccard_dis = []
    minkowsk_dis = []
    for i, j in zip(q1_csc, q2_csc):
        cosine_sim.append(cs(i, j)[0][0])
        manhattan_dis.append(md(i, j)[0][0])
        eucledian_dis.append(ed(i, j)[0][0])

        # The remaining two measures are fed dense arrays.
        i_ = i.toarray()
        j_ = j.toarray()
        try:
            jaccard_dis.append(jsc(i_, j_))
        except Exception:
            # Best-effort: a failing Jaccard computation counts as 0.
            # ``Exception`` keeps KeyboardInterrupt/SystemExit propagating.
            jaccard_dis.append(0)
        minkowsk_dis.append(minkowski_dis.pairwise(i_, j_)[0][0])
    return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis
Example #10
Source File: itemitem.py From Hands-on-Supervised-Machine-Learning-with-Python with MIT License | 6 votes |
def _compute_sim(self, R, k):
    """Return the item-item cosine similarity of ``R`` truncated to the top ``k``.

    :param R: matrix whose columns are the items being compared.
    :param k: number of highest similarities to keep in every row.
    :return: square similarity matrix in which every entry outside a row's
        ``k`` largest values is set to zero.
    """
    # Transpose so that similarity is computed between items (columns of R).
    sim = cosine_similarity(R.T)

    # Sort each row in descending order (hence the negation) and take the
    # column indices that fall outside the first k positions.
    not_top_k = np.argsort(-sim, axis=1)[:, k:]
    n_rows, n_dropped = not_top_k.shape
    if n_dropped:  # nothing to drop when k >= number of items
        # A column vector of row numbers broadcasts against not_top_k, so
        # every (row, dropped column) pair is zeroed.
        row_indices = np.arange(n_rows)[:, np.newaxis]
        sim[row_indices, not_top_k] = 0.
    return sim
Example #11
Source File: saxvsm.py From pyts with BSD 3-Clause "New" or "Revised" License | 6 votes |
def decision_function(self, X):
    """Evaluate the cosine similarity between document-term matrix and X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_timestamps)
        Test samples.

    Returns
    -------
    X : array-like, shape (n_samples, n_classes)
        Cosine similarity between the document-term matrix and X.

    """
    fitted_attributes = ['vocabulary_', 'tfidf_', 'idf_', '_tfidf', 'classes_']
    check_is_fitted(self, fitted_attributes)
    X = check_array(X)

    # Turn the samples into bags of words, then count them against the
    # vocabulary held by the fitted tf-idf vectorizer.
    bags_of_words = self._bow.transform(X)
    counter = CountVectorizer(vocabulary=self._tfidf.vocabulary_)
    term_counts = counter.transform(bags_of_words).toarray()
    return cosine_similarity(term_counts, self.tfidf_)
Example #12
Source File: test_skater.py From region with BSD 3-Clause "New" or "Revised" License | 6 votes |
def test_init():
    """Check the defaults and overrides accepted by ``Spanning_Forest``."""
    # Defaults.
    forest = Spanning_Forest()
    assert forest.metric == skm.manhattan_distances
    assert forest.center == np.mean
    assert forest.reduction == np.sum

    # Explicit overrides are stored as given.
    overrides = dict(dissimilarity=skm.euclidean_distances,
                     center=np.median,
                     reduction=np.max)
    custom = Spanning_Forest(**overrides)
    assert custom.metric == skm.euclidean_distances
    assert custom.center == np.median
    assert custom.reduction == np.max

    # An affinity is wrapped into a lambda that must match -log(affinity).
    from_affinity = Spanning_Forest(affinity=skm.cosine_similarity)
    assert isinstance(from_affinity.metric, types.LambdaType)
    sample = data[:2,]
    expected = -np.log(skm.cosine_similarity(sample))
    observed = from_affinity.metric(data[:2,])
    np.testing.assert_allclose(expected, observed)
Example #13
Source File: save_utils.py From keras-glove with MIT License | 6 votes |
def save_model(model: Model, tokenizer: Tokenizer):
    """
    Saves the important parts of the model
    :param model: Keras model to save
    :param tokenizer: Keras Tokenizer to save
    """
    # Persist the first weight array of every bias / embedding layer.
    for layer in model.layers:
        if '_biases' in layer.name or '_embeddings' in layer.name:
            np.save(file=f'{OUTPUT_FOLDER}{layer.name}', arr=layer.get_weights()[0])

    # save tokenizer (context managers make sure the files are closed)
    with open(f'{OUTPUT_FOLDER}{INDEX2WORD}', 'wb') as index2word_file:
        pickle.dump(obj=tokenizer.index_word, file=index2word_file)
    with open(f'{OUTPUT_FOLDER}{WORD2INDEX}', 'wb') as word2index_file:
        pickle.dump(obj=tokenizer.word_index, file=word2index_file)

    # save combined embeddings & correlation matrix
    agg_embeddings = np.load(f'{OUTPUT_FOLDER}{CENTRAL_EMBEDDINGS}.npy') + \
        np.load(f'{OUTPUT_FOLDER}{CONTEXT_EMBEDDINGS}.npy')
    np.save(file=f'{OUTPUT_FOLDER}{AGGREGATED_EMBEDDINGS}', arr=agg_embeddings)
    # NOTE(review): cosine_similarity is applied twice (similarity of the
    # similarity matrix). Kept as-is because saved artifacts depend on it;
    # confirm whether a single application was intended.
    np.save(file=f'{OUTPUT_FOLDER}{CORRELATION_MATRIX}',
            arr=cosine_similarity(cosine_similarity(agg_embeddings)))
Example #14
Source File: test_pairwise.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_kernel_symmetry():
    """Valid kernels should be symmetric."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    kernels = (linear_kernel, polynomial_kernel, rbf_kernel,
               laplacian_kernel, sigmoid_kernel, cosine_similarity)
    for kernel in kernels:
        gram = kernel(X, X)
        # The Gram matrix must equal its transpose to 15 decimals.
        assert_array_almost_equal(gram, gram.T, 15)
Example #15
Source File: face_recognition.py From FindFaceInVideo with BSD 2-Clause "Simplified" License | 5 votes |
def compare_pic(feature1, feature2):
    """Return the pairwise cosine similarity of two feature arrays."""
    return pw.cosine_similarity(feature1, feature2)
Example #16
Source File: test_bert_sentence_encoding.py From nlp-recipes with MIT License | 5 votes |
def test_sentence_encoding(tmp, data):
    """Check the relative ordering of BERT sentence-encoding similarities."""
    encoder_options = dict(
        language=Language.ENGLISH,
        num_gpus=0,
        to_lower=True,
        max_len=128,
        layer_index=-2,
        pooling_strategy=PoolingStrategy.MEAN,
        cache_dir=tmp,
    )
    se = BERTSentenceEncoder(**encoder_options)

    result = se.encode(data, as_numpy=False)
    vectors = result["values"].values.tolist()
    similarity = cosine_similarity(vectors)

    assert similarity[0, 0] > similarity[1, 0]
    assert similarity[0, 1] > similarity[0, 2]
Example #17
Source File: test_pairwise.py From twitter-stock-recommendation with MIT License | 5 votes |
def test_kernel_sparse():
    """Kernels must give the same result for dense and sparse input."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    X_sparse = csr_matrix(X)
    kernels = (linear_kernel, polynomial_kernel, rbf_kernel,
               laplacian_kernel, sigmoid_kernel, cosine_similarity)
    for kernel in kernels:
        dense_gram = kernel(X, X)
        sparse_gram = kernel(X_sparse, X_sparse)
        assert_array_almost_equal(dense_gram, sparse_gram)
Example #18
Source File: inltk.py From inltk with MIT License | 5 votes |
def get_similar_sentences(sen: str, no_of_variations: int, language_code: str, degree_of_aug: float = 0.1):
    """Generate variations of ``sen`` by swapping tokens for similar ones.

    :param sen: the input sentence.
    :param no_of_variations: maximum number of sentences to return.
    :param language_code: code of the language model to load.
    :param degree_of_aug: fraction of tokens replaced in each variation
        (at least one token is always replaced).
    :return: up to ``no_of_variations`` sentences, ordered by decreasing
        ``get_sentence_similarity`` to ``sen``.
    """
    check_input_language(language_code)

    # Token ids and embedding vectors for the input sentence.
    tok = LanguageTokenizer(language_code)
    token_ids = tok.numericalize(sen)
    embedding_vectors = get_embedding_vectors(sen, language_code)

    # Load the learner on CPU and pull out the encoder's embedding matrix.
    defaults.device = torch.device('cpu')
    model_dir = Path(__file__).parent / 'models' / f'{language_code}'
    learn = load_learner(model_dir)
    encoder = get_model(learn.model)[0]
    encoder.reset()
    embeddings = np.array(encoder.state_dict()['encoder.weight'])

    # For every token keep the closest vocabulary entries by cosine
    # similarity, excluding the token itself.
    scores = cosine_similarity(embedding_vectors, embeddings)
    n_candidates = no_of_variations + 1
    word_ids = []
    for position, score in enumerate(scores):
        nearest = np.argpartition(-np.array(score), n_candidates)[:n_candidates].tolist()
        word_ids.append([wid for wid in nearest if wid != token_ids[position]])

    # generating more variations than required so that we can then filter out the best ones
    buffer_multiplicity = 2
    n_tokens = len(token_ids)
    new_sen_tokens = []
    for i in range(no_of_variations):
        for k in range(buffer_multiplicity):
            # Positions whose token gets replaced in this variation.
            ids = sorted(random.sample(range(n_tokens), max(1, int(degree_of_aug * n_tokens))))
            new_token_ids = [
                int(word_ids[j][(i + k) % len(word_ids[j])]) if j in ids else int(token_ids[j])
                for j in range(n_tokens)
            ]
            new_sen_tokens.append(new_token_ids)

    # Back to text, dropping anything identical to the input sentence.
    candidates = [tok.textify(sen_tokens) for sen_tokens in new_sen_tokens]
    candidates = [candidate for candidate in candidates if candidate != sen]

    scored = [(candidate, get_sentence_similarity(sen, candidate, language_code))
              for candidate in candidates]
    scored.sort(key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _ in scored][:no_of_variations]
Example #19
Source File: bossvs.py From pyts with BSD 3-Clause "New" or "Revised" License | 5 votes |
def decision_function(self, X):
    """Evaluate the cosine similarity between document-term matrix and X.

    Parameters
    ----------
    X : array-like, shape (n_samples, n_timestamps)
        Test samples.

    Returns
    -------
    X : array, shape (n_samples, n_classes)
        Cosine similarity between the document-term matrix and X.

    """
    check_is_fitted(self, ['vocabulary_', 'tfidf_', 'idf_', '_tfidf'])
    X = check_array(X, dtype='float64')
    n_samples, n_timestamps = X.shape

    # Cut every series into windows and stack them as rows.
    windows = _windowed_view(
        X, n_samples, n_timestamps, self._window_size, self._window_step
    ).reshape(-1, self._window_size)

    # One word per window: join the symbols returned by the SFA transform.
    X_sfa = self._sfa.transform(windows)
    words = np.asarray([''.join(symbols) for symbols in X_sfa])
    words = words.reshape(n_samples, self._n_windows)

    if self.numerosity_reduction:
        # Keep a word only when it differs from its successor; the last
        # word of each sample is always kept.
        keep = np.c_[words[:, 1:] != words[:, :-1], np.full(n_samples, True)]
        sentences = [' '.join(row[mask]) for row, mask in zip(words, keep)]
    else:
        sentences = [' '.join(row) for row in words]
    X_bow = np.asarray(sentences)

    X_tf = self._tfidf.transform(X_bow).toarray()
    if self.idf_ is not None:
        X_tf /= self.idf_
    return cosine_similarity(X_tf, self.tfidf_)
Example #20
Source File: qmath.py From RecQ with GNU General Public License v3.0 | 5 votes |
def cosine(x1, x2):
    """Compute the cosine similarity between two vectors.

    :param x1: first vector; must provide ``dot``.
    :param x2: second vector; must provide ``dot``.
    :return: the similarity as a float, or ``0`` when either vector has
        zero norm.
    """
    dot_product = x1.dot(x2)
    denom = sqrt(x1.dot(x1) * x2.dot(x2))
    # Explicit zero check: relying on ZeroDivisionError only works when
    # ``denom`` is a plain Python float, not a NumPy scalar.
    if denom == 0:
        return 0
    return float(dot_product) / denom
Example #21
Source File: fever_features.py From fever-naacl-2018 with Apache License 2.0 | 5 votes |
def process(self, data):
    """Build the feature matrix for claims and their body texts.

    :param data: input passed to ``self.claims`` and ``self.texts``.
    :return: horizontally stacked matrix of body term frequencies, claim
        term frequencies and the per-pair tf-idf cosine similarity.
    """
    # Claim side.
    claim_bow = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    # Body side.
    body_texts = self.texts(data)
    body_bow = self.bow_vectorizer.transform(body_texts)
    body_tfs = self.tfreq_vectorizer.transform(body_bow)
    body_tfidf = self.tfidf_vectorizer.transform(body_texts)

    # One cosine value row per (claim, body) pair.
    pair_cosines = []
    for claim_row, body_row in zip(claim_tfidf, body_tfidf):
        pair_cosines.append(cosine_similarity(claim_row, body_row)[0])
    cosines = np.array(pair_cosines)

    return hstack([body_tfs, claim_tfs, cosines])
Example #22
Source File: process_tfidf_grid.py From fever-naacl-2018 with Apache License 2.0 | 5 votes |
def process(self, data):
    """Return the tf-idf cosine similarity of each claim/body pair.

    :param data: input passed to ``self.claims`` and ``self.texts``.
    :return: numpy array holding one cosine value row per pair.
    """
    # The bag-of-words and term-frequency transforms are still run, as in
    # the original, although only the tf-idf matrices feed the result.
    claim_bow = self.bow_vectorizer.transform(self.claims(data))
    claim_tfs = self.tfreq_vectorizer.transform(claim_bow)
    claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

    body_texts = self.texts(data)
    body_bow = self.bow_vectorizer.transform(body_texts)
    body_tfs = self.tfreq_vectorizer.transform(body_bow)
    body_tfidf = self.tfidf_vectorizer.transform(body_texts)

    pair_cosines = []
    for claim_row, body_row in zip(claim_tfidf, body_tfidf):
        pair_cosines.append(cosine_similarity(claim_row, body_row)[0])
    return np.array(pair_cosines)
Example #23
Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0 | 5 votes |
def _create_weighted_distance_features(self, df):
    """Add a ``weighted_cosine_sim`` column comparing ``spn_1`` and ``spn_2``.

    Both text columns are transformed with ``self.tfidf_vectorizer`` and
    compared row by row with ``cs``. ``df`` is modified in place.

    :param df: frame with ``spn_1`` and ``spn_2`` columns.
    """
    q1_matrix = self.tfidf_vectorizer.transform(df['spn_1'].values.tolist())
    q2_matrix = self.tfidf_vectorizer.transform(df['spn_2'].values.tolist())

    n_rows = q1_matrix.shape[0]
    row_similarities = [cs(q1_matrix[row], q2_matrix[row]).flatten() for row in range(n_rows)]
    df['weighted_cosine_sim'] = np.concatenate(row_similarities)
    # A squared-Euclidean variant ('weighted_eucledian_dis') was disabled in
    # the original code and is intentionally not computed here.
Example #24
Source File: scorer.py From entity2vec with Apache License 2.0 | 5 votes |
def similarity_function(vec1, vec2, similarity):
    """Compute the chosen similarity between two vectors.

    :param vec1: first vector (any sequence accepted by ``np.array``).
    :param vec2: second vector.
    :param similarity: one of ``'cosine'``, ``'softmax'``,
        ``'linear_kernel'`` or ``'euclidean'``.
    :return: the similarity value, or ``0`` if either vector is empty (the
        module-level ``count`` is incremented in that case).
    :raises NameError: if ``similarity`` is not a supported name.
    """
    global count

    v1 = np.array(vec1)
    v2 = np.array(vec2)

    if len(v1) * len(v2) == 0:  # either of the two vectors is empty
        count += 1
        return 0

    # The pairwise helpers return a nested [[value]] result. Every branch
    # wraps the vectors in a list so each is treated as a single sample,
    # matching what the cosine branch already did.
    if similarity == 'cosine':
        return cosine_similarity([v1], [v2])[0][0]
    if similarity == 'softmax':
        return np.exp(np.dot(v1, v2))  # normalization is useless for relative comparisons
    if similarity == 'linear_kernel':
        return linear_kernel([v1], [v2])[0][0]
    if similarity == 'euclidean':
        return euclidean_distances([v1], [v2])[0][0]
    raise NameError('Choose a valid similarity function')
Example #25
Source File: spine_sample.py From ikelos with MIT License | 5 votes |
def on_epoch_end(self, epoch, logs=None):
    """Print the most similar spine for ten randomly chosen spines.

    :param epoch: index of the finished epoch (unused).
    :param logs: metrics passed by the training loop (unused). Defaults to
        ``None`` instead of a shared mutable ``{}``.
    """
    # Ten distinct row indices of the spine embedder.
    indices = np.random.choice(len(self.spine_embedder), 10, False)
    comparisons = cosine_similarity(self.spine_embedder[indices], self.spine_embedder)
    results = np.argmax(comparisons, axis=-1)

    spine_vocab = self.igor.vocabs.spines
    comp_spines = [spine_vocab.lookup(x) for x in results]
    in_spines = [spine_vocab.lookup(x) for x in indices]
    for spine_i, spine_j in zip(in_spines, comp_spines):
        print("SPINE: {}".format(self.decode(spine_i)))
        print("\t most similar to {}".format(self.decode(spine_j)))
Example #26
Source File: dist_utils.py From kaggle-HomeDepot with MIT License | 5 votes |
def _cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors, with fallbacks.

    The vectors are first compared after reshaping each to a single row.
    If that fails they are compared as given, and if that fails too,
    ``config.MISSING_VALUE_NUMERIC`` is returned.

    :param vec1: first vector.
    :param vec2: second vector.
    :return: similarity value or the configured missing-value marker.
    """
    # ``Exception`` instead of a bare except keeps the best-effort
    # fallbacks but lets KeyboardInterrupt/SystemExit propagate.
    try:
        s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    except Exception:
        try:
            s = cosine_similarity(vec1, vec2)[0][0]
        except Exception:
            s = config.MISSING_VALUE_NUMERIC
    return s
Example #27
Source File: tcga_benchmark.py From perfect_match with MIT License | 5 votes |
def get_centroid_weights(self, x):
    """Return the cosine similarity of ``x`` to every centroid.

    Each entry of ``self.centroids`` supplies, at position 0, the indices
    used to select from ``x`` and, at position 1, the centroid to compare
    against.

    :param x: array indexed with each centroid's index set.
    :return: ``np.squeeze`` of the per-centroid similarity results.
    """
    # Build a real list: on Python 3 ``map`` is lazy and ``np.squeeze``
    # would receive an iterator object instead of the values.
    similarities = [
        cosine_similarity(x[entry[0]].reshape(1, -1), entry[1].reshape(1, -1))
        for entry in self.centroids
    ]
    return np.squeeze(similarities)
Example #28
Source File: twins_benchmark.py From perfect_match with MIT License | 5 votes |
def get_centroid_weights(self, x):
    """Return the cosine similarity of ``x`` to every centroid.

    The entries of ``x`` from position 7 onward are converted to float32,
    passed through ``self.data_access.standardise_entry`` and compared with
    the first element of each item in ``self.centroids``.

    :param x: sequence whose elements from index 7 on form the entry.
    :return: ``np.squeeze`` of the per-centroid similarity results.
    """
    # Build a real list: on Python 3 ``map`` is lazy and ``np.squeeze``
    # would receive an iterator object instead of the values.
    similarities = [
        cosine_similarity(
            self.data_access.standardise_entry(
                np.array(x[7:], dtype="float32")
            ).reshape((1, -1)),
            entry[0].reshape((1, -1)),
        )
        for entry in self.centroids
    ]
    return np.squeeze(similarities)
Example #29
Source File: feature_axis.py From transparent_latent_gan with MIT License | 5 votes |
def plot_feature_cos_sim(feature_direction, feature_name=None):
    """
    Plot the pairwise cosine similarity of feature vectors as a heat map.

    :param feature_direction: vectors, shape = (num_dimension, num_vector)
    :param feature_name: list of names of features; defaults to the vector
        indices when omitted
    :return: cosine similarity matrix, shape = (num_vector, num_vector)
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics.pairwise import cosine_similarity

    _, num_vector = feature_direction.shape
    if feature_name is None:
        feature_name = range(num_vector)

    # Vectors are stored as columns, so transpose before comparing.
    feature_cos_sim = cosine_similarity(feature_direction.transpose())

    # Symmetric colour limits keep zero in the middle of the colour map.
    c_lim_abs = np.max(np.abs(feature_cos_sim))
    cell_edges = np.arange(num_vector + 1)
    plt.pcolormesh(cell_edges, np.arange(num_vector + 1), feature_cos_sim,
                   vmin=-c_lim_abs, vmax=+c_lim_abs, cmap='coolwarm')
    plt.gca().invert_yaxis()
    plt.colorbar()
    # plt.axis('square')

    # Put each label at the centre of its cell.
    tick_positions = np.arange(num_vector) + 0.5
    plt.xticks(tick_positions, feature_name, fontsize='x-small', rotation='vertical')
    plt.yticks(np.arange(num_vector) + 0.5, feature_name, fontsize='x-small')
    plt.show()
    return feature_cos_sim
Example #30
Source File: test_saxvsm.py From pyts with BSD 3-Clause "New" or "Revised" License | 5 votes |
def test_actual_results_strategy_uniform():
    """Test that the actual results are the expected ones."""
    # Data
    X = [[0, 0, 0, 1, 0, 0, 1, 1, 1],
         [0, 1, 1, 1, 0, 0, 1, 1, 1],
         [0, 0, 0, 1, 0, 0, 0, 1, 0]]
    y = [0, 0, 1]

    clf = SAXVSM(window_size=4, word_size=4, n_bins=2, strategy='uniform',
                 numerosity_reduction=False, sublinear_tf=False)
    decision_function_actual = clf.fit(X, y).decision_function(X)

    # Bags of words expected for the three samples:
    # X_bow = ["aaab aaba abaa baab aabb abbb",
    #          "abbb bbba bbaa baab aabb abbb",
    #          "aaab aaba abaa baaa aaab aaba"]
    expected_vocabulary = {0: 'aaab', 1: 'aaba', 2: 'aabb', 3: 'abaa', 4: 'abbb',
                           5: 'baaa', 6: 'baab', 7: 'bbaa', 8: 'bbba'}
    assert clf.vocabulary_ == expected_vocabulary

    # Per-sample word counts, per-class term frequencies and idf weights.
    freq = np.asarray([[1, 1, 1, 1, 1, 0, 1, 0, 0],
                       [0, 0, 1, 0, 2, 0, 1, 1, 1],
                       [2, 2, 0, 1, 0, 1, 0, 0, 0]])
    tf = np.asarray([[1, 1, 2, 1, 3, 0, 2, 1, 1],
                     [2, 2, 0, 1, 0, 1, 0, 0, 0]])
    idf = np.asarray([1, 1, log(2) + 1, 1, log(2) + 1, log(2) + 1,
                      log(2) + 1, log(2) + 1, log(2) + 1])
    class_tfidf = tf * idf[None, :]

    decision_function_desired = cosine_similarity(freq, class_tfidf)
    np.testing.assert_allclose(decision_function_actual,
                               decision_function_desired, atol=1e-5, rtol=0.)

    pred_actual = clf.predict(X)
    pred_desired = cosine_similarity(freq, class_tfidf).argmax(axis=1)
    np.testing.assert_array_equal(pred_actual, pred_desired)