Python Examples of sklearn.metrics.pairwise.cosine_similarity

Source File: test_pairwise.py From twitter-stock-recommendation with MIT License

6 votes

def test_cosine_similarity():
    """Check the cosine kernel against a linear kernel on L2-normalized data."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    # Dense and sparse inputs, each with and without an explicit Y.
    cases = [(X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)]
    for left, right in cases:
        # The cosine kernel must equal a linear kernel once the data has
        # been normalized by its L2 norm.
        cosine_kernel = pairwise_kernels(left, Y=right, metric="cosine")
        left_unit = normalize(left)
        right_unit = None if right is None else normalize(right)
        linear_kernel_on_unit = pairwise_kernels(left_unit, Y=right_unit,
                                                 metric="linear")
        assert_array_almost_equal(cosine_kernel, linear_kernel_on_unit)

Source File: utility.py From DeepLearn with MIT License

6 votes

def cos_sim(ind1,ind2=1999):
    """Print a retrieval score for the first ``ind1`` rows of view 1.

    Loads "test_v1.npy" and "test_v2.npy", keeps the first ``ind1`` and
    ``ind2`` rows respectively, and for every view-1 row ranks all kept
    view-2 rows by cosine similarity. The 7 best-ranked view-2 indices are
    printed, followed by a score that adds ``1/(rank+1)`` for every one of
    them lying in ``[i, i+5)`` (``i`` being the view-1 row index). Finally
    the sum of those scores divided by ``ind1`` is printed.

    NOTE(review): ``1/(rank+1)`` is integer division under Python 2 unless
    true division is enabled elsewhere in the file - confirm the intended
    interpreter.
    """
    queries = np.load("test_v1.npy")[0:ind1]
    candidates = np.load("test_v2.npy")[0:ind2]
    MAP = 0
    for query_idx, query in enumerate(queries):
        # (score, candidate index) pairs so that sorting orders by score.
        scored = [
            (cosine_similarity(query, candidate)[0].tolist(), candidate_idx)
            for candidate_idx, candidate in enumerate(candidates)
        ]
        scored.sort()
        scored.reverse()
        t = [candidate_idx for _, candidate_idx in scored[0:7]]
        relevant = range(query_idx, query_idx + 5)
        AP = 0
        for rank, candidate_idx in enumerate(t):
            if candidate_idx in relevant:
                AP += 1/(rank+1)
        print(t)
        print(AP)
        MAP += AP
    print('MAP is : ',MAP/ind1)

Source File: entity_discoverer.py From HarvestText with MIT License

6 votes

def clustering(self, threshold):
        """Cluster the entities of each entity type separately.

        For every entity type, a graph is built whose edges connect words
        with cosine similarity strictly above ``threshold``, and
        ``community.best_partition`` is run on it. Cluster ids are offset
        so that ids from different entity types never collide.

        :param threshold: cosine-similarity cutoff used to create edges
        :return: partition: dict {word_id: cluster_id}
        """
        print("Louvain clustering")
        partition = {}
        next_cluster_id = 0
        for etype, ners in self.type_entity_dict.items():
            # Only entities that have an id (and so an embedding row) take part.
            word_ids = [self.word2id[name] for name in ners if name in self.word2id]
            if len(word_ids) == 0:
                continue
            vectors = self.emb_mat[word_ids, :]
            similarity = cosine_similarity(vectors)
            # Subtract the identity so the diagonal does not keep its
            # self-similarity value.
            similarity -= np.eye(len(vectors))
            graph = nx.from_numpy_array((similarity > threshold).astype(int))
            local_partition = community.best_partition(graph)
            for local_id, word_id in enumerate(word_ids):
                partition[word_id] = local_partition[local_id] + next_cluster_id
            next_cluster_id += max(local_partition.values()) + 1
        return partition

Source File: test_pairwise.py From Mastering-Elasticsearch-7.0 with MIT License

6 votes

def test_cosine_similarity():
    """Verify that metric="cosine" equals metric="linear" on unit-norm rows."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    Y = rng.random_sample((3, 4))
    Xcsr = csr_matrix(X)
    Ycsr = csr_matrix(Y)

    def _unit_rows(matrix):
        # Y may be omitted, in which case there is nothing to normalize.
        if matrix is None:
            return None
        return normalize(matrix)

    for first, second in ((X, None), (X, Y), (Xcsr, None), (Xcsr, Ycsr)):
        # The cosine kernel is a linear kernel applied to data that has
        # been normalized by its L2 norm beforehand.
        K_cosine = pairwise_kernels(first, Y=second, metric="cosine")
        first_unit = _unit_rows(first)
        second_unit = _unit_rows(second)
        K_linear = pairwise_kernels(first_unit, Y=second_unit, metric="linear")
        assert_array_almost_equal(K_cosine, K_linear)

Source File: text_embedding_similarity_transformers.py From driverlessai-recipes with Apache License 2.0

6 votes

def transform(self, X: dt.Frame):
        """Return the cosine similarity between the texts in the first two columns.

        Each row's two texts are lower-cased, embedded with a pooled flair
        document embedding, and compared with ``cosine_similarity``. Rows
        for which anything in that pipeline raises get the placeholder
        score ``-99``.

        :param X: frame whose first two columns hold the texts to compare
        :return: 1-D numpy array with one score per row
        """
        X.replace([None, math.inf, -math.inf], self._repl_val)
        # Imported lazily so flair is only required when the transformer runs.
        from flair.embeddings import WordEmbeddings, BertEmbeddings, DocumentPoolEmbeddings, Sentence
        if self.embedding_name in ["glove", "en"]:
            self.embedding = WordEmbeddings(self.embedding_name)
        elif self.embedding_name in ["bert"]:
            self.embedding = BertEmbeddings()
        # NOTE(review): any other embedding_name leaves self.embedding unset
        # here unless it was assigned elsewhere - confirm against the class.
        self.doc_embedding = DocumentPoolEmbeddings([self.embedding])
        output = []
        X = X.to_pandas()
        text1_arr = X.iloc[:, 0].values
        text2_arr = X.iloc[:, 1].values
        for text1, text2 in zip(text1_arr, text2_arr):
            try:
                text1 = Sentence(str(text1).lower())
                self.doc_embedding.embed(text1)
                text2 = Sentence(str(text2).lower())
                self.doc_embedding.embed(text2)
                score = cosine_similarity(text1.get_embedding().reshape(1, -1),
                                          text2.get_embedding().reshape(1, -1))[0, 0]
                output.append(score)
            except Exception:
                # Best effort per row: keep going, but do not swallow
                # KeyboardInterrupt/SystemExit as a bare ``except`` would.
                output.append(-99)
        return np.array(output)

Source File: faceApi.py From FaceRecognition-RestApi with MIT License

6 votes

def compared(request):
    """Compare two uploaded face images and return their cosine similarity.

    Expects a POST request with exactly two uploaded files, read under the
    keys 'face1' and 'face2'. Both files are stored under temporary names,
    features are extracted with ``get_feature``, and the similarity plus
    the elapsed time are returned as a JSON-formatted string.

    The temporary files are removed even when feature extraction or the
    comparison raises, so failed requests no longer leave uploads behind.

    :param request: incoming HTTP request
    :return: HttpResponse whose body is a JSON-formatted status string
    """
    if request.method != 'POST':
        return HttpResponse('{"status":false,"data":"","msg":"请求不合法"}')
    if len(request.FILES) != 2:
        return HttpResponse('{"status":false,"data":"","msg":"图片参数错误！"}')

    starttime = time.time()
    # Random temporary file names (random number + timestamp).
    name1 = str(random.randint(10000, 99999)) + str(time.time())
    name2 = str(random.randint(10000, 99999)) + str(time.time())
    path1 = root + "RestServer/upload/" + name1
    path2 = root + "RestServer/upload/" + name2

    try:
        handle_uploaded_file(request.FILES['face1'], name1)
        handle_uploaded_file(request.FILES['face2'], name2)

        tz1 = get_feature(path1)
        tz2 = get_feature(path2)

        comparedValue = pw.cosine_similarity(tz1, tz2)[0][0]
    finally:
        # Only remove what was actually written, so a failed upload does
        # not mask its own error with a missing-file error.
        for path in (path1, path2):
            if os.path.exists(path):
                os.remove(path)

    endtime = time.time()
    Runtime = endtime - starttime
    return HttpResponse('{"status":true,"data":"' + str(comparedValue) + '","msg":"成功","runtime": ' + str(Runtime) + '  }')

Source File: app.py From altair with Apache License 2.0

6 votes

def get_closest_docs(uri):
    """Return the five stored documents most similar to the document at ``uri``.

    The document is downloaded, its code part is separated from the
    comments and normalized, and a vector is inferred for it with
    ``model``. That vector is compared by cosine similarity against every
    entry of ``vectors``.

    :param uri: address of the document to download
    :return: list of ``(url, similarity rounded to 2 decimals)`` tuples,
        most similar first
    :raises ValueError: if the download does not return status code 200
    """
    response = requests.get(uri)
    if response.status_code != 200:
        print("URL returned status code", response.status_code)
        raise ValueError('URL error')

    user_doc = response.text
    print("URI content length",len(user_doc))
    code, _ = separate_code_and_comments(user_doc,"user doc")
    normalized_code = normalize_text(code, remove_stop_words=False, only_letters=False, return_list=True)
    # Seed before inferring; presumably this makes the inferred vector
    # reproducible - confirm against the model's documentation.
    model.random.seed(0)
    user_vector = model.infer_vector(normalized_code)
    print("finding similar...")
    sys.stdout.flush()

    stored_urls = list(vectors)
    stored_vectors = [vectors[url] for url in stored_urls]
    pair_sims = cosine_similarity(user_vector.reshape(1, -1), stored_vectors)
    similarities = pair_sims[0]
    # Negate so that argsort orders from most to least similar.
    best = (-similarities).argsort()[:5]
    return [(stored_urls[index],round(float(similarities[index]),2)) for index in best]

Source File: helpers.py From fnc-1 with Apache License 2.0

6 votes

def cosine_sim(x, y):
    """Return the cosine similarity of ``x`` and ``y``, or ``0.`` on failure.

    Plain ``numpy.ndarray`` inputs are reshaped to a single row before
    being passed to ``cosine_similarity``. If the computation raises, both
    inputs are printed for debugging and ``0.`` is returned.

    :param x: first vector
    :param y: second vector
    :return: the ``[0][0]`` entry of the similarity result, or ``0.``
    """
    try:
        # Reshape to a single row to get rid of the 1-D input warning.
        if type(x) is np.ndarray:
            x = x.reshape(1, -1)
        if type(y) is np.ndarray:
            y = y.reshape(1, -1)
        d = cosine_similarity(x, y)[0][0]
    except Exception:
        # print() with one argument is valid on both Python 2 and 3; the
        # former ``print x`` statement is a SyntaxError on Python 3.
        print(x)
        print(y)
        d = 0.
    return d

 #   Copyright 2017 Cisco Systems, Inc.
 #  
 #   Licensed under the Apache License, Version 2.0 (the "License");
 #   you may not use this file except in compliance with the License.
 #   You may obtain a copy of the License at
 #  
 #     http://www.apache.org/licenses/LICENSE-2.0
 #  
 #   Unless required by applicable law or agreed to in writing, software
 #   distributed under the License is distributed on an "AS IS" BASIS,
 #   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 #   See the License for the specific language governing permissions and
 #   limitations under the License.

Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0

6 votes

def _get_similarity_values(self, q1_csc, q2_csc):
        """Compute five pairwise measures for each aligned row pair.

        Iterates ``q1_csc`` and ``q2_csc`` in lockstep and, per pair,
        collects the ``[0][0]`` entry returned by ``cs``, ``md``, ``ed``
        and ``minkowski_dis.pairwise``, plus the value returned by ``jsc``
        on the dense rows (``0`` when ``jsc`` raises).

        NOTE(review): ``cs``/``md``/``ed``/``jsc``/``minkowski_dis`` are
        defined outside this block; presumably cosine similarity, Manhattan
        and Euclidean distance, a Jaccard score and a Minkowski distance
        metric - confirm against the file's imports.

        :param q1_csc: iterable of rows exposing ``toarray()``
        :param q2_csc: iterable of rows aligned with ``q1_csc``
        :return: five lists, in the order cosine, manhattan, euclidean,
            jaccard, minkowski
        """
        cosine_sim = []
        manhattan_dis = []
        eucledian_dis = []
        jaccard_dis = []
        minkowsk_dis = []

        for i, j in zip(q1_csc, q2_csc):
            cosine_sim.append(cs(i, j)[0][0])
            manhattan_dis.append(md(i, j)[0][0])
            eucledian_dis.append(ed(i, j)[0][0])

            # jsc and minkowski_dis are fed dense copies of the rows.
            i_ = i.toarray()
            j_ = j.toarray()
            try:
                jaccard_dis.append(jsc(i_, j_))
            except Exception:
                # Best-effort default; unlike a bare ``except`` this lets
                # KeyboardInterrupt/SystemExit propagate.
                jaccard_dis.append(0)

            minkowsk_dis.append(minkowski_dis.pairwise(i_, j_)[0][0])
        return cosine_sim, manhattan_dis, eucledian_dis, jaccard_dis, minkowsk_dis

Source File: itemitem.py From Hands-on-Supervised-Machine-Learning-with-Python with MIT License

6 votes

def _compute_sim(self, R, k):
        """Return the item-item cosine similarity, keeping only the top ``k`` per row.

        :param R: matrix whose columns are the items being compared
        :param k: number of largest similarities kept in every row; all
            other entries of the row are set to zero
        :return: array of shape (n_items, n_items)
        """
        # Transpose so that similarity is computed between items.
        sim = cosine_similarity(R.T)

        # Sort each row in descending order (hence the negation) and take
        # every column index past the first k, i.e. the ones to discard.
        # Shape is (n_items, n_items - k).
        discard_cols = np.argsort(-sim, axis=1)[:, k:]

        n_rows, n_discard = discard_cols.shape
        if n_discard:  # nothing to zero out when k >= n_items
            # Pair each row index with that row's discarded columns.
            row_index = np.arange(n_rows)[:, None]
            sim[row_index, discard_cols] = 0.

        return sim

Source File: saxvsm.py From pyts with BSD 3-Clause "New" or "Revised" License

6 votes

def decision_function(self, X):
        """Evaluate the cosine similarity between document-term matrix and X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        X : array-like, shape (n_samples, n_classes)
            Cosine similarity between the document-term matrix and X.

        """
        check_is_fitted(self, ['vocabulary_', 'tfidf_', 'idf_',
                               '_tfidf', 'classes_'])
        bag_of_words = self._bow.transform(check_array(X))
        # Count terms using the vocabulary of the fitted tf-idf vectorizer
        # so the columns line up with ``self.tfidf_``.
        counter = CountVectorizer(vocabulary=self._tfidf.vocabulary_)
        term_counts = counter.transform(bag_of_words).toarray()
        return cosine_similarity(term_counts, self.tfidf_)

Source File: test_skater.py From region with BSD 3-Clause "New" or "Revised" License

6 votes

def test_init():
    """Check Spanning_Forest defaults, explicit overrides and affinity handling."""
    default = Spanning_Forest()
    default_expectations = (
        ("metric", skm.manhattan_distances),
        ("center", np.mean),
        ("reduction", np.sum),
    )
    for attribute, expected in default_expectations:
        assert getattr(default, attribute) == expected

    change = Spanning_Forest(dissimilarity=skm.euclidean_distances,
                             center=np.median, reduction=np.max)
    override_expectations = (
        ("metric", skm.euclidean_distances),
        ("center", np.median),
        ("reduction", np.max),
    )
    for attribute, expected in override_expectations:
        assert getattr(change, attribute) == expected

    # An affinity is expected to be wrapped in a lambda that yields
    # -log(affinity) as the metric.
    sym = Spanning_Forest(affinity=skm.cosine_similarity)
    assert isinstance(sym.metric, types.LambdaType)
    sample = data[:2,]
    expected_distance = -np.log(skm.cosine_similarity(sample))
    np.testing.assert_allclose(expected_distance, sym.metric(data[:2,]))

Source File: save_utils.py From keras-glove with MIT License

6 votes

def save_model(model: Model, tokenizer: Tokenizer):
    """
    Saves the important parts of the model
    :param model: Keras model to save
    :param tokenizer: Keras Tokenizer to save

    Written under ``OUTPUT_FOLDER``: the first weight array of every layer
    whose name contains '_biases' or '_embeddings', the tokenizer's two
    word/index mappings (pickled), the sum of the central and context
    embeddings, and a matrix derived from that sum via cosine similarity.
    """
    for layer in model.layers:
        if '_biases' in layer.name or '_embeddings' in layer.name:
            np.save(file=f'{OUTPUT_FOLDER}{layer.name}', arr=layer.get_weights()[0])

    # save tokenizer; ``with`` closes the files, which the previous inline
    # ``open(...)`` calls never did
    with open(f'{OUTPUT_FOLDER}{INDEX2WORD}', 'wb') as index2word_file:
        pickle.dump(obj=tokenizer.index_word, file=index2word_file)
    with open(f'{OUTPUT_FOLDER}{WORD2INDEX}', 'wb') as word2index_file:
        pickle.dump(obj=tokenizer.word_index, file=word2index_file)

    # save combined embeddings & correlation matrix
    # (presumably the two .npy files were written by the layer loop above;
    # verify the layer names match CENTRAL_EMBEDDINGS / CONTEXT_EMBEDDINGS)
    agg_embeddings = np.load(f'{OUTPUT_FOLDER}{CENTRAL_EMBEDDINGS}.npy') + \
                     np.load(f'{OUTPUT_FOLDER}{CONTEXT_EMBEDDINGS}.npy')

    np.save(file=f'{OUTPUT_FOLDER}{AGGREGATED_EMBEDDINGS}', arr=agg_embeddings)
    # NOTE(review): cosine_similarity is applied twice (similarity of the
    # similarity rows). Kept as-is because it defines the saved artifact;
    # confirm this is intended rather than a single application.
    np.save(file=f'{OUTPUT_FOLDER}{CORRELATION_MATRIX}', arr=cosine_similarity(cosine_similarity(agg_embeddings)))

Source File: test_pairwise.py From twitter-stock-recommendation with MIT License

5 votes

def test_kernel_symmetry():
    """Check that each kernel gives a symmetric matrix for K(X, X)."""
    # Valid kernels should be symmetric
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    kernels = (linear_kernel, polynomial_kernel, rbf_kernel,
               laplacian_kernel, sigmoid_kernel, cosine_similarity)
    for kernel in kernels:
        gram = kernel(X, X)
        # Compare against the transpose to 15 decimals.
        assert_array_almost_equal(gram, gram.T, 15)

Source File: face_recognition.py From FindFaceInVideo with BSD 2-Clause "Simplified" License

5 votes

def compare_pic(feature1, feature2):
    """Return ``pw.cosine_similarity`` evaluated on the two feature inputs."""
    return pw.cosine_similarity(feature1, feature2)

Source File: test_bert_sentence_encoding.py From nlp-recipes with MIT License

5 votes

def test_sentence_encoding(tmp, data):
    """Check the relative ordering of similarities between encoded sentences.

    ``tmp`` is used as the encoder's cache directory and ``data`` is the
    input handed to ``encode``.
    """
    encoder_options = dict(
        language=Language.ENGLISH,
        num_gpus=0,
        to_lower=True,
        max_len=128,
        layer_index=-2,
        pooling_strategy=PoolingStrategy.MEAN,
        cache_dir=tmp,
    )
    encoder = BERTSentenceEncoder(**encoder_options)

    encoded = encoder.encode(data, as_numpy=False)
    vectors = encoded["values"].values.tolist()
    similarity = cosine_similarity(vectors)
    # Self-similarity must beat similarity to another sentence, and the
    # first sentence must be closer to the second than to the third.
    assert similarity[0, 0] > similarity[1, 0]
    assert similarity[0, 1] > similarity[0, 2]

Source File: test_pairwise.py From twitter-stock-recommendation with MIT License

5 votes

def test_kernel_sparse():
    """Check that every kernel gives the same result for dense and CSR input."""
    rng = np.random.RandomState(0)
    X = rng.random_sample((5, 4))
    X_sparse = csr_matrix(X)
    kernels = (linear_kernel, polynomial_kernel, rbf_kernel,
               laplacian_kernel, sigmoid_kernel, cosine_similarity)
    for kernel in kernels:
        dense_result = kernel(X, X)
        sparse_result = kernel(X_sparse, X_sparse)
        assert_array_almost_equal(dense_result, sparse_result)

Source File: inltk.py From inltk with MIT License

5 votes

def get_similar_sentences(sen: str, no_of_variations: int, language_code: str, degree_of_aug: float = 0.1):
    """Generate up to ``no_of_variations`` variations of ``sen``.

    For every token, candidate replacement tokens are picked by cosine
    similarity between the token's embedding and the rows of the encoder's
    'encoder.weight' matrix. Variations are built by replacing a random
    subset of token positions, and are then ranked by
    ``get_sentence_similarity`` against the original sentence.

    :param sen: sentence to vary
    :param no_of_variations: maximum number of sentences to return
    :param language_code: code of the language model to load
    :param degree_of_aug: fraction of token positions replaced in each
        variation (at least one position is always replaced)
    :return: list of sentences, most similar to ``sen`` first
    """
    check_input_language(language_code)
    # get embedding vectors for sen
    tok = LanguageTokenizer(language_code)
    token_ids = tok.numericalize(sen)
    embedding_vectors = get_embedding_vectors(sen, language_code)
    # get learner
    defaults.device = torch.device('cpu')
    path = Path(__file__).parent
    learn = load_learner(path / 'models' / f'{language_code}')
    encoder = get_model(learn.model)[0]
    encoder.reset()
    embeddings = np.array(encoder.state_dict()['encoder.weight'])
    # cos similarity of vectors
    scores = cosine_similarity(embedding_vectors, embeddings)

    # One candidate more than requested, because the token itself is
    # filtered out of its own candidate list below.
    n_candidates = no_of_variations + 1
    nearest_ids = [
        np.argpartition(-np.array(score), n_candidates)[:n_candidates].tolist()
        for score in scores
    ]
    word_ids = [
        [wid for wid in candidates if wid != token_ids[position]]
        for position, candidates in enumerate(nearest_ids)
    ]

    # generating more variations than required so that we can then filter out the best ones
    buffer_multiplicity = 2
    n_tokens = len(token_ids)
    new_sen_tokens = []
    for i in range(no_of_variations):
        for k in range(buffer_multiplicity):
            n_replaced = max(1, int(degree_of_aug * n_tokens))
            replaced = set(random.sample(range(n_tokens), n_replaced))
            variant = []
            for j in range(n_tokens):
                if j in replaced:
                    # Cycle through the candidates so different (i, k)
                    # combinations pick different replacements.
                    variant.append(word_ids[j][(i + k) % len(word_ids[j])])
                else:
                    variant.append(token_ids[j])
            new_sen_tokens.append([int(token) for token in variant])

    # Drop variations that decode back to the original sentence.
    candidates = [
        text
        for text in (tok.textify(sen_tokens) for sen_tokens in new_sen_tokens)
        if text != sen
    ]
    scored = [
        (candidate, get_sentence_similarity(sen, candidate, language_code))
        for candidate in candidates
    ]
    ranked = sorted(scored, key=lambda pair: pair[1], reverse=True)
    return [candidate for candidate, _ in ranked][:no_of_variations]

Source File: bossvs.py From pyts with BSD 3-Clause "New" or "Revised" License

5 votes

def decision_function(self, X):
        """Evaluate the cosine similarity between document-term matrix and X.

        Parameters
        ----------
        X : array-like, shape (n_samples, n_timestamps)
            Test samples.

        Returns
        -------
        X : array, shape (n_samples, n_classes)
            Cosine similarity between the document-term matrix and X.

        """
        check_is_fitted(self, ['vocabulary_', 'tfidf_', 'idf_', '_tfidf'])
        X = check_array(X, dtype='float64')
        n_samples, n_timestamps = X.shape

        # One row per window, across all samples.
        windows = _windowed_view(
            X, n_samples, n_timestamps, self._window_size, self._window_step
        ).reshape(-1, self._window_size)

        # Turn every window into a word by joining its symbols.
        symbols = self._sfa.transform(windows)
        words = np.asarray([''.join(row) for row in symbols])
        words = words.reshape(n_samples, self._n_windows)

        if self.numerosity_reduction:
            # Keep a word only when the following word differs; the extra
            # True column means the last word of a sample is always kept.
            differs_from_next = words[:, 1:] != words[:, :-1]
            keep = np.c_[differs_from_next, np.full(n_samples, True)]
            documents = [' '.join(sample_words[mask])
                         for sample_words, mask in zip(words, keep)]
        else:
            documents = [' '.join(sample_words) for sample_words in words]
        X_bow = np.asarray(documents)

        X_tf = self._tfidf.transform(X_bow).toarray()
        if self.idf_ is not None:
            # Divide the idf weights back out of the transformed counts.
            X_tf /= self.idf_
        return cosine_similarity(X_tf, self.tfidf_)

Source File: qmath.py From RecQ with GNU General Public License v3.0

5 votes

def cosine(x1,x2):
    """Return the cosine similarity between two vectors.

    :param x1: first vector; must support ``.dot``
    :param x2: second vector, same length as ``x1``
    :return: ``x1.x2 / sqrt((x1.x1) * (x2.x2))``, or ``0`` when either
        vector has zero norm
    """
    dot_product = x1.dot(x2)
    denom = sqrt(x1.dot(x1)*x2.dot(x2))
    # Test the denominator explicitly: catching ZeroDivisionError only
    # works when ``sqrt`` returns a Python float; a NumPy scalar would
    # silently yield nan instead of raising.
    if denom == 0:
        return 0
    return float(dot_product)/denom

Source File: fever_features.py From fever-naacl-2018 with Apache License 2.0

5 votes

def process(self,data):
        """Build the feature matrix for the claim/body pairs in ``data``.

        Claims and body texts are each transformed into term-frequency and
        tf-idf vectors; the per-pair cosine similarity of the tf-idf
        vectors is stacked next to the body and claim term frequencies.

        :param data: input passed to ``self.claims`` and ``self.texts``
        :return: ``hstack`` of body term frequencies, claim term
            frequencies and the cosine similarities, in that order
        """
        claim_counts = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_counts)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_counts = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_counts)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        # One similarity row per aligned (claim, body) pair.
        pair_rows = []
        for claim_vec, body_vec in zip(claim_tfidf, body_tfidf):
            pair_rows.append(cosine_similarity(claim_vec, body_vec)[0])
        cosines = np.array(pair_rows)

        return hstack([body_tfs,claim_tfs,cosines])

Source File: process_tfidf_grid.py From fever-naacl-2018 with Apache License 2.0

5 votes

def process(self, data):
        """Return the tf-idf cosine similarity of each claim/body pair in ``data``.

        :param data: input passed to ``self.claims`` and ``self.texts``
        :return: numpy array with one similarity row per pair
        """
        # The term-frequency transforms below are not part of the return
        # value; they are kept so the same vectorizer calls run as before.
        claim_counts = self.bow_vectorizer.transform(self.claims(data))
        claim_tfs = self.tfreq_vectorizer.transform(claim_counts)
        claim_tfidf = self.tfidf_vectorizer.transform(self.claims(data))

        body_texts = self.texts(data)
        body_counts = self.bow_vectorizer.transform(body_texts)
        body_tfs = self.tfreq_vectorizer.transform(body_counts)
        body_tfidf = self.tfidf_vectorizer.transform(body_texts)

        pair_rows = []
        for claim_vec, body_vec in zip(claim_tfidf, body_tfidf):
            pair_rows.append(cosine_similarity(claim_vec, body_vec)[0])
        return np.array(pair_rows)

Source File: feature_engineering.py From CIKM-AnalytiCup-2018 with Apache License 2.0

5 votes

def _create_weighted_distance_features(self, df):
        """Add a 'weighted_cosine_sim' column to ``df``, modifying it in place.

        The 'spn_1' and 'spn_2' columns are transformed with
        ``self.tfidf_vectorizer`` and the two resulting rows are compared
        with ``cs`` row by row.

        :param df: DataFrame with 'spn_1' and 'spn_2' columns
        """
        first_texts = df['spn_1'].values.tolist()
        q1_matrix = self.tfidf_vectorizer.transform(first_texts)
        second_texts = df['spn_2'].values.tolist()
        q2_matrix = self.tfidf_vectorizer.transform(second_texts)

        row_similarities = []
        for row in range(q1_matrix.shape[0]):
            row_similarities.append(cs(q1_matrix[row], q2_matrix[row]).flatten())
        df['weighted_cosine_sim'] = np.concatenate(row_similarities)

Source File: scorer.py From entity2vec with Apache License 2.0

5 votes

def similarity_function(vec1,vec2, similarity):
    """Compute the requested similarity between two vectors.

    :param vec1: first vector (any sequence accepted by ``np.array``)
    :param vec2: second vector
    :param similarity: one of 'cosine', 'softmax', 'linear_kernel',
        'euclidean'
    :return: the similarity value; ``0`` if either vector is empty, in
        which case the module-level ``count`` is incremented as well
    :raises NameError: if ``similarity`` is not one of the supported names
    """
    global count

    v1 = np.array(vec1)
    v2 = np.array(vec2)

    if len(v1)*len(v2) == 0:  # at least one of the two vectors is empty
        count += 1
        return 0

    # The pairwise helpers expect 2-D input, so each vector is wrapped as
    # a single row and the result comes back as [[value]]. The linear and
    # euclidean branches previously passed the 1-D arrays directly.
    if similarity == 'cosine':
        return cosine_similarity([v1],[v2])[0][0]

    if similarity == 'softmax':
        # normalization is useless for relative comparisons
        return np.exp(np.dot(v1,v2))

    if similarity == 'linear_kernel':
        return linear_kernel([v1],[v2])[0][0]

    if similarity == 'euclidean':
        return euclidean_distances([v1],[v2])[0][0]

    raise NameError('Choose a valid similarity function')

Source File: spine_sample.py From ikelos with MIT License

5 votes

def on_epoch_end(self, epoch, logs={}):
        """Print, for 10 randomly chosen spines, the spine with the highest cosine similarity.

        :param epoch: unused in this method
        :param logs: unused in this method
        """
        # Sample 10 distinct row indices of the spine embedding matrix.
        sampled = np.random.choice(len(self.spine_embedder), 10, False)
        similarity = cosine_similarity(self.spine_embedder[sampled], self.spine_embedder)
        closest = np.argmax(similarity, axis=-1)

        vocab = self.igor.vocabs.spines
        closest_spines = [vocab.lookup(index) for index in closest]
        sampled_spines = [vocab.lookup(index) for index in sampled]
        for source_spine, match_spine in zip(sampled_spines, closest_spines):
            print("SPINE: {}".format(self.decode(source_spine)))
            print("\t most similar to {}".format(self.decode(match_spine)))

Source File: dist_utils.py From kaggle-HomeDepot with MIT License

5 votes

def _cosine_sim(vec1, vec2):
    """Return the cosine similarity of two vectors, or the missing-value marker.

    The inputs are first reshaped to single rows; if that attempt raises,
    they are passed to ``cosine_similarity`` unchanged. If both attempts
    fail, ``config.MISSING_VALUE_NUMERIC`` is returned.

    :param vec1: first vector
    :param vec2: second vector
    :return: similarity value or ``config.MISSING_VALUE_NUMERIC``
    """
    try:
        s = cosine_similarity(vec1.reshape(1, -1), vec2.reshape(1, -1))[0][0]
    except Exception:
        # ``except Exception`` keeps the best-effort fallback while letting
        # KeyboardInterrupt/SystemExit propagate, unlike a bare ``except``.
        try:
            s = cosine_similarity(vec1, vec2)[0][0]
        except Exception:
            s = config.MISSING_VALUE_NUMERIC
    return s

Source File: tcga_benchmark.py From perfect_match with MIT License

5 votes

def get_centroid_weights(self, x):
        """Return the cosine similarity of ``x`` to every centroid.

        Each entry of ``self.centroids`` supplies, at position 0, the
        indices selecting the relevant part of ``x`` and, at position 1,
        the centroid vector that part is compared against.

        :param x: array indexable with each centroid's stored indices
        :return: the similarities with all length-1 axes squeezed out
        """
        # A list is built explicitly: on Python 3 ``map`` returns a lazy
        # iterator, which ``np.squeeze`` would wrap as a single object
        # instead of converting the similarities to an array.
        similarities = [
            cosine_similarity(x[entry[0]].reshape(1, -1),
                              entry[1].reshape(1, -1))
            for entry in self.centroids
        ]
        return np.squeeze(similarities)

Source File: twins_benchmark.py From perfect_match with MIT License

5 votes

def get_centroid_weights(self, x):
        """Return the cosine similarity of the standardised entry to every centroid.

        The values ``x[7:]`` are cast to float32, standardised with
        ``self.data_access.standardise_entry`` and compared with the
        element at position 0 of each entry in ``self.centroids``.

        :param x: sequence whose elements from index 7 onwards are used
        :return: the similarities with all length-1 axes squeezed out
        """
        # The query does not depend on the centroid, so it is standardised
        # once instead of once per centroid.
        query = self.data_access.standardise_entry(
            np.array(x[7:], dtype="float32")
        ).reshape((1, -1))
        # A list is built explicitly: on Python 3 ``map`` returns a lazy
        # iterator, which ``np.squeeze`` would wrap as a single object
        # instead of converting the similarities to an array.
        similarities = [
            cosine_similarity(query, entry[0].reshape((1, -1)))
            for entry in self.centroids
        ]
        return np.squeeze(similarities)

Source File: feature_axis.py From transparent_latent_gan with MIT License

5 votes

def plot_feature_cos_sim(feature_direction, feature_name=None):
    """
    Show a heat map of the pairwise cosine similarity between feature vectors.

    :param feature_direction: vectors, shape = (num_dimension, num_vector)
    :param feature_name:      list of names of features; defaults to the
                              column indices
    :return:                  cosine similarity matrix, shape = (num_vector, num_vector)
    """
    import matplotlib.pyplot as plt
    from sklearn.metrics.pairwise import cosine_similarity

    _, num_vector = feature_direction.shape
    if feature_name is None:
        feature_name = range(num_vector)

    # Vectors are stored as columns, so transpose to compare them.
    feature_cos_sim = cosine_similarity(feature_direction.transpose())

    # Symmetric colour limits keep zero at the centre of the colour map.
    c_lim_abs = np.max(np.abs(feature_cos_sim))

    cell_edges = np.arange(num_vector+1)
    plt.pcolormesh(cell_edges, cell_edges, feature_cos_sim,
                   vmin=-c_lim_abs, vmax=+c_lim_abs, cmap='coolwarm')
    plt.gca().invert_yaxis()
    plt.colorbar()

    # Put the tick labels at the centre of each cell.
    tick_positions = np.arange(num_vector) + 0.5
    plt.xticks(tick_positions, feature_name, fontsize='x-small', rotation='vertical')
    plt.yticks(tick_positions, feature_name, fontsize='x-small')
    plt.show()
    return feature_cos_sim

Source File: test_saxvsm.py From pyts with BSD 3-Clause "New" or "Revised" License

5 votes

def test_actual_results_strategy_uniform():
    """Test that the actual results are the expected ones."""
    # Data
    X = [[0, 0, 0, 1, 0, 0, 1, 1, 1],
         [0, 1, 1, 1, 0, 0, 1, 1, 1],
         [0, 0, 0, 1, 0, 0, 0, 1, 0]]
    y = [0, 0, 1]

    clf = SAXVSM(window_size=4, word_size=4, n_bins=2, strategy='uniform',
                 numerosity_reduction=False, sublinear_tf=False)
    decision_function_actual = clf.fit(X, y).decision_function(X)

    # Bag of words expected for each sample:
    # X_bow = ["aaab aaba abaa baab aabb abbb",
    #          "abbb bbba bbaa baab aabb abbb",
    #          "aaab aaba abaa baaa aaab aaba"]

    expected_words = ['aaab', 'aaba', 'aabb', 'abaa', 'abbb', 'baaa',
                      'baab', 'bbaa', 'bbba']
    assert clf.vocabulary_ == dict(enumerate(expected_words))

    # Word counts per sample (rows) and per class (rows of ``tf``).
    freq = np.asarray([[1, 1, 1, 1, 1, 0, 1, 0, 0],
                       [0, 0, 1, 0, 2, 0, 1, 1, 1],
                       [2, 2, 0, 1, 0, 1, 0, 0, 0]])
    tf = np.asarray([[1, 1, 2, 1, 3, 0, 2, 1, 1],
                     [2, 2, 0, 1, 0, 1, 0, 0, 0]])
    rare = log(2) + 1
    idf = np.asarray([1, 1, rare, 1, rare, rare, rare, rare, rare])

    decision_function_desired = cosine_similarity(freq, tf * idf[None, :])
    np.testing.assert_allclose(decision_function_actual,
                               decision_function_desired, atol=1e-5, rtol=0.)

    # The predicted class is the one with the highest similarity.
    pred_actual = clf.predict(X)
    pred_desired = decision_function_desired.argmax(axis=1)
    np.testing.assert_array_equal(pred_actual, pred_desired)

Python sklearn.metrics.pairwise.cosine_similarity() Examples