Java Code Examples for org.nd4j.linalg.ops.transforms.Transforms#cosineSim()
The following examples show how to use org.nd4j.linalg.ops.transforms.Transforms#cosineSim().
Each example notes the source file and open-source project it was taken from.
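As a quick orientation before the project examples, here is a minimal, self-contained sketch of what Transforms.cosineSim() computes: the dot product of the two vectors divided by the product of their L2 norms. The manual computation alongside the library call is purely illustrative and is not itself part of the ND4J API.

import org.nd4j.linalg.api.ndarray.INDArray;
import org.nd4j.linalg.factory.Nd4j;
import org.nd4j.linalg.ops.transforms.Transforms;

public class CosineSimSketch {
    public static void main(String[] args) {
        INDArray a = Nd4j.create(new double[] {1, 2, 3, 4});
        INDArray b = Nd4j.create(new double[] {4, 3, 2, 1});

        // Library call: cosine similarity, in [-1, 1]
        double sim = Transforms.cosineSim(a, b);

        // Equivalent manual computation: dot(a, b) / (||a|| * ||b||)
        double manual = Nd4j.getBlasWrapper().dot(a, b)
                        / (a.norm2Number().doubleValue() * b.norm2Number().doubleValue());

        System.out.println("cosineSim: " + sim + ", manual: " + manual); // both ~0.6667
    }
}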
Example 1
Source File: ParagraphVectors.java From deeplearning4j with Apache License 2.0
/**
 * This method returns the similarity of the document to the specified label, based on the mean
 * of the inferred document vector.
 *
 * @param document the document
 * @param label the label to compare against
 * @return the cosine similarity between the document and the label
 */
public double similarityToLabel(List<VocabWord> document, String label) {
    if (document.isEmpty())
        throw new IllegalStateException("Document has no words inside");

    INDArray docMean = inferVector(document);
    INDArray otherVec = getWordVectorMatrix(label);

    double sim = Transforms.cosineSim(docMean, otherVec);
    return sim;
}
Example 2
Source File: ParagraphVectors.java From deeplearning4j with Apache License 2.0
/**
 * Predicts several labels for the document, computing similarity
 * against the mean of the representations of the words in the document.
 *
 * @param document the document
 * @param limit the maximum number of labels to return
 * @return possible labels in descending order of similarity
 */
public Collection<String> predictSeveral(List<VocabWord> document, int limit) {
    /*
        This code was transferred from the original ParagraphVectors DL4J implementation,
        and has yet to be tested
     */
    if (document.isEmpty())
        throw new IllegalStateException("Document has no words inside");

    INDArray docMean = inferVector(document);

    Counter<String> distances = new Counter<>();

    for (String s : labelsSource.getLabels()) {
        INDArray otherVec = getWordVectorMatrix(s);
        double sim = Transforms.cosineSim(docMean, otherVec);
        log.debug("Similarity inside: [" + s + "] -> " + sim);
        distances.incrementCount(s, (float) sim);
    }

    val keys = distances.keySetSorted();
    return keys.subList(0, Math.min(limit, keys.size()));
}
Example 3
Source File: ParagraphVectors.java From deeplearning4j with Apache License 2.0
/**
 * Predicts the label of the document, computing similarity
 * against the mean of the representations of the words in the document.
 *
 * @param document the document
 * @return the label most similar to the document
 */
public String predict(List<VocabWord> document) {
    /*
        This code was transferred from the original ParagraphVectors DL4J implementation,
        and has yet to be tested
     */
    if (document.isEmpty())
        throw new IllegalStateException("Document has no words inside");

    INDArray docMean = inferVector(document);

    Counter<String> distances = new Counter<>();

    for (String s : labelsSource.getLabels()) {
        INDArray otherVec = getWordVectorMatrix(s);
        double sim = Transforms.cosineSim(docMean, otherVec);
        distances.incrementCount(s, (float) sim);
    }

    return distances.argMax();
}
Example 4
Source File: FlatModelUtils.java From deeplearning4j with Apache License 2.0
/**
 * This method does a full scan of the whole vocabulary, building a descending list of similar words.
 *
 * @param words the vector to find the nearest words for
 * @param top the number of words to return
 * @return the words nearest the given vector
 */
@Override
public Collection<String> wordsNearest(INDArray words, int top) {
    Counter<String> distances = new Counter<>();

    words = adjustRank(words);
    for (String s : vocabCache.words()) {
        INDArray otherVec = lookupTable.vector(s);
        double sim = Transforms.cosineSim(Transforms.unitVec(words.dup()), Transforms.unitVec(otherVec.dup()));
        distances.incrementCount(s, (float) sim);
    }

    distances.keepTopNElements(top);
    return distances.keySetSorted();
}
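A note on Example 4: cosine similarity is scale-invariant, so unit-normalizing both vectors with Transforms.unitVec() before calling cosineSim() should not change the result; the dup() calls guard the stored vectors against possible in-place modification by unitVec(). A small sketch of that equivalence, with arbitrary illustrative vectors:

INDArray v1 = Nd4j.create(new double[] {1, 2, 3});
INDArray v2 = Nd4j.create(new double[] {3, 2, 1});

// Scaling either input leaves the cosine similarity unchanged...
double raw = Transforms.cosineSim(v1.mul(10), v2);

// ...so comparing unit vectors gives the same value
double unit = Transforms.cosineSim(Transforms.unitVec(v1.dup()), Transforms.unitVec(v2.dup()));

System.out.println(raw + " == " + unit); // both ~0.7143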
Example 5
Source File: Word2VecTests.java From deeplearning4j with Apache License 2.0
/**
 * Test for cosine similarity, to track changes in Transforms.cosineSim()
 */
@Test
public void testCosineSim() {
    double[] array1 = new double[] {1.01, 0.91, 0.81, 0.71};
    double[] array2 = new double[] {1.01, 0.91, 0.81, 0.71};
    double[] array3 = new double[] {1.0, 0.9, 0.8, 0.7};

    double sim12 = Transforms.cosineSim(Nd4j.create(array1), Nd4j.create(array2));
    double sim23 = Transforms.cosineSim(Nd4j.create(array2), Nd4j.create(array3));
    log.info("Arrays 1/2 cosineSim: " + sim12);
    log.info("Arrays 2/3 cosineSim: " + sim23);
    log.info("Arrays 1/2 dot: " + Nd4j.getBlasWrapper().dot(Nd4j.create(array1), Nd4j.create(array2)));
    log.info("Arrays 2/3 dot: " + Nd4j.getBlasWrapper().dot(Nd4j.create(array2), Nd4j.create(array3)));

    assertEquals(1.0d, sim12, 0.01d);
    assertEquals(0.99d, sim23, 0.01d);
}
Example 6
Source File: OpExecutionerTests.java From deeplearning4j with Apache License 2.0
@Test
public void testCosineSimilarity() {
    INDArray vec1 = Nd4j.create(new float[] {1, 2, 3, 4, 5});
    INDArray vec2 = Nd4j.create(new float[] {1, 2, 3, 4, 5});
    double sim = Transforms.cosineSim(vec1, vec2);
    assertEquals(getFailureMessage(), 1, sim, 1e-1);
}
Example 7
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0
@Deprecated
private double arraysSimilarity(@NonNull INDArray array1, @NonNull INDArray array2) {
    if (array1.equals(array2))
        return 1.0;

    INDArray vector = Transforms.unitVec(array1);
    INDArray vector2 = Transforms.unitVec(array2);

    if (vector == null || vector2 == null)
        return -1;

    return Transforms.cosineSim(vector, vector2);
}
Example 8
Source File: ParagraphVectors.java From deeplearning4j with Apache License 2.0
/**
 * This method returns the top N labels nearest to the specified features vector.
 *
 * @param labelVector the features vector
 * @param topN the number of labels to return
 * @return up to topN labels, in descending order of similarity
 */
public Collection<String> nearestLabels(INDArray labelVector, int topN) {
    if (labelsMatrix == null || labelsList == null || labelsList.isEmpty())
        extractLabels();

    List<BasicModelUtils.WordSimilarity> result = new ArrayList<>();

    // if the list is still empty - return an empty collection
    if (labelsMatrix == null || labelsList == null || labelsList.isEmpty()) {
        log.warn("Labels list is empty!");
        return new ArrayList<>();
    }

    if (!normalizedLabels) {
        synchronized (this) {
            if (!normalizedLabels) {
                labelsMatrix.diviColumnVector(labelsMatrix.norm1(1));
                normalizedLabels = true;
            }
        }
    }

    INDArray similarity = Transforms.unitVec(labelVector).mmul(labelsMatrix.transpose());
    List<Double> highToLowSimList = getTopN(similarity, topN + 20);

    for (int i = 0; i < highToLowSimList.size(); i++) {
        String word = labelsList.get(highToLowSimList.get(i).intValue()).getLabel();
        if (word != null && !word.equals("UNK") && !word.equals("STOP")) {
            INDArray otherVec = lookupTable.vector(word);
            double sim = Transforms.cosineSim(labelVector, otherVec);
            result.add(new BasicModelUtils.WordSimilarity(word, sim));
        }
    }

    Collections.sort(result, new BasicModelUtils.SimilarityComparator());

    return BasicModelUtils.getLabels(result, topN);
}
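Example 8 is worth a second look: instead of looping cosineSim() over every label, it normalizes the label matrix once and uses a single matrix multiply against the unit query vector to score all labels in one pass, recomputing exact cosineSim() only for the shortlisted candidates. Below is a minimal sketch of the batch step, with hypothetical dimensions; it uses L2 (norm2) row normalization, which makes the mmul output exact cosines (the code above normalizes by norm1 and therefore re-ranks with cosineSim() afterwards).

int numLabels = 50, layerSize = 100;
INDArray labels = Nd4j.rand(numLabels, layerSize);
INDArray query = Nd4j.rand(1, layerSize);

// L2-normalize every row of the label matrix, and the query vector
labels.diviColumnVector(labels.norm2(1));
INDArray q = Transforms.unitVec(query.dup());

// One mmul produces all cosine similarities at once: shape [1, numLabels]
INDArray sims = q.mmul(labels.transpose());

// Entry i matches Transforms.cosineSim(query, labels.getRow(i)) up to rounding
System.out.println(sims);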
Example 9
Source File: BasicModelUtils.java From deeplearning4j with Apache License 2.0
/**
 * Returns the similarity of two words. The result is in the range [-1, 1], where -1.0 means the
 * two word vectors point in exactly opposite directions and 1.0 means they match exactly.
 * In practice you will mostly see values in the range [0, 1], but that depends on the training corpus.
 *
 * Returns NaN if either label does not exist in the vocabulary, or if either label is null.
 *
 * @param label1 the first word
 * @param label2 the second word
 * @return a normalized similarity (cosine similarity)
 */
@Override
public double similarity(@NonNull String label1, @NonNull String label2) {
    if (label1 == null || label2 == null) {
        log.debug("LABELS: " + label1 + ": " + (label1 == null ? "null" : EXISTS) + ";" + label2 + " vec2:"
                        + (label2 == null ? "null" : EXISTS));
        return Double.NaN;
    }

    if (!vocabCache.hasToken(label1)) {
        log.debug("Unknown token 1 requested: [{}]", label1);
        return Double.NaN;
    }

    if (!vocabCache.hasToken(label2)) {
        log.debug("Unknown token 2 requested: [{}]", label2);
        return Double.NaN;
    }

    INDArray vec1 = lookupTable.vector(label1).dup();
    INDArray vec2 = lookupTable.vector(label2).dup();

    if (vec1 == null || vec2 == null) {
        log.debug(label1 + ": " + (vec1 == null ? "null" : EXISTS) + ";" + label2 + " vec2:"
                        + (vec2 == null ? "null" : EXISTS));
        return Double.NaN;
    }

    if (label1.equals(label2))
        return 1.0;

    return Transforms.cosineSim(vec1, vec2);
}
Example 10
Source File: NDArrayTestsFortran.java From deeplearning4j with Apache License 2.0
@Test
public void testCosineSim() {
    INDArray vec1 = Nd4j.create(new double[] {1, 2, 3, 4});
    INDArray vec2 = Nd4j.create(new double[] {1, 2, 3, 4});
    double sim = Transforms.cosineSim(vec1, vec2);
    assertEquals(getFailureMessage(), 1, sim, 1e-1);

    INDArray vec3 = Nd4j.create(new float[] {0.2f, 0.3f, 0.4f, 0.5f});
    INDArray vec4 = Nd4j.create(new float[] {0.6f, 0.7f, 0.8f, 0.9f});
    sim = Transforms.cosineSim(vec3, vec4);
    assertEquals(getFailureMessage(), 0.98, sim, 1e-1);
}
Example 11
Source File: NDArrayDistanceTransform.java From DataVec with Apache License 2.0
@Override
public List<Writable> map(List<Writable> writables) {
    int idxFirst = inputSchema.getIndexOfColumn(firstCol);
    int idxSecond = inputSchema.getIndexOfColumn(secondCol);

    INDArray arr1 = ((NDArrayWritable) writables.get(idxFirst)).get();
    INDArray arr2 = ((NDArrayWritable) writables.get(idxSecond)).get();

    double d;
    switch (distance) {
        case COSINE:
            d = Transforms.cosineSim(arr1, arr2);
            break;
        case EUCLIDEAN:
            d = Transforms.euclideanDistance(arr1, arr2);
            break;
        case MANHATTAN:
            d = Transforms.manhattanDistance(arr1, arr2);
            break;
        default:
            throw new UnsupportedOperationException("Unknown or not supported distance metric: " + distance);
    }

    List<Writable> out = new ArrayList<>(writables.size() + 1);
    out.addAll(writables);
    out.add(new DoubleWritable(d));
    return out;
}
Example 12
Source File: OpExecutionerTestsC.java From deeplearning4j with Apache License 2.0
@Test
public void testCosineSimilarity() {
    INDArray vec1 = Nd4j.create(new float[] {1, 2, 3, 4, 5});
    INDArray vec2 = Nd4j.create(new float[] {1, 2, 3, 4, 5});
    double sim = Transforms.cosineSim(vec1, vec2);
    assertEquals(getFailureMessage(), 1, sim, 1e-1);
}
Example 13
Source File: NDArrayDistanceTransform.java From deeplearning4j with Apache License 2.0
@Override
public List<Writable> map(List<Writable> writables) {
    int idxFirst = inputSchema.getIndexOfColumn(firstCol);
    int idxSecond = inputSchema.getIndexOfColumn(secondCol);

    INDArray arr1 = ((NDArrayWritable) writables.get(idxFirst)).get();
    INDArray arr2 = ((NDArrayWritable) writables.get(idxSecond)).get();

    double d;
    switch (distance) {
        case COSINE:
            d = Transforms.cosineSim(arr1, arr2);
            break;
        case EUCLIDEAN:
            d = Transforms.euclideanDistance(arr1, arr2);
            break;
        case MANHATTAN:
            d = Transforms.manhattanDistance(arr1, arr2);
            break;
        default:
            throw new UnsupportedOperationException("Unknown or not supported distance metric: " + distance);
    }

    List<Writable> out = new ArrayList<>(writables.size() + 1);
    out.addAll(writables);
    out.add(new DoubleWritable(d));
    return out;
}
Example 14
Source File: NDArrayTestsFortran.java From nd4j with Apache License 2.0
@Test
public void testCosineSim() {
    INDArray vec1 = Nd4j.create(new double[] {1, 2, 3, 4});
    INDArray vec2 = Nd4j.create(new double[] {1, 2, 3, 4});
    double sim = Transforms.cosineSim(vec1, vec2);
    assertEquals(getFailureMessage(), 1, sim, 1e-1);

    INDArray vec3 = Nd4j.create(new float[] {0.2f, 0.3f, 0.4f, 0.5f});
    INDArray vec4 = Nd4j.create(new float[] {0.6f, 0.7f, 0.8f, 0.9f});
    sim = Transforms.cosineSim(vec3, vec4);
    assertEquals(getFailureMessage(), 0.98, sim, 1e-1);
}
Example 15
Source File: OpExecutionerTestsC.java From nd4j with Apache License 2.0
@Test
public void testCosineSimilarity() {
    INDArray vec1 = Nd4j.create(new float[] {1, 2, 3, 4, 5});
    INDArray vec2 = Nd4j.create(new float[] {1, 2, 3, 4, 5});
    double sim = Transforms.cosineSim(vec1, vec2);
    assertEquals(getFailureMessage(), 1, sim, 1e-1);
}
Example 16
Source File: CudaReduce3Tests.java From nd4j with Apache License 2.0
/**
 * Norm2 + cuBLAS dot call
 *
 * @throws Exception
 */
@Test
public void testPinnedCosineSim() throws Exception {
    // simple way to stop test if we're not on CUDA backend here
    INDArray array1 = Nd4j.create(new float[] {2.01f, 2.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f,
                    1.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f});
    INDArray array2 = Nd4j.create(new float[] {1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,
                    1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f});

    double similarity = Transforms.cosineSim(array1, array2);
    System.out.println("Cosine similarity: " + similarity);

    assertEquals(0.95f, similarity, 0.01f);
}
Example 17
Source File: HalfOpsTests.java From nd4j with Apache License 2.0
@Ignore
@Test
public void testReduce3_2() throws Exception {
    INDArray array1 = Nd4j.create(new float[] {2.01f, 2.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f,
                    1.01f, 1.01f, 1.01f, 1.01f, 1.01f, 1.01f});
    INDArray array2 = Nd4j.create(new float[] {1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f,
                    1.00f, 1.00f, 1.00f, 1.00f, 1.00f, 1.00f});

    double similarity = Transforms.cosineSim(array1, array2);
    System.out.println("Cosine similarity: " + similarity);

    assertEquals(0.95f, similarity, 0.01f);
}
Example 18
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testParagraphVectorsDBOW() throws Exception {
    skipUnlessIntegrationTests();

    File file = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).seed(119).epochs(1)
                    .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter)
                    .trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).negativeSample(0)
                    .allowParallelTokenization(true).useHierarchicSoftmax(true).sampling(0).workers(4)
                    .usePreciseWeightInit(true).sequenceLearningAlgorithm(new DBOW<VocabWord>()).build();

    vec.fit();

    assertFalse(((InMemoryLookupTable<VocabWord>) vec.getLookupTable()).getSyn0().isAttached());
    assertFalse(((InMemoryLookupTable<VocabWord>) vec.getLookupTable()).getSyn1().isAttached());

    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");

    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    // assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    // assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    // assertTrue(similarity3 > 0.6d);

    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);

    // testing DM inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");

    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));

    assertNotEquals(inferredA1, inferredC1);

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());

    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);
}
Example 19
Source File: TestNDArrayWritableTransforms.java From deeplearning4j with Apache License 2.0
@Test
public void testNDArrayDistanceTransform() {
    Schema s = new Schema.Builder()
                    .addColumnDouble("col0").addColumnNDArray("col1", new long[] {1, 10})
                    .addColumnNDArray("col2", new long[] {1, 10}).build();

    TransformProcess tp = new TransformProcess.Builder(s)
                    .ndArrayDistanceTransform("dist", Distance.COSINE, "col1", "col2").build();

    List<String> expColNames = Arrays.asList("col0", "col1", "col2", "dist");
    assertEquals(expColNames, tp.getFinalSchema().getColumnNames());

    Nd4j.getRandom().setSeed(12345);
    INDArray arr1 = Nd4j.rand(1, 10);
    INDArray arr2 = Nd4j.rand(1, 10);
    double cosine = Transforms.cosineSim(arr1, arr2);

    List<Writable> in = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(arr1.dup()),
                    new NDArrayWritable(arr2.dup()));
    List<Writable> out = tp.execute(in);

    List<Writable> exp = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(arr1),
                    new NDArrayWritable(arr2), new DoubleWritable(cosine));
    assertEquals(exp, out);
}
Example 20
Source File: TestNDArrayWritableTransforms.java From DataVec with Apache License 2.0
@Test
public void testNDArrayDistanceTransform() {
    Schema s = new Schema.Builder()
                    .addColumnDouble("col0").addColumnNDArray("col1", new long[] {1, 10})
                    .addColumnNDArray("col2", new long[] {1, 10}).build();

    TransformProcess tp = new TransformProcess.Builder(s)
                    .ndArrayDistanceTransform("dist", Distance.COSINE, "col1", "col2").build();

    List<String> expColNames = Arrays.asList("col0", "col1", "col2", "dist");
    assertEquals(expColNames, tp.getFinalSchema().getColumnNames());

    Nd4j.getRandom().setSeed(12345);
    INDArray arr1 = Nd4j.rand(1, 10);
    INDArray arr2 = Nd4j.rand(1, 10);
    double cosine = Transforms.cosineSim(arr1, arr2);

    List<Writable> in = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(arr1.dup()),
                    new NDArrayWritable(arr2.dup()));
    List<Writable> out = tp.execute(in);

    List<Writable> exp = Arrays.<Writable>asList(new DoubleWritable(0), new NDArrayWritable(arr1),
                    new NDArrayWritable(arr2), new DoubleWritable(cosine));
    assertEquals(exp, out);
}