org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable Java Examples
The following examples show how to use
org.deeplearning4j.models.embeddings.inmemory.InMemoryLookupTable.
The project and source file each example was taken from are listed above it.
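Before the examples, here is a minimal sketch of the class's basic lifecycle: build the table over a vocabulary cache, initialize the weights, and read back the syn0 weight matrix. It only uses builder calls and accessors that appear in the examples below; the vector length and learning rate are arbitrary placeholder values, and the cache is assumed to already contain vocabulary words.

// Minimal sketch (placeholder values; assumes the cache has been populated with words)
InMemoryLookupCache cache = new InMemoryLookupCache();
WeightLookupTable<VocabWord> table = new InMemoryLookupTable.Builder<VocabWord>()
        .vectorLength(100)   // dimensionality of each word vector (placeholder)
        .useAdaGrad(false)
        .cache(cache)        // vocabulary cache backing the table
        .lr(0.025f)          // initial learning rate (placeholder)
        .build();
table.resetWeights();        // allocate and randomly initialize syn0
// syn0 holds one row per vocabulary word, one column per vector dimension
INDArray syn0 = ((InMemoryLookupTable<VocabWord>) table).getSyn0();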
Example #1
Source File: TSNEVisualizationExample.java From Java-Deep-Learning-Cookbook with MIT License
public static void main(String[] args) throws IOException {
    Nd4j.setDataType(DataBuffer.Type.DOUBLE);
    List<String> cacheList = new ArrayList<>();
    File file = new File("words.txt");
    String outputFile = "tsne-standard-coords.csv";

    Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(file);
    VocabCache cache = vectors.getSecond();
    INDArray weights = vectors.getFirst().getSyn0();
    for (int i = 0; i < cache.numWords(); i++) {
        cacheList.add(cache.wordAtIndex(i));
    }

    BarnesHutTsne tsne = new BarnesHutTsne.Builder()
            .setMaxIter(100)
            .theta(0.5)
            .normalize(false)
            .learningRate(500)
            .useAdaGrad(false)
            .build();
    tsne.fit(weights);
    tsne.saveAsFile(cacheList, outputFile);
}
Example #2
Source File: DM.java From deeplearning4j with Apache License 2.0
@Override
public void configure(@NonNull VocabCache<T> vocabCache, @NonNull WeightLookupTable<T> lookupTable,
                      @NonNull VectorsConfiguration configuration) {
    this.vocabCache = vocabCache;
    this.lookupTable = lookupTable;
    this.configuration = configuration;

    cbow.configure(vocabCache, lookupTable, configuration);

    this.window = configuration.getWindow();
    this.useAdaGrad = configuration.isUseAdaGrad();
    this.negative = configuration.getNegative();
    this.sampling = configuration.getSampling();

    this.syn0 = ((InMemoryLookupTable<T>) lookupTable).getSyn0();
    this.syn1 = ((InMemoryLookupTable<T>) lookupTable).getSyn1();
    this.syn1Neg = ((InMemoryLookupTable<T>) lookupTable).getSyn1Neg();
    this.expTable = ((InMemoryLookupTable<T>) lookupTable).getExpTable();
    this.table = ((InMemoryLookupTable<T>) lookupTable).getTable();
}
Example #3
Source File: BasicModelUtils.java From deeplearning4j with Apache License 2.0
protected INDArray adjustRank(INDArray words) {
    if (lookupTable instanceof InMemoryLookupTable) {
        InMemoryLookupTable l = (InMemoryLookupTable) lookupTable;
        INDArray syn0 = l.getSyn0();
        if (!words.dataType().equals(syn0.dataType())) {
            return words.castTo(syn0.dataType());
        }
        if (words.rank() == 0 || words.rank() > 2) {
            throw new IllegalStateException("Invalid rank for wordsNearest method");
        } else if (words.rank() == 1) {
            return words.reshape(1, -1);
        }
    }
    return words;
}
Example #4
Source File: WordVectorsImpl.java From deeplearning4j with Apache License 2.0
/**
 * This method returns a 2D array, where each row represents the corresponding label
 *
 * @param labels
 * @return
 */
@Override
public INDArray getWordVectors(@NonNull Collection<String> labels) {
    int[] indexes = new int[labels.size()];
    int cnt = 0;
    boolean useIndexUnknown = useUnknown && vocab.containsWord(getUNK());

    for (String label : labels) {
        if (vocab.containsWord(label)) {
            indexes[cnt] = vocab.indexOf(label);
        } else {
            indexes[cnt] = useIndexUnknown ? vocab.indexOf(getUNK()) : -1;
        }
        cnt++;
    }

    // Drop labels that were not found in the vocabulary
    while (ArrayUtils.contains(indexes, -1)) {
        indexes = ArrayUtils.removeElement(indexes, -1);
    }

    if (indexes.length == 0) {
        return Nd4j.empty(((InMemoryLookupTable) lookupTable).getSyn0().dataType());
    }

    INDArray result = Nd4j.pullRows(lookupTable.getWeights(), 1, indexes);
    return result;
}
Example #5
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0
/**
 * This method tests CSV file loading via the unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    // we're trying the EXTENDED model, but the file doesn't have syn1/huffman info,
    // so it should be silently degraded to the simplified model
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
Example #6
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0
@Test
public void testUnifiedLoaderArchive2() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());
    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, true);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
    assertNotEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
Example #7
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0
@Test
public void testUnifiedLoaderArchive1() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());
    File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
    WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, false);

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
    INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);

    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
    assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1Neg());
}
Example #8
Source File: Word2VecParam.java From deeplearning4j with Apache License 2.0
public Word2VecParam(boolean useAdaGrad, double negative, int numWords, INDArray table, int window,
                     AtomicLong nextRandom, double alpha, double minAlpha, int totalWords, int lastChecked,
                     Broadcast<AtomicLong> wordCount, InMemoryLookupTable weights, int vectorLength,
                     Broadcast<double[]> expTable) {
    this.useAdaGrad = useAdaGrad;
    this.negative = negative;
    this.numWords = numWords;
    this.table = table;
    this.window = window;
    this.nextRandom = nextRandom;
    this.alpha = alpha;
    this.minAlpha = minAlpha;
    this.totalWords = totalWords;
    this.lastChecked = lastChecked;
    this.wordCount = wordCount;
    this.weights = weights;
    this.vectorLength = vectorLength;
    this.expTable = expTable;
}
Example #9
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(textFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);

    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
Example #10
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectorSerializer.writeWordVectors(lookupTable, lookupCache, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
    assertTrue(wordVector1.length == 300);
    assertTrue(wordVector2.length == 300);
    assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
    assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
Example #11
Source File: SequenceVectors.java From deeplearning4j with Apache License 2.0
private void initIntersectVectors() {
    if (intersectModel != null && intersectModel.vocab().numWords() > 0) {
        List<Integer> indexes = new ArrayList<>();
        for (int i = 0; i < intersectModel.vocab().numWords(); ++i) {
            String externalWord = intersectModel.vocab().wordAtIndex(i);
            int index = this.vocab.indexOf(externalWord);
            if (index >= 0) {
                this.vocab.wordFor(externalWord).setLocked(lockFactor);
                indexes.add(index);
            }
        }

        if (indexes.size() > 0) {
            int[] intersectIndexes = Ints.toArray(indexes);

            // Copy the rows of the intersect model's syn0 into this model's syn0 at the matched indexes
            Nd4j.scatterUpdate(org.nd4j.linalg.api.ops.impl.scatter.ScatterUpdate.UpdateOp.ASSIGN,
                    ((InMemoryLookupTable<VocabWord>) lookupTable).getSyn0(),
                    Nd4j.createFromArray(intersectIndexes),
                    ((InMemoryLookupTable<VocabWord>) intersectModel.lookupTable()).getSyn0(),
                    1);
        }
    }
}
Example #12
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0
/**
 * Writes the word vectors to the given path. Note that this assumes an in-memory cache.
 *
 * @param lookupTable
 * @param cache
 * @param path the path to write
 * @throws IOException
 * @deprecated Use {@link #writeWord2VecModel(Word2Vec, File)} instead
 */
@Deprecated
public static void writeWordVectors(InMemoryLookupTable lookupTable, InMemoryLookupCache cache,
                                    String path) throws IOException {
    try (BufferedWriter write = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(path, false), StandardCharsets.UTF_8))) {
        for (int i = 0; i < lookupTable.getSyn0().rows(); i++) {
            String word = cache.wordAtIndex(i);
            if (word == null) {
                continue;
            }
            StringBuilder sb = new StringBuilder();
            sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT));
            sb.append(" ");
            INDArray wordVector = lookupTable.vector(word);
            for (int j = 0; j < wordVector.length(); j++) {
                sb.append(wordVector.getDouble(j));
                if (j < wordVector.length() - 1) {
                    sb.append(" ");
                }
            }
            sb.append("\n");
            write.write(sb.toString());
        }
    }
}
Example #13
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0
/**
 * Load word vectors from the given pair
 *
 * @param pair the given pair
 * @return a read-only word vectors impl based on the given lookup table and vocab
 */
public static Word2Vec fromPair(Pair<InMemoryLookupTable, VocabCache> pair) {
    Word2Vec vectors = new Word2Vec();
    vectors.setLookupTable(pair.getFirst());
    vectors.setVocab(pair.getSecond());
    vectors.setModelUtils(new BasicModelUtils());
    return vectors;
}
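As a usage sketch, fromPair composes directly with loadTxt (Example #15 below): the pair the loader returns is exactly the argument fromPair expects. The file name and query word here are placeholders.

// Restore a text-format model and wrap it as a read-only Word2Vec
Pair<InMemoryLookupTable, VocabCache> pair = WordVectorSerializer.loadTxt(new File("vectors.txt"));
Word2Vec readOnly = WordVectorSerializer.fromPair(pair);
double[] vector = readOnly.getWordVector("day");   // query the restored model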
Example #14
Source File: ParagraphVectors.java From deeplearning4j with Apache License 2.0
/**
 * This method allows you to use a pre-built WordVectors model (e.g. Word2Vec) for ParagraphVectors.
 * The existing model will be transferred into the new model before training starts.
 *
 * PLEASE NOTE: A non-normalized model is recommended here.
 *
 * @param vec existing WordVectors model
 * @return
 */
@Override
@SuppressWarnings("unchecked")
public Builder useExistingWordVectors(@NonNull WordVectors vec) {
    if (((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1() == null
            && ((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1Neg() == null)
        throw new ND4JIllegalStateException("Model being passed as existing has no syn1/syn1Neg available");

    this.existingVectors = vec;
    return this;
}
Example #15
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0
public static Pair<InMemoryLookupTable, VocabCache> loadTxt(@NonNull File file) {
    try (InputStream inputStream = fileStream(file)) {
        return loadTxt(inputStream);
    } catch (IOException readTestException) {
        throw new RuntimeException(readTestException);
    }
}
Example #16
Source File: Word2VecRawTextExample.java From Java-Data-Science-Cookbook with MIT License
public static void main(String[] args) throws Exception {
    // Path to the text file
    String filePath = "c:/raw_sentences.txt";

    log.info("Load & Vectorize Sentences....");
    // Strip white space before and after each line
    SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath);
    // Split on white space in the line to get words
    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    InMemoryLookupCache cache = new InMemoryLookupCache();
    WeightLookupTable table = new InMemoryLookupTable.Builder()
            .vectorLength(100)
            .useAdaGrad(false)
            .cache(cache)
            .lr(0.025f).build();

    log.info("Building model....");
    Word2Vec vec = new Word2Vec.Builder()
            .minWordFrequency(5).iterations(1)
            .layerSize(100).lookupTable(table)
            .stopWords(new ArrayList<String>())
            .vocabCache(cache).seed(42)
            .windowSize(5).iterate(iter).tokenizerFactory(t).build();

    log.info("Fitting Word2Vec model....");
    vec.fit();

    log.info("Writing word vectors to text file....");
    // Write word vectors
    WordVectorSerializer.writeWordVectors(vec, "word2vec.txt");

    log.info("Closest Words:");
    Collection<String> lst = vec.wordsNearest("man", 5);
    System.out.println(lst);
    double cosSim = vec.similarity("cruise", "voyage");
    System.out.println(cosSim);
}
Example #17
Source File: CBOW.java From deeplearning4j with Apache License 2.0
@Override
public void configure(@NonNull VocabCache<T> vocabCache, @NonNull WeightLookupTable<T> lookupTable,
                      @NonNull VectorsConfiguration configuration) {
    this.vocabCache = vocabCache;
    this.lookupTable = lookupTable;
    this.configuration = configuration;

    this.window = configuration.getWindow();
    this.useAdaGrad = configuration.isUseAdaGrad();
    this.negative = configuration.getNegative();
    this.sampling = configuration.getSampling();

    if (configuration.getNegative() > 0) {
        if (((InMemoryLookupTable<T>) lookupTable).getSyn1Neg() == null) {
            logger.info("Initializing syn1Neg...");
            ((InMemoryLookupTable<T>) lookupTable).setUseHS(configuration.isUseHierarchicSoftmax());
            ((InMemoryLookupTable<T>) lookupTable).setNegative(configuration.getNegative());
            ((InMemoryLookupTable<T>) lookupTable).resetWeights(false);
        }
    }

    this.syn0 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn0());
    this.syn1 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1());
    this.syn1Neg = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1Neg());
    //this.expTable = new DeviceLocalNDArray(Nd4j.create(((InMemoryLookupTable<T>) lookupTable).getExpTable()));
    this.expTable = new DeviceLocalNDArray(Nd4j.create(((InMemoryLookupTable<T>) lookupTable).getExpTable(),
            new long[]{((InMemoryLookupTable<T>) lookupTable).getExpTable().length},
            syn0.get().dataType()));
    this.table = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getTable());

    this.variableWindows = configuration.getVariableWindows();
}
Example #18
Source File: SkipGram.java From deeplearning4j with Apache License 2.0
/**
 * SkipGram initialization over the given vocabulary and WeightLookupTable
 *
 * @param vocabCache
 * @param lookupTable
 * @param configuration
 */
@Override
public void configure(@NonNull VocabCache<T> vocabCache, @NonNull WeightLookupTable<T> lookupTable,
                      @NonNull VectorsConfiguration configuration) {
    this.vocabCache = vocabCache;
    this.lookupTable = lookupTable;
    this.configuration = configuration;

    if (configuration.getNegative() > 0) {
        if (((InMemoryLookupTable<T>) lookupTable).getSyn1Neg() == null) {
            log.info("Initializing syn1Neg...");
            ((InMemoryLookupTable<T>) lookupTable).setUseHS(configuration.isUseHierarchicSoftmax());
            ((InMemoryLookupTable<T>) lookupTable).setNegative(configuration.getNegative());
            ((InMemoryLookupTable<T>) lookupTable).resetWeights(false);
        }
    }

    this.syn0 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn0());
    this.syn1 = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1());
    this.syn1Neg = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1Neg());
    this.expTable = new DeviceLocalNDArray(Nd4j.create(((InMemoryLookupTable<T>) lookupTable).getExpTable(),
            new long[]{((InMemoryLookupTable<T>) lookupTable).getExpTable().length},
            syn0.get().dataType()));
    this.table = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getTable());

    this.window = configuration.getWindow();
    this.useAdaGrad = configuration.isUseAdaGrad();
    this.negative = configuration.getNegative();
    this.sampling = configuration.getSampling();
    this.variableWindows = configuration.getVariableWindows();
    this.vectorLength = configuration.getLayersSize();
}
Example #19
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0
/**
 * This method loads a Word2Vec model from a CSV file
 *
 * @param inputStream input stream
 * @return Word2Vec model
 */
public static Word2Vec readAsCsv(@NonNull InputStream inputStream) {
    VectorsConfiguration configuration = new VectorsConfiguration();

    // let's try to load this file as a CSV file
    try {
        log.debug("Trying CSV model restoration...");

        Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(inputStream);
        Word2Vec.Builder builder = new Word2Vec
                .Builder()
                .lookupTable(pair.getFirst())
                .useAdaGrad(false)
                .vocabCache(pair.getSecond())
                .layerSize(pair.getFirst().layerSize())
                // we don't use hs here, because the model is incomplete
                .useHierarchicSoftmax(false)
                .resetModel(false);

        TokenizerFactory factory = getTokenizerFactory(configuration);
        if (factory != null) {
            builder.tokenizerFactory(factory);
        }

        return builder.build();
    } catch (Exception ex) {
        throw new RuntimeException("Unable to load model in CSV format");
    }
}
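Since readAsCsv takes any InputStream, a plain FileInputStream is enough. A minimal usage sketch follows; the file path is a placeholder, and the surrounding method is assumed to declare IOException.

// Placeholder path; any word2vec-style CSV/text dump works
try (InputStream is = new FileInputStream("vectors.csv")) {
    Word2Vec model = WordVectorSerializer.readAsCsv(is);
    System.out.println(model.wordsNearest("day", 5));   // same query API as in Example #16
}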
Example #20
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0
/**
 * This method saves a table of weights to a file
 *
 * @param weightLookupTable WeightLookupTable
 * @param file File
 */
public static <T extends SequenceElement> void writeLookupTable(WeightLookupTable<T> weightLookupTable,
                                                                @NonNull File file) throws IOException {
    try (BufferedWriter writer = new BufferedWriter(
            new OutputStreamWriter(new FileOutputStream(file), StandardCharsets.UTF_8))) {
        int numWords = weightLookupTable.getVocabCache().numWords();
        int layersSize = weightLookupTable.layerSize();
        long totalNumberOfDocs = weightLookupTable.getVocabCache().totalNumberOfDocs();

        String format = "%d %d %d\n";
        String header = String.format(format, numWords, layersSize, totalNumberOfDocs);

        writer.write(header);

        // Accumulate all rows into one string and write it after the loop
        String row = "";
        for (int j = 0; j < weightLookupTable.getVocabCache().words().size(); ++j) {
            String label = weightLookupTable.getVocabCache().wordAtIndex(j);
            row += label + " ";
            int freq = weightLookupTable.getVocabCache().wordFrequency(label);
            int rows = ((InMemoryLookupTable) weightLookupTable).getSyn0().rows();
            int cols = ((InMemoryLookupTable) weightLookupTable).getSyn0().columns();
            row += freq + " " + rows + " " + cols + " ";

            for (int r = 0; r < rows; ++r) {
                //row += " ";
                for (int c = 0; c < cols; ++c) {
                    row += ((InMemoryLookupTable) weightLookupTable).getSyn0().getDouble(r, c) + " ";
                }
                //row += " ";
            }
            row += "\n";
        }
        writer.write(row);
    }
}
Example #21
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0
@Test
public void ParaVec_Correct_WhenDeserialized() {
    INDArray syn0 = Nd4j.rand(DataType.FLOAT, 10, 2),
             syn1 = Nd4j.rand(DataType.FLOAT, 10, 2),
             syn1Neg = Nd4j.rand(DataType.FLOAT, 10, 2);

    InMemoryLookupTable<VocabWord> lookupTable = new InMemoryLookupTable
            .Builder<VocabWord>()
            .useAdaGrad(false)
            .cache(cache)
            .build();

    lookupTable.setSyn0(syn0);
    lookupTable.setSyn1(syn1);
    lookupTable.setSyn1Neg(syn1Neg);

    ParagraphVectors paragraphVectors = new ParagraphVectors.Builder()
            .vocabCache(cache)
            .lookupTable(lookupTable)
            .build();

    Word2Vec deser = null;
    try {
        ByteArrayOutputStream baos = new ByteArrayOutputStream();
        WordVectorSerializer.writeWord2Vec(paragraphVectors, baos);
        byte[] bytesResult = baos.toByteArray();
        deser = WordVectorSerializer.readWord2Vec(new ByteArrayInputStream(bytesResult), true);
    } catch (Exception e) {
        log.error("", e);
        fail();
    }

    assertNotNull(paragraphVectors.getConfiguration());
    assertEquals(paragraphVectors.getConfiguration(), deser.getConfiguration());

    assertEquals(cache.totalWordOccurrences(), deser.vocab().totalWordOccurrences());
    assertEquals(cache.totalNumberOfDocs(), deser.vocab().totalNumberOfDocs());
    assertEquals(cache.numWords(), deser.vocab().numWords());

    for (int i = 0; i < cache.words().size(); ++i) {
        val cached = cache.wordAtIndex(i);
        val restored = deser.vocab().wordAtIndex(i);
        assertNotNull(cached);
        assertEquals(cached, restored);
    }
}
Example #22
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0
@Test(timeout = 300000)
public void testParagraphVectorsDBOW() throws Exception {
    skipUnlessIntegrationTests();

    File file = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(file);

    AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build();

    TokenizerFactory t = new DefaultTokenizerFactory();
    t.setTokenPreProcessor(new CommonPreprocessor());

    LabelsSource source = new LabelsSource("DOC_");

    ParagraphVectors vec = new ParagraphVectors.Builder().minWordFrequency(1).iterations(5).seed(119).epochs(1)
            .layerSize(100).learningRate(0.025).labelsSource(source).windowSize(5).iterate(iter)
            .trainWordVectors(true).vocabCache(cache).tokenizerFactory(t).negativeSample(0)
            .allowParallelTokenization(true).useHierarchicSoftmax(true).sampling(0).workers(4)
            .usePreciseWeightInit(true).sequenceLearningAlgorithm(new DBOW<VocabWord>()).build();

    vec.fit();

    assertFalse(((InMemoryLookupTable<VocabWord>) vec.getLookupTable()).getSyn0().isAttached());
    assertFalse(((InMemoryLookupTable<VocabWord>) vec.getLookupTable()).getSyn1().isAttached());

    int cnt1 = cache.wordFrequency("day");
    int cnt2 = cache.wordFrequency("me");

    assertNotEquals(1, cnt1);
    assertNotEquals(1, cnt2);
    assertNotEquals(cnt1, cnt2);

    double simDN = vec.similarity("day", "night");
    log.info("day/night similarity: {}", simDN);

    double similarity1 = vec.similarity("DOC_9835", "DOC_12492");
    log.info("9835/12492 similarity: " + similarity1);
    // assertTrue(similarity1 > 0.2d);

    double similarity2 = vec.similarity("DOC_3720", "DOC_16392");
    log.info("3720/16392 similarity: " + similarity2);
    // assertTrue(similarity2 > 0.2d);

    double similarity3 = vec.similarity("DOC_6347", "DOC_3720");
    log.info("6347/3720 similarity: " + similarity3);
    // assertTrue(similarity3 > 0.6d);

    double similarityX = vec.similarity("DOC_3720", "DOC_9852");
    log.info("3720/9852 similarity: " + similarityX);
    assertTrue(similarityX < 0.5d);

    // testing DM inference now
    INDArray original = vec.getWordVectorMatrix("DOC_16392").dup();
    INDArray inferredA1 = vec.inferVector("This is my work");
    INDArray inferredB1 = vec.inferVector("This is my work .");
    INDArray inferredC1 = vec.inferVector("This is my day");
    INDArray inferredD1 = vec.inferVector("This is my night");

    log.info("A: {}", Arrays.toString(inferredA1.data().asFloat()));
    log.info("C: {}", Arrays.toString(inferredC1.data().asFloat()));

    assertNotEquals(inferredA1, inferredC1);

    double cosAO1 = Transforms.cosineSim(inferredA1.dup(), original.dup());
    double cosAB1 = Transforms.cosineSim(inferredA1.dup(), inferredB1.dup());
    double cosAC1 = Transforms.cosineSim(inferredA1.dup(), inferredC1.dup());
    double cosCD1 = Transforms.cosineSim(inferredD1.dup(), inferredC1.dup());

    log.info("Cos O/A: {}", cosAO1);
    log.info("Cos A/B: {}", cosAB1);
    log.info("Cos A/C: {}", cosAC1);
    log.info("Cos C/D: {}", cosCD1);
}
Example #23
Source File: CBOW.java From deeplearning4j with Apache License 2.0
public void iterateSample(T currentWord, int[] windowWords, boolean[] wordStatuses, AtomicLong nextRandom,
                          double alpha, boolean isInference, int numLabels, boolean trainWords,
                          INDArray inferenceVector) {
    int[] idxSyn1 = null;
    byte[] codes = null;

    if (configuration.isUseHierarchicSoftmax()) {
        idxSyn1 = new int[currentWord.getCodeLength()];
        codes = new byte[currentWord.getCodeLength()];
        for (int p = 0; p < currentWord.getCodeLength(); p++) {
            if (currentWord.getPoints().get(p) < 0)
                continue;

            codes[p] = currentWord.getCodes().get(p);
            idxSyn1[p] = currentWord.getPoints().get(p);
        }
    } else {
        idxSyn1 = new int[0];
        codes = new byte[0];
    }

    if (negative > 0) {
        if (syn1Neg == null) {
            ((InMemoryLookupTable<T>) lookupTable).initNegative();
            syn1Neg = new DeviceLocalNDArray(((InMemoryLookupTable<T>) lookupTable).getSyn1Neg());
        }
    }

    if (batches.get() == null)
        batches.set(new ArrayList<Aggregate>());

    /*AggregateCBOW(syn0.get(), syn1.get(), syn1Neg.get(), expTable.get(), table.get(),
            currentWord.getIndex(), windowWords, idxSyn1, codes, (int) negative, currentWord.getIndex(),
            lookupTable.layerSize(), alpha, nextRandom.get(), vocabCache.numWords(), numLabels, trainWords,
            inferenceVector);*/

    boolean useHS = configuration.isUseHierarchicSoftmax();
    boolean useNegative = configuration.getNegative() > 0;

    int[] inputStatuses = new int[windowWords.length];
    for (int i = 0; i < windowWords.length; ++i) {
        if (i < wordStatuses.length)
            inputStatuses[i] = wordStatuses[i] ? 1 : 0;
        else
            inputStatuses[i] = -1;
    }
    INDArray wordsStatuses = Nd4j.createFromArray(inputStatuses);

    CbowRound cbow = null;

    if (useHS && useNegative) {
        cbow = new CbowRound(Nd4j.scalar(currentWord.getIndex()), Nd4j.createFromArray(windowWords),
                wordsStatuses, Nd4j.scalar(currentWord.getIndex()), syn0.get(), syn1.get(), syn1Neg.get(),
                expTable.get(), table.get(), Nd4j.createFromArray(idxSyn1), Nd4j.createFromArray(codes),
                (int) negative, Nd4j.scalar(alpha), Nd4j.scalar(nextRandom.get()),
                inferenceVector != null ? inferenceVector : Nd4j.empty(syn0.get().dataType()),
                Nd4j.empty(DataType.INT), trainWords, workers);
    } else if (useHS) {
        cbow = new CbowRound(currentWord.getIndex(), windowWords, wordsStatuses.toIntVector(), syn0.get(),
                syn1.get(), expTable.get(), idxSyn1, codes, alpha, nextRandom.get(),
                inferenceVector != null ? inferenceVector : Nd4j.empty(syn0.get().dataType()), 0);
    } else if (useNegative) {
        cbow = new CbowRound(currentWord.getIndex(), windowWords, wordsStatuses.toIntVector(),
                currentWord.getIndex(), syn0.get(), syn1Neg.get(), expTable.get(), table.get(),
                (int) negative, alpha, nextRandom.get(),
                inferenceVector != null ? inferenceVector : Nd4j.empty(syn0.get().dataType()), 0);
    }

    nextRandom.set(Math.abs(nextRandom.get() * 25214903917L + 11));
    Nd4j.getExecutioner().exec(cbow);

    /*if (!isInference) {
        batches.get().add(cbow);
        if (batches.get().size() > 4096) {
            Nd4j.getExecutioner().exec(batches.get());
            batches.get().clear();
        }
    } else
        Nd4j.getExecutioner().exec(cbow);*/
}
Example #24
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0
@Test
public void weightLookupTable_Correct_WhenDeserialized() throws Exception {
    INDArray syn0 = Nd4j.rand(DataType.FLOAT, 10, 2),
             syn1 = Nd4j.rand(DataType.FLOAT, 10, 2),
             syn1Neg = Nd4j.rand(DataType.FLOAT, 10, 2);

    InMemoryLookupTable<VocabWord> lookupTable = new InMemoryLookupTable
            .Builder<VocabWord>()
            .useAdaGrad(false)
            .cache(cache)
            .build();

    lookupTable.setSyn0(syn0);
    lookupTable.setSyn1(syn1);
    lookupTable.setSyn1Neg(syn1Neg);

    File dir = testDir.newFolder();
    File file = new File(dir, "lookupTable.txt");

    WeightLookupTable<VocabWord> deser = null;
    try {
        WordVectorSerializer.writeLookupTable(lookupTable, file);
        deser = WordVectorSerializer.readLookupTable(file);
    } catch (Exception e) {
        log.error("", e);
        fail();
    }

    assertEquals(lookupTable.getVocab().totalWordOccurrences(),
            ((InMemoryLookupTable<VocabWord>) deser).getVocab().totalWordOccurrences());
    assertEquals(cache.totalNumberOfDocs(),
            ((InMemoryLookupTable<VocabWord>) deser).getVocab().totalNumberOfDocs());
    assertEquals(cache.numWords(), ((InMemoryLookupTable<VocabWord>) deser).getVocab().numWords());

    for (int i = 0; i < cache.words().size(); ++i) {
        val cached = cache.wordAtIndex(i);
        val restored = ((InMemoryLookupTable<VocabWord>) deser).getVocab().wordAtIndex(i);
        assertNotNull(cached);
        assertEquals(cached, restored);
    }

    assertEquals(lookupTable.getSyn0().columns(),
            ((InMemoryLookupTable<VocabWord>) deser).getSyn0().columns());
    assertEquals(lookupTable.getSyn0().rows(),
            ((InMemoryLookupTable<VocabWord>) deser).getSyn0().rows());

    for (int c = 0; c < ((InMemoryLookupTable<VocabWord>) deser).getSyn0().columns(); ++c) {
        for (int r = 0; r < ((InMemoryLookupTable<VocabWord>) deser).getSyn0().rows(); ++r) {
            assertEquals(lookupTable.getSyn0().getDouble(r, c),
                    ((InMemoryLookupTable<VocabWord>) deser).getSyn0().getDouble(r, c), 1e-5);
        }
    }
}
Example #25
Source File: TsneTest.java From deeplearning4j with Apache License 2.0
@Test
public void testSimple() throws Exception {
    // Simple sanity check
    for (int test = 0; test <= 1; test++) {
        boolean syntheticData = test == 1;
        WorkspaceMode wsm = test == 0 ? WorkspaceMode.NONE : WorkspaceMode.ENABLED;
        log.info("Starting test: WSM={}, syntheticData={}", wsm, syntheticData);

        // STEP 1: Initialization
        int iterations = 50;
        // Create an n-dimensional array of doubles
        Nd4j.setDefaultDataTypes(DataType.FLOAT, DataType.FLOAT);
        List<String> cacheList = new ArrayList<>(); // cacheList is a dynamic array of strings used to hold all words

        // STEP 2: Turn text input into a list of words
        INDArray weights;
        if (syntheticData) {
            weights = Nd4j.rand(250, 200);
        } else {
            log.info("Load & Vectorize data....");
            File wordFile = new ClassPathResource("deeplearning4j-tsne/words.txt").getFile(); // Open the file
            // Get the data of all unique word vectors
            Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
            VocabCache cache = vectors.getSecond();
            weights = vectors.getFirst().getSyn0(); // Separate weights of unique words into their own list

            for (int i = 0; i < cache.numWords(); i++) // Separate strings of words into their own list
                cacheList.add(cache.wordAtIndex(i));
        }

        // STEP 3: Build a dual-tree t-SNE to use later
        log.info("Build model....");
        BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                .setMaxIter(iterations)
                .theta(0.5)
                .normalize(false)
                .learningRate(500)
                .useAdaGrad(false)
                .workspaceMode(wsm)
                .build();

        // STEP 4: Establish the t-SNE values and save them to a file
        log.info("Store TSNE Coordinates for Plotting....");
        File outDir = testDir.newFolder();
        tsne.fit(weights);
        tsne.saveAsFile(cacheList, new File(outDir, "out.txt").getAbsolutePath());
    }
}
Example #26
Source File: TsneTest.java From deeplearning4j with Apache License 2.0
@Test
public void testPerformance() throws Exception {
    StopWatch watch = new StopWatch();
    watch.start();

    for (int test = 0; test <= 1; test++) {
        boolean syntheticData = test == 1;
        WorkspaceMode wsm = test == 0 ? WorkspaceMode.NONE : WorkspaceMode.ENABLED;
        log.info("Starting test: WSM={}, syntheticData={}", wsm, syntheticData);

        // STEP 1: Initialization
        int iterations = 50;
        // Create an n-dimensional array of doubles
        Nd4j.setDefaultDataTypes(DataType.FLOAT, DataType.FLOAT);
        List<String> cacheList = new ArrayList<>(); // cacheList is a dynamic array of strings used to hold all words

        // STEP 2: Turn text input into a list of words
        INDArray weights;
        if (syntheticData) {
            weights = Nd4j.rand(DataType.FLOAT, 250, 20);
        } else {
            log.info("Load & Vectorize data....");
            File wordFile = new ClassPathResource("deeplearning4j-tsne/words.txt").getFile(); // Open the file
            // Get the data of all unique word vectors
            Pair<InMemoryLookupTable, VocabCache> vectors = WordVectorSerializer.loadTxt(wordFile);
            VocabCache cache = vectors.getSecond();
            weights = vectors.getFirst().getSyn0(); // Separate weights of unique words into their own list

            for (int i = 0; i < cache.numWords(); i++) // Separate strings of words into their own list
                cacheList.add(cache.wordAtIndex(i));
        }

        // STEP 3: Build a dual-tree t-SNE to use later
        log.info("Build model....");
        BarnesHutTsne tsne = new BarnesHutTsne.Builder()
                .setMaxIter(iterations)
                .theta(0.5)
                .normalize(false)
                .learningRate(500)
                .useAdaGrad(false)
                .workspaceMode(wsm)
                .build();

        // STEP 4: Establish the t-SNE values and save them to a file
        log.info("Store TSNE Coordinates for Plotting....");
        File outDir = testDir.newFolder();
        tsne.fit(weights);
        tsne.saveAsFile(cacheList, new File(outDir, "out.txt").getAbsolutePath());
    }

    watch.stop();
    System.out.println("Elapsed time : " + watch);
}
Example #27
Source File: Word2VecPerformerVoid.java From deeplearning4j with Apache License 2.0
public Word2VecPerformerVoid(SparkConf sc, Broadcast<AtomicLong> wordCount, InMemoryLookupTable weights) {
    this.weights = weights;
    this.wordCount = wordCount;
    setup(sc);
}
Example #28
Source File: Word2VecPerformerVoid.java From deeplearning4j with Apache License 2.0
public InMemoryLookupTable getWeights() {
    return weights;
}
Example #29
Source File: Word2VecPerformerVoid.java From deeplearning4j with Apache License 2.0
public void setWeights(InMemoryLookupTable weights) {
    this.weights = weights;
}