org.deeplearning4j.models.word2vec.Word2Vec Java Examples
The following examples show how to use
org.deeplearning4j.models.word2vec.Word2Vec.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: ManualTests.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test(timeout = 300000) public void testWord2VecPlot() throws Exception { File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath()); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025) .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5) .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10) .tokenizerFactory(t).build(); vec.fit(); // UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo(); // vec.getLookupTable().plotVocab(100, connectionInfo); Thread.sleep(10000000000L); fail("Not implemented"); }
Example #2
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 6 votes |
public static Word2Vec readAsBinaryNoLineBreaks(@NonNull InputStream inputStream) { boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency(); // try to load without linebreaks try { if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(true); } Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq); return readBinaryModel(inputStream, false, false); } catch (Exception readModelException) { log.error("Cannot read binary model", readModelException); throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly"); } }
Example #3
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * This method loads Word2Vec model from binary input stream. * * @param inputStream binary input stream * @return Word2Vec */ public static Word2Vec readAsBinary(@NonNull InputStream inputStream) { boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency(); // we fallback to trying binary model instead try { log.debug("Trying binary model restoration..."); if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(true); } Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq); return readBinaryModel(inputStream, true, false); } catch (Exception readModelException) { throw new RuntimeException(readModelException); } }
Example #4
Source File: ChineseTokenizerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Ignore @Test public void testFindNamesFromText() throws IOException { SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt"); log.info("load is right!"); TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory(); //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer()); //Generates a word-vector from the dataset stored in resources folder Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42) .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build(); vec.fit(); WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt")); //trains a model that can find out all names from news(Suffix txt),It uses word vector generated // WordVectors wordVectors; //test model,Whether the model find out name from unknow text; }
Example #5
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * This method loads Word2Vec model from input stream * * @param stream InputStream * @param readExtendedTable boolean * @return Word2Vec */ public static Word2Vec readWord2Vec( @NonNull InputStream stream, boolean readExtendedTable) throws IOException { SequenceVectors<VocabWord> vectors = readSequenceVectors(stream, readExtendedTable); Word2Vec word2Vec = new Word2Vec .Builder(vectors.getConfiguration()) .layerSize(vectors.getLayerSize()) .build(); word2Vec.setVocab(vectors.getVocab()); word2Vec.setLookupTable(vectors.lookupTable()); word2Vec.setModelUtils(vectors.getModelUtils()); return word2Vec; }
Example #6
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test @Ignore public void testWriteWordVectorsFromWord2Vec() throws IOException { WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true); WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto); WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto)); INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman"); INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano"); assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1); assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2); assertTrue(wordVector1.length() == 300); assertTrue(wordVector2.length() == 300); assertEquals(wordVector1.getDouble(0), 0.044423, 1e-3); assertEquals(wordVector2.getDouble(0), 0.051964, 1e-3); }
Example #7
Source File: Word2VecIteratorTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testLabeledExample() throws Exception { INDArray unk = vec.getWordVectorMatrix(Word2Vec.DEFAULT_UNK); assertNotEquals(null, unk); unk = vec.getWordVectorMatrix("2131241sdasdas"); assertNotEquals(null, unk); ClassPathResource resource = new ClassPathResource("/labeled/"); File dir = testDir.newFolder(); resource.copyDirectory(dir); Word2VecDataSetIterator iter = new Word2VecDataSetIterator(vec, new LabelAwareFileSentenceIterator(null, dir), Arrays.asList("negative", "positive", "neutral")); DataSet next = iter.next(); }
Example #8
Source File: Word2VecIteratorTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Before public void before() throws Exception { if (vec == null) { ClassPathResource resource = new ClassPathResource("/labeled/"); File dir = testDir.newFolder(); resource.copyDirectory(dir); SentenceIterator iter = UimaSentenceIterator.createWithPath(dir.getAbsolutePath()); new File("cache.ser").delete(); TokenizerFactory t = new UimaTokenizerFactory(); vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100) .stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter) .tokenizerFactory(t).build(); vec.fit(); } }
Example #9
Source File: VectorsConfigurationTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test(timeout = 300000) public void testFromW2V() throws Exception { VectorsConfiguration configuration = new VectorsConfiguration(); configuration.setHugeModelExpected(true); configuration.setWindow(5); configuration.setIterations(3); configuration.setLayersSize(200); configuration.setLearningRate(1.4d); configuration.setSampling(0.0005d); configuration.setMinLearningRate(0.25d); configuration.setEpochs(1); File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath()); Word2Vec vec = new Word2Vec.Builder(configuration).iterate(iter).build(); VectorsConfiguration configuration2 = vec.getConfiguration(); assertEquals(configuration, configuration2); }
Example #10
Source File: PerformanceTests.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Ignore @Test public void testWord2VecCBOWBig() throws Exception { SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt"); //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt"); //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt"); TokenizerFactory t = new KoreanTokenizerFactory(); //t = new DefaultTokenizerFactory(); //t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150) .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5) .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8) .allowParallelTokenization(true).tokenizerFactory(t) .elementsLearningAlgorithm(new CBOW<VocabWord>()).build(); long time1 = System.currentTimeMillis(); vec.fit(); long time2 = System.currentTimeMillis(); log.info("Total execution time: {}", (time2 - time1)); }
Example #11
Source File: Word2VecCN.java From word2vec with Apache License 2.0 | 6 votes |
public Word2Vec fit() { log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder() .minWordFrequency(minWordFrequency) .iterations(iterations) .layerSize(layerSize) .seed(seed) .windowSize(windowSize) .iterate(sentenceIterator) .tokenizerFactory(tokenizerFactory) .build(); log.info("Fitting Word2Vec model...."); vec.fit(); return vec; }
Example #12
Source File: WindowConverter.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * Converts a window (each word in the window) * * in to a vector. * * Keep in mind each window is a multi word context. * * From there, each word uses the passed in model * as a lookup table to get what vectors are relevant * to the passed in windows * @param window the window to take in. * @param vec the model to use as a lookup table * @return a concacneated 1 row array * containing all of the numbers for each word in the window */ public static INDArray asExampleArray(Window window, Word2Vec vec, boolean normalize) { int length = vec.lookupTable().layerSize(); List<String> words = window.getWords(); int windowSize = vec.getWindow(); Preconditions.checkState(words.size() == vec.getWindow()); INDArray ret = Nd4j.create(1, length * windowSize); for (int i = 0; i < words.size(); i++) { String word = words.get(i); INDArray n = normalize ? vec.getWordVectorMatrixNormalized(word) : vec.getWordVectorMatrix(word); ret.put(new INDArrayIndex[] {NDArrayIndex.interval(i * vec.lookupTable().layerSize(), i * vec.lookupTable().layerSize() + vec.lookupTable().layerSize())}, n); } return ret; }
Example #13
Source File: FastTextTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testWordsStatistics() throws IOException { File output = testDir.newFile(); FastText fastText = FastText .builder() .supervised(true) .inputFile(inputFile.getAbsolutePath()) .outputFile(output.getAbsolutePath()) .build(); log.info("\nTraining supervised model ...\n"); fastText.fit(); File file = new File(output.getAbsolutePath() + ".vec"); Word2Vec word2Vec = WordVectorSerializer.readAsCsv(file); assertEquals(48, word2Vec.getVocab().numWords()); assertEquals("", 0.1667751520872116, word2Vec.similarity("Football", "teams"), 2e-3); assertEquals("", 0.10083991289138794, word2Vec.similarity("professional", "minutes"), 2e-3); assertEquals("", Double.NaN, word2Vec.similarity("java","cpp"), 0.0); assertThat(word2Vec.wordsNearest("association", 3), hasItems("Football", "Soccer", "men's")); }
Example #14
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method just loads full compressed model. */ private static Word2Vec readAsExtendedModel(@NonNull File file) throws IOException { int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency(); boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); log.debug("Trying full model restoration..."); if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(true); } Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq); return readWord2Vec(file); }
Example #15
Source File: GoogleNewsVectorExample.java From Java-Deep-Learning-Cookbook with MIT License | 5 votes |
public static void main(String[] args) { try{ File file = new File("{PATH-TO-GOOGLE-WORD-VECTOR}"); Word2Vec model = WordVectorSerializer.readWord2VecModel(file); System.out.println(Arrays.asList(model.wordsNearest("season",10))); } catch(ND4JIllegalStateException e){ System.out.println("Please provide proper directory path in place of: PATH-TO-GOOGLE-WORD-VECTOR"); } }
Example #16
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method * 1) Binary model, either compressed or not. Like well-known Google Model * 2) Popular CSV word2vec text format * 3) DL4j compressed format * <p> * Please note: if extended data isn't available, only weights will be loaded instead. * * @param file model file * @param extendedModel if TRUE, we'll try to load HS states & Huffman tree info, if FALSE, only weights will be loaded * @return word2vec model */ public static Word2Vec readWord2VecModel(File file, boolean extendedModel) { if (!file.exists() || !file.isFile()) { throw new ND4JIllegalStateException("File [" + file.getAbsolutePath() + "] doesn't exist"); } boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(false); } Nd4j.getMemoryManager().setOccasionalGcFrequency(50000); try { return readWord2Vec(file, extendedModel); } catch (Exception readSequenceVectors) { try { return extendedModel ? readAsExtendedModel(file) : readAsSimplifiedModel(file); } catch (Exception loadFromFileException) { try { return readAsCsv(file); } catch (Exception readCsvException) { try { return readAsBinary(file); } catch (Exception readBinaryException) { try { return readAsBinaryNoLineBreaks(file); } catch (Exception readModelException) { log.error("Unable to guess input file format", readModelException); throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly"); } } } } } }
Example #17
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static Word2Vec readAsBinaryNoLineBreaks(@NonNull File file) { try (InputStream inputStream = fileStream(file)) { return readAsBinaryNoLineBreaks(inputStream); } catch (IOException readCsvException) { throw new RuntimeException(readCsvException); } }
Example #18
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * Write the tsne format * * @param vec the word vectors to use for labeling * @param tsne the tsne array to write * @param csv the file to use * @throws Exception */ public static void writeTsneFormat(Word2Vec vec, INDArray tsne, File csv) throws Exception { try (BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), StandardCharsets.UTF_8))) { int words = 0; InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab(); for (String word : vec.vocab().words()) { if (word == null) { continue; } StringBuilder sb = new StringBuilder(); INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex()); for (int j = 0; j < wordVector.length(); j++) { sb.append(wordVector.getDouble(j)); if (j < wordVector.length() - 1) { sb.append(","); } } sb.append(","); sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT)); sb.append(" "); sb.append("\n"); write.write(sb.toString()); } log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize()); } }
Example #19
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static Word2Vec readAsBinary(@NonNull File file) { try (InputStream inputStream = fileStream(file)) { return readAsBinary(inputStream); } catch (IOException readCsvException) { throw new RuntimeException(readCsvException); } }
Example #20
Source File: Word2VecDataSetIteratorTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * Basically all we want from this test - being able to finish without exceptions. */ @Test public void testIterator1() throws Exception { File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile); // SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath()); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0) .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()) .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t) .elementsLearningAlgorithm(new CBOW<VocabWord>()).build(); vec.fit(); List<String> labels = new ArrayList<>(); labels.add("positive"); labels.add("negative"); Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1); INDArray array = iterator.next().getFeatures(); int count = 0; while (iterator.hasNext()) { DataSet ds = iterator.next(); assertArrayEquals(array.shape(), ds.getFeatures().shape()); if(!isIntegrationTests() && count++ > 20) break; //raw_sentences.txt is 2.81 MB, takes quite some time to process. We'll only first 20 minibatches when doing unit tests } }
Example #21
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static Word2Vec readAsCsv(@NonNull File file) { try (InputStream inputStream = fileStream(file)) { return readAsCsv(inputStream); } catch (IOException readCsvException) { throw new RuntimeException(readCsvException); } }
Example #22
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test(timeout = 300000) public void testJSONSerialization() { ParagraphVectors paragraphVectors = new ParagraphVectors.Builder().build(); AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build(); val words = new VocabWord[3]; words[0] = new VocabWord(1.0, "word"); words[1] = new VocabWord(2.0, "test"); words[2] = new VocabWord(3.0, "tester"); for (int i = 0; i < words.length; ++i) { cache.addToken(words[i]); cache.addWordToIndex(i, words[i].getLabel()); } paragraphVectors.setVocab(cache); String json = null; Word2Vec unserialized = null; try { json = paragraphVectors.toJson(); log.info("{}", json.toString()); unserialized = ParagraphVectors.fromJson(json); } catch (Exception e) { log.error("",e); fail(); } assertEquals(cache.totalWordOccurrences(), ((ParagraphVectors) unserialized).getVocab().totalWordOccurrences()); assertEquals(cache.totalNumberOfDocs(), ((ParagraphVectors) unserialized).getVocab().totalNumberOfDocs()); for (int i = 0; i < words.length; ++i) { val cached = cache.wordAtIndex(i); val restored = ((ParagraphVectors) unserialized).getVocab().wordAtIndex(i); assertNotNull(cached); assertEquals(cached, restored); } }
Example #23
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method loads Word2Vec model from csv file * * @param inputStream input stream * @return Word2Vec model */ public static Word2Vec readAsCsv(@NonNull InputStream inputStream) { VectorsConfiguration configuration = new VectorsConfiguration(); // let's try to load this file as csv file try { log.debug("Trying CSV model restoration..."); Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(inputStream); Word2Vec.Builder builder = new Word2Vec .Builder() .lookupTable(pair.getFirst()) .useAdaGrad(false) .vocabCache(pair.getSecond()) .layerSize(pair.getFirst().layerSize()) // we don't use hs here, because model is incomplete .useHierarchicSoftmax(false) .resetModel(false); TokenizerFactory factory = getTokenizerFactory(configuration); if (factory != null) { builder.tokenizerFactory(factory); } return builder.build(); } catch (Exception ex) { throw new RuntimeException("Unable to load model in CSV format"); } }
Example #24
Source File: Word2VecDataFetcher.java From deeplearning4j with Apache License 2.0 | 5 votes |
public Word2VecDataFetcher(String path, Word2Vec vec, List<String> labels) { if (vec == null || labels == null || labels.isEmpty()) throw new IllegalArgumentException( "Unable to initialize due to missing argument or empty label applyTransformToDestination"); this.vec = vec; this.labels = labels; this.path = path; }
Example #25
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method saves Word2Vec model to output stream * * @param word2Vec Word2Vec * @param stream OutputStream */ public static void writeWord2Vec(@NonNull Word2Vec word2Vec, @NonNull OutputStream stream) throws IOException { SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(word2Vec.getConfiguration()) .layerSize(word2Vec.getLayerSize()).build(); vectors.setVocab(word2Vec.getVocab()); vectors.setLookupTable(word2Vec.getLookupTable()); vectors.setModelUtils(word2Vec.getModelUtils()); writeSequenceVectors(vectors, stream); }
Example #26
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method loads Word2Vec model from file * * @param file File * @param readExtendedTables boolean * @return Word2Vec */ public static Word2Vec readWord2Vec(@NonNull File file, boolean readExtendedTables) { try (InputStream inputStream = fileStream(file)) { return readWord2Vec(inputStream, readExtendedTables); } catch (Exception readSequenceVectors) { throw new RuntimeException(readSequenceVectors); } }
Example #27
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test @Ignore //AB 2020/02/06 - https://github.com/eclipse/deeplearning4j/issues/8677 public void testDirectInference() throws Exception { boolean isIntegration = isIntegrationTests(); File resource = Resources.asFile("/big/raw_sentences.txt"); SentenceIterator sentencesIter = getIterator(isIntegration, resource); ClassPathResource resource_mixed = new ClassPathResource("paravec/"); File local_resource_mixed = testDir.newFolder(); resource_mixed.copyDirectory(local_resource_mixed); SentenceIterator iter = new AggregatingSentenceIterator.Builder() .addSentenceIterator(sentencesIter) .addSentenceIterator(new FileSentenceIterator(local_resource_mixed)).build(); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(1) .learningRate(0.025).layerSize(150).minLearningRate(0.001) .elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true).windowSize(5) .iterate(iter).tokenizerFactory(t).build(); wordVectors.fit(); ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10) .useHierarchicSoftmax(true).trainWordVectors(true).useExistingWordVectors(wordVectors) .negativeSample(0).sequenceLearningAlgorithm(new DM<VocabWord>()).build(); INDArray vec1 = pv.inferVector("This text is pretty awesome"); INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes"); log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2)); }
Example #28
Source File: WordConverter.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static INDArray toInputMatrix(List<Window> windows, Word2Vec vec) { int columns = vec.lookupTable().layerSize() * vec.getWindow(); int rows = windows.size(); INDArray ret = Nd4j.create(rows, columns); for (int i = 0; i < rows; i++) { ret.putRow(i, WindowConverter.asExampleMatrix(windows.get(i), vec)); } return ret; }
Example #29
Source File: InMemoryLookupTable.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * @param word * @return */ @Override public INDArray vector(String word) { if (word == null) return null; int idx = vocab.indexOf(word); if (idx < 0) { idx = vocab.indexOf(Word2Vec.DEFAULT_UNK); if (idx < 0) return null; } return syn0.getRow(idx, true); }
Example #30
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * Writes the word vectors to the given BufferedWriter. Note that this assumes an in memory cache. * BufferedWriter can be writer to local file, or hdfs file, or any compatible to java target. * * @param vec the word2vec to write * @param writer - BufferedWriter, where all data should be written to * the path to write * @deprecated Use {@link #writeWord2Vec(Word2Vec, OutputStream)} */ @Deprecated public static void writeWordVectors(@NonNull Word2Vec vec, @NonNull BufferedWriter writer) throws IOException { int words = 0; String str = vec.getVocab().numWords() + " " + vec.getLayerSize() + " " + vec.getVocab().totalNumberOfDocs(); log.debug("Saving header: {}", str); writer.write(str + "\n"); for (String word : vec.vocab().words()) { if (word == null) { continue; } StringBuilder sb = new StringBuilder(); sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT)); sb.append(" "); INDArray wordVector = vec.getWordVectorMatrix(word); for (int j = 0; j < wordVector.length(); j++) { sb.append(wordVector.getDouble(j)); if (j < wordVector.length() - 1) { sb.append(" "); } } sb.append("\n"); writer.write(sb.toString()); words++; } try { writer.flush(); } catch (Exception e) { } log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize()); }