org.deeplearning4j.models.embeddings.wordvectors.WordVectors Java Examples
The following examples show how to use
org.deeplearning4j.models.embeddings.wordvectors.WordVectors.
You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
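Before the examples, here is a minimal quick-start sketch of the typical WordVectors workflow; the model path and the query words are placeholders chosen for illustration, not taken from any example below.

import java.io.File;
import java.util.Collection;

import org.deeplearning4j.models.embeddings.loader.WordVectorSerializer;
import org.deeplearning4j.models.embeddings.wordvectors.WordVectors;

public class WordVectorsQuickStart {
  public static void main(String[] args) {
    // Load a pre-trained word2vec model (path is a placeholder)
    WordVectors vec = WordVectorSerializer.readWord2VecModel(new File("/path/to/vectors.bin"));

    // Raw embedding for a single word (null if the word is not in the vocabulary)
    double[] vector = vec.getWordVector("day");

    // Cosine similarity between two words
    double sim = vec.similarity("day", "night");

    // Nearest neighbours in the embedding space
    Collection<String> nearest = vec.wordsNearest("day", 10);

    System.out.println(vector.length + " " + sim + " " + nearest);
  }
}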
Example #1
Source File: CnnSentenceDataSetIterator.java From wekaDeeplearning4j with GNU General Public License v3.0 | 6 votes |
/**
 * Constructor that uses {@link Builder} extended with stopwords.
 *
 * @param builder Builder
 */
protected CnnSentenceDataSetIterator(CnnSentenceDataSetIterator.Builder builder) {
  super(builder);
  this.stopwords = builder.stopwords;
  setUnknownWordHandling(UnknownWordHandling.UseUnknownVector);

  // Set unknown word
  WordVectors wordVectors = getWordVectors();
  wordVectors.setUNK("UNKNOWN");

  // Initialize unknown word manually
  INDArray unknown;
  if (getUseNormalizedWordVectors()) {
    unknown = wordVectors.getWordVectorMatrixNormalized(wordVectors.getUNK());
  } else {
    unknown = wordVectors.getWordVectorMatrix(wordVectors.getUNK());
  }
  setUnknown(unknown);
}
Example #2
Source File: RnnTextEmbeddingDataSetIterator.java From wekaDeeplearning4j with GNU General Public License v3.0 | 6 votes |
/**
 * Constructor with necessary objects to create RNN features.
 *
 * @param data Instances with documents and labels
 * @param wordVectors WordVectors object
 * @param tokenFact Tokenizer factory
 * @param tpp Token pre processor
 * @param stopWords Stop word object
 * @param sentenceProvider Labeled sentence provider
 * @param batchSize Size of each minibatch for training
 * @param truncateLength If reviews exceed this length, they are truncated
 */
public RnnTextEmbeddingDataSetIterator(
    Instances data,
    WordVectors wordVectors,
    TokenizerFactory tokenFact,
    TokenPreProcess tpp,
    AbstractStopwords stopWords,
    LabeledSentenceProvider sentenceProvider,
    int batchSize,
    int truncateLength) {
  this.batchSize = batchSize;
  this.wordVectorSize = wordVectors.getWordVector(wordVectors.vocab().wordAtIndex(0)).length;
  this.data = data;
  this.wordVectors = wordVectors;
  this.truncateLength = truncateLength;
  this.tokenizerFactory = tokenFact;
  this.tokenizerFactory.getBackend().setTokenPreProcessor(tpp.getBackend());
  this.stopWords = stopWords;
  this.sentenceProvider = sentenceProvider;
}
Example #3
Source File: CnnWord2VecSentenceClassificationExample.java From Java-Deep-Learning-Cookbook with MIT License | 6 votes |
private static DataSetIterator getDataSetIterator(boolean isTraining, WordVectors wordVectors, int minibatchSize,
                                                  int maxSentenceLength, Random rng) {
  String path = FilenameUtils.concat(DATA_PATH, (isTraining ? "aclImdb/train/" : "aclImdb/test/"));
  String positiveBaseDir = FilenameUtils.concat(path, "pos");
  String negativeBaseDir = FilenameUtils.concat(path, "neg");
  File filePositive = new File(positiveBaseDir);
  File fileNegative = new File(negativeBaseDir);

  Map<String, List<File>> reviewFilesMap = new HashMap<>();
  reviewFilesMap.put("Positive", Arrays.asList(filePositive.listFiles()));
  reviewFilesMap.put("Negative", Arrays.asList(fileNegative.listFiles()));

  LabeledSentenceProvider sentenceProvider = new FileLabeledSentenceProvider(reviewFilesMap, rng);
  return new CnnSentenceDataSetIterator.Builder(CnnSentenceDataSetIterator.Format.CNN2D)
      .sentenceProvider(sentenceProvider)
      .wordVectors(wordVectors)
      .minibatchSize(minibatchSize)
      .maxSentenceLength(maxSentenceLength)
      .useNormalizedWordVectors(false)
      .build();
}
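A hypothetical call site for the helper above might look like the following; WORD_VECTORS_PATH, the batch size, the sentence length and the seed are assumptions made purely for illustration.

WordVectors wordVectors = WordVectorSerializer.loadStaticModel(new File(WORD_VECTORS_PATH)); // WORD_VECTORS_PATH is assumed
DataSetIterator trainIterator = getDataSetIterator(true, wordVectors, 32, 256, new Random(12345));
DataSetIterator testIterator = getDataSetIterator(false, wordVectors, 32, 256, new Random(12345));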
Example #4
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
  WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
  InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
  InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
  WordVectorSerializer.writeWordVectors(lookupTable, lookupCache, pathToWriteto);

  WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
  double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
  double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
  assertTrue(wordVector1.length == 300);
  assertTrue(wordVector2.length == 300);
  assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
  assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
Example #5
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
  WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true);
  WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

  WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
  INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
  INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");
  assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
  assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);
  assertTrue(wordVector1.length() == 300);
  assertTrue(wordVector2.length() == 300);
  assertEquals(wordVector1.getDouble(0), 0.044423, 1e-3);
  assertEquals(wordVector2.getDouble(0), 0.051964, 1e-3);
}
Example #6
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test
@Ignore
public void testFromTableAndVocab() throws IOException {
  WordVectors vec = WordVectorSerializer.readWord2VecModel(textFile);
  InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
  InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();

  WordVectors wordVectors = WordVectorSerializer.fromTableAndVocab(lookupTable, lookupCache);
  double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
  double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");
  assertTrue(wordVector1.length == 300);
  assertTrue(wordVector2.length == 300);
  assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
  assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
Example #7
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * This method tests binary file loading as static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderBinary() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
  WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(binaryFile);

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
  INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);
}
Example #8
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * This method tests ZIP file loading as static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderArchive() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

  WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
  WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(w2v);

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
  INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("night");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);
}
Example #9
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test
public void testUnifiedLoaderArchive1() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

  WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
  WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, false);

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
  INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);

  assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
  assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1Neg());
}
Example #10
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test
public void testUnifiedLoaderArchive2() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  File w2v = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

  WordVectors vectorsLive = WordVectorSerializer.readWord2Vec(w2v);
  WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(w2v, true);

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("night");
  INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("night");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);

  assertNotEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
Example #11
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * This method tests CSV file loading via unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderText() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
  WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true);

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
  INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);

  // we're trying EXTENDED model, but file doesn't have syn1/huffman info, so it should be silently degraded to simplified model
  assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1());
}
Example #12
Source File: Windows.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Constructs a list of window of size windowSize.
 * Note that padding for each window is created as well.
 *
 * @param words the words to tokenize and construct windows from
 * @param tokenizerFactory tokenizer factory to use
 * @param windowSize the window size to generate
 * @return the list of windows for the tokenized string
 */
public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize,
                                   WordVectors vectors) {
  Tokenizer tokenizer = tokenizerFactory.create(words);
  List<String> list = new ArrayList<>();
  while (tokenizer.hasMoreTokens()) {
    String token = tokenizer.nextToken();

    // if we don't have UNK word defined - we have to skip this word
    if (vectors.getWordVectorMatrix(token) != null)
      list.add(token);
  }

  if (list.isEmpty())
    throw new IllegalStateException("No tokens found for windows");

  return windows(list, windowSize);
}
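As a rough usage sketch of the helper above, where the sentence, window size and tokenizer factory are assumed for illustration:

TokenizerFactory tf = new DefaultTokenizerFactory();
WordVectors vec = WordVectorSerializer.readWord2VecModel(new File("/path/to/vectors.bin")); // placeholder path
List<Window> windows = Windows.windows("the quick brown fox jumps over the lazy dog", tf, 5, vec);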
Example #13
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Ignore
@Test
public void testGoogleModelForInference() throws Exception {
  WordVectors googleVectors =
      WordVectorSerializer.readWord2VecModel(new File("/ext/GoogleNews-vectors-negative300.bin.gz"));

  TokenizerFactory t = new DefaultTokenizerFactory();
  t.setTokenPreProcessor(new CommonPreprocessor());

  ParagraphVectors pv =
      new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10).useHierarchicSoftmax(false)
          .trainWordVectors(false).iterations(10).useExistingWordVectors(googleVectors)
          .negativeSample(10).sequenceLearningAlgorithm(new DM<VocabWord>()).build();

  INDArray vec1 = pv.inferVector("This text is pretty awesome");
  INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes");

  log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2));
}
Example #14
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * This method restores previously saved w2v model. File can be in one of the following formats:
 * 1) Binary model, either compressed or not. Like well-known Google Model
 * 2) Popular CSV word2vec text format
 * 3) DL4j compressed format
 *
 * In return you get StaticWord2Vec model, which might be used as lookup table only in multi-gpu environment.
 *
 * @param inputStream InputStream should point to previously saved w2v model
 * @return
 */
public static WordVectors loadStaticModel(InputStream inputStream) throws IOException {

  File tmpFile = DL4JFileUtils.createTempFile("word2vec" + System.currentTimeMillis(), ".tmp");
  FileUtils.copyInputStreamToFile(inputStream, tmpFile);
  try {
    return loadStaticModel(tmpFile);
  } finally {
    tmpFile.delete();
  }
}
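A minimal sketch of calling the overload above from a stream; the resource path and the probe word are placeholders:

try (InputStream is = new FileInputStream("/path/to/GoogleNews-vectors-negative300.bin.gz")) {
  WordVectors staticVectors = WordVectorSerializer.loadStaticModel(is);
  INDArray vector = staticVectors.getWordVectorMatrix("day");
}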
Example #15
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * This method tests binary file loading via unified loader
 *
 * @throws Exception
 */
@Test
public void testUnifiedLoaderBinary() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
  WordVectors vectorsStatic = WordVectorSerializer.readWord2VecModel(binaryFile, false);

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
  INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);
}
Example #16
Source File: FlatModelUtilsTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
private static void printWords(String target, Collection<String> list, WordVectors vec) {
  System.out.println("Words close to [" + target + "]:");
  for (String word : list) {
    double sim = vec.similarity(target, word);
    System.out.print("'" + word + "': [" + sim + "]");
  }
  System.out.print("\n");
}
Example #17
Source File: Word2VecTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
@Ignore
public void testPortugeseW2V() throws Exception {
  WordVectors word2Vec = WordVectorSerializer.loadTxtVectors(new File("/ext/Temp/para.txt"));
  word2Vec.setModelUtils(new FlatModelUtils());

  Collection<String> portu = word2Vec.wordsNearest("carro", 10);
  printWords("carro", portu, word2Vec);

  portu = word2Vec.wordsNearest("davi", 10);
  printWords("davi", portu, word2Vec);
}
Example #18
Source File: Word2VecTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
private static void printWords(String target, Collection<String> list, WordVectors vec) {
  System.out.println("Words close to [" + target + "]:");
  for (String word : list) {
    double sim = vec.similarity(target, word);
    System.out.print("'" + word + "': [" + sim + "], ");
  }
  System.out.print("\n");
}
Example #19
Source File: ParagraphVectors.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * This method allows you to use pre-built WordVectors model (e.g. Word2Vec) for ParagraphVectors.
 * Existing model will be transferred into new model before training starts.
 *
 * PLEASE NOTE: Non-normalized model is recommended to use here.
 *
 * @param vec existing WordVectors model
 * @return
 */
@Override
@SuppressWarnings("unchecked")
public Builder useExistingWordVectors(@NonNull WordVectors vec) {
  if (((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1() == null
      && ((InMemoryLookupTable<VocabWord>) vec.lookupTable()).getSyn1Neg() == null)
    throw new ND4JIllegalStateException("Model being passed as existing has no syn1/syn1Neg available");

  this.existingVectors = vec;
  return this;
}
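A minimal builder sketch, assuming a word2vec model file on disk, of how this method is typically wired in (compare the ParagraphVectors inference example above):

WordVectors w2v = WordVectorSerializer.readWord2VecModel(new File("/path/to/word2vec.bin")); // placeholder path

ParagraphVectors vectors = new ParagraphVectors.Builder()
    .useExistingWordVectors(w2v)   // fails fast if the model carries no syn1/syn1Neg weights
    .trainWordVectors(false)
    .tokenizerFactory(new DefaultTokenizerFactory())
    .build();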
Example #20
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * This method tests CSV file loading as static model
 *
 * @throws Exception
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderText() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile);
  WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(textFile);

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
  INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);
}
Example #21
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderFromStream() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);
  WordVectors vectorsStatic = WordVectorSerializer.loadStaticModel(new FileInputStream(binaryFile));

  INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
  INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

  assertNotEquals(null, arrayLive);
  assertEquals(arrayLive, arrayStatic);
}
Example #22
Source File: CnnTextFilesEmbeddingInstanceIteratorTest.java From wekaDeeplearning4j with GNU General Public License v3.0 | 5 votes |
public Instances makeData() throws Exception {
  final Instances data = TestUtil.makeTestDataset(42, 100, 0, 0, 1, 0, 0, 1, Attribute.NUMERIC, 1, false);

  WordVectors wordVectors = WordVectorSerializer.loadStaticModel(DatasetLoader.loadGoogleNewsVectors());
  String[] words = (String[]) wordVectors.vocab().words().toArray(new String[0]);

  Random rand = new Random(42);
  for (Instance inst : data) {
    StringBuilder sentence = new StringBuilder();
    for (int i = 0; i < 10; i++) {
      final int idx = rand.nextInt(words.length);
      sentence.append(" ").append(words[idx]);
    }
    inst.setValue(0, sentence.toString());
  }
  return data;
}
Example #23
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
@Ignore
public void testLoader() throws Exception {
  WordVectors vec = WordVectorSerializer.loadTxtVectors(new File("/home/raver119/Downloads/_vectors.txt"));

  logger.info("Rewinding: " + Arrays.toString(vec.getWordVector("rewinding")));
}
Example #24
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testIndexPersistence() throws Exception {
  File inputFile = Resources.asFile("big/raw_sentences.txt");
  SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath());

  // Split on white spaces in the line to get words
  TokenizerFactory t = new DefaultTokenizerFactory();
  t.setTokenPreProcessor(new CommonPreprocessor());

  Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100)
      .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5)
      .iterate(iter).tokenizerFactory(t).build();

  vec.fit();

  VocabCache orig = vec.getVocab();

  File tempFile = File.createTempFile("temp", "w2v");
  tempFile.deleteOnExit();

  WordVectorSerializer.writeWordVectors(vec, tempFile);

  WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile);

  VocabCache rest = vec2.vocab();

  assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs());

  for (VocabWord word : vec.getVocab().vocabWords()) {
    INDArray array1 = vec.getWordVectorMatrix(word.getLabel());
    INDArray array2 = vec2.getWordVectorMatrix(word.getLabel());
    assertEquals(array1, array2);
  }
}
Example #25
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
public void testLoaderBinary() throws IOException {
  WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
  assertEquals(vec.vocab().numWords(), 30);
  assertTrue(vec.vocab().hasToken("Morgan_Freeman"));
  assertTrue(vec.vocab().hasToken("JA_Montalbano"));
  double[] wordVector1 = vec.getWordVector("Morgan_Freeman");
  double[] wordVector2 = vec.getWordVector("JA_Montalbano");
  assertTrue(wordVector1.length == 300);
  assertTrue(wordVector2.length == 300);
  assertEquals(Doubles.asList(wordVector1).get(0), 0.044423, 1e-3);
  assertEquals(Doubles.asList(wordVector2).get(0), 0.051964, 1e-3);
}
Example #26
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * This method here is only to test the real google model, a few gigabytes worth.
 * Keep it ignored, since it requires the full google model being present in the system, which is 1.6gb compressed
 *
 * @throws Exception
 */
@Test
@Ignore
public void testStaticLoaderGoogleModel() throws Exception {
  logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

  long time1 = System.currentTimeMillis();
  WordVectors vectors = WordVectorSerializer
      .loadStaticModel(new File("C:\\Users\\raver\\develop\\GoogleNews-vectors-negative300.bin.gz"));
  long time2 = System.currentTimeMillis();

  logger.info("Loading time: {} ms", (time2 - time1));
}
Example #27
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test
public void testLoaderStream() throws IOException {
  WordVectors vec = WordVectorSerializer.readWord2VecModel(textFile);

  assertEquals(vec.vocab().numWords(), 30);
  assertTrue(vec.vocab().hasToken("Morgan_Freeman"));
  assertTrue(vec.vocab().hasToken("JA_Montalbano"));
}