Java Code Examples for org.deeplearning4j.models.embeddings.wordvectors.WordVectors#getWordVectorMatrix()
The following examples show how to use
org.deeplearning4j.models.embeddings.wordvectors.WordVectors#getWordVectorMatrix().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: CnnSentenceDataSetIterator.java From wekaDeeplearning4j with GNU General Public License v3.0 | 6 votes |
/** * Constructor that uses {@link Builder} extended with stopwords. * * @param builder Builder */ protected CnnSentenceDataSetIterator(CnnSentenceDataSetIterator.Builder builder) { super(builder); this.stopwords = builder.stopwords; setUnknownWordHandling(UnknownWordHandling.UseUnknownVector); // Set unknown word WordVectors wordVectors = getWordVectors(); wordVectors.setUNK("UNKNOWN"); // Initialize unknown word manually INDArray unknown; if (getUseNormalizedWordVectors()) { unknown = wordVectors.getWordVectorMatrixNormalized(wordVectors.getUNK()); } else { unknown = wordVectors.getWordVectorMatrix(wordVectors.getUNK()); } setUnknown(unknown); }
Example 2
Source File: CnnSentenceDataSetIterator.java From wekaDeeplearning4j with GNU General Public License v3.0 | 6 votes |
/** * Constructor that uses {@link Builder} extended with stopwords. * * @param builder Builder */ protected CnnSentenceDataSetIterator(CnnSentenceDataSetIterator.Builder builder) { super(builder); this.stopwords = builder.stopwords; setUnknownWordHandling(UnknownWordHandling.UseUnknownVector); // Set unknown word WordVectors wordVectors = getWordVectors(); wordVectors.setUNK("UNKNOWN"); // Initialize unknown word manually INDArray unknown; if (getUseNormalizedWordVectors()) { unknown = wordVectors.getWordVectorMatrixNormalized(wordVectors.getUNK()); } else { unknown = wordVectors.getWordVectorMatrix(wordVectors.getUNK()); } setUnknown(unknown); }
Example 3
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Reads a model from {@code binaryFile}, writes it out with
 * {@code writeWordVectors}, reloads it with {@code loadTxtVectors} and checks
 * that the vectors for two known words match the source model, have length
 * 300 and start with the expected first component.
 *
 * @throws IOException propagated from the serializer calls
 */
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
    INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");

    assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
    assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);

    // assertEquals (rather than assertTrue(x == 300)) reports the actual length on failure
    assertEquals(300, wordVector1.length());
    assertEquals(300, wordVector2.length());

    // Expected value goes first so failure messages are not inverted
    assertEquals(0.044423, wordVector1.getDouble(0), 1e-3);
    assertEquals(0.051964, wordVector2.getDouble(0), 1e-3);
}
Example 4
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Verifies binary file loading as a static model: the vector returned by the
 * statically loaded model must equal the one from the regular model load.
 *
 * @throws Exception if loading either model fails
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderBinary() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors liveModel = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors staticModel = WordVectorSerializer.loadStaticModel(binaryFile);

    final String probeWord = "Morgan_Freeman";
    INDArray liveVector = liveModel.getWordVectorMatrix(probeWord);
    INDArray staticVector = staticModel.getWordVectorMatrix(probeWord);

    assertNotEquals(null, liveVector);
    assertEquals(liveVector, staticVector);
}
Example 5
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Verifies ZIP file loading as a static model: the vector returned by the
 * statically loaded model must equal the one from {@code readWord2Vec}.
 *
 * @throws Exception if the resource cannot be resolved or a model fails to load
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderArchive() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File archive = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors liveModel = WordVectorSerializer.readWord2Vec(archive);
    WordVectors staticModel = WordVectorSerializer.loadStaticModel(archive);

    final String probeWord = "night";
    INDArray liveVector = liveModel.getWordVectorMatrix(probeWord);
    INDArray staticVector = staticModel.getWordVectorMatrix(probeWord);

    assertNotEquals(null, liveVector);
    assertEquals(liveVector, staticVector);
}
Example 6
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Loads the archived model through {@code readWord2VecModel(file, false)} and
 * checks that it yields the same vector as {@code readWord2Vec}, and that the
 * lookup table's syn1 and syn1Neg are null.
 *
 * @throws Exception if the resource cannot be resolved or a model fails to load
 */
@Test
public void testUnifiedLoaderArchive1() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File archive = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors liveModel = WordVectorSerializer.readWord2Vec(archive);
    WordVectors unifiedModel = WordVectorSerializer.readWord2VecModel(archive, false);

    final String probeWord = "night";
    INDArray liveVector = liveModel.getWordVectorMatrix(probeWord);
    INDArray unifiedVector = unifiedModel.getWordVectorMatrix(probeWord);

    assertNotEquals(null, liveVector);
    assertEquals(liveVector, unifiedVector);

    // The cast is done only after the vector checks, as in the original ordering
    InMemoryLookupTable unifiedTable = (InMemoryLookupTable) unifiedModel.lookupTable();
    assertEquals(null, unifiedTable.getSyn1());
    assertEquals(null, unifiedTable.getSyn1Neg());
}
Example 7
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Loads the archived model through {@code readWord2VecModel(file, true)} and
 * checks that it yields the same vector as {@code readWord2Vec}, and that the
 * lookup table's syn1 is not null.
 *
 * @throws Exception if the resource cannot be resolved or a model fails to load
 */
@Test
public void testUnifiedLoaderArchive2() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    File archive = new ClassPathResource("word2vec.dl4j/file.w2v").getFile();

    WordVectors liveModel = WordVectorSerializer.readWord2Vec(archive);
    WordVectors unifiedModel = WordVectorSerializer.readWord2VecModel(archive, true);

    final String probeWord = "night";
    INDArray liveVector = liveModel.getWordVectorMatrix(probeWord);
    INDArray unifiedVector = unifiedModel.getWordVectorMatrix(probeWord);

    assertNotEquals(null, liveVector);
    assertEquals(liveVector, unifiedVector);

    InMemoryLookupTable unifiedTable = (InMemoryLookupTable) unifiedModel.lookupTable();
    assertNotEquals(null, unifiedTable.getSyn1());
}
Example 8
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * This method tests CSV file loading via unified loader * * @throws Exception */ @Test public void testUnifiedLoaderText() throws Exception { logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName()); WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile); WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true); INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman"); INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman"); assertNotEquals(null, arrayLive); assertEquals(arrayLive, arrayStatic); // we're trying EXTENDED model, but file doesn't have syn1/huffman info, so it should be silently degraded to simplified model assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1()); }
Example 9
Source File: Windows.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * Constructs a list of window of size windowSize. * Note that padding for each window is created as well. * @param words the words to tokenize and construct windows from * @param tokenizerFactory tokenizer factory to use * @param windowSize the window size to generate * @return the list of windows for the tokenized string */ public static List<Window> windows(String words, @NonNull TokenizerFactory tokenizerFactory, int windowSize, WordVectors vectors) { Tokenizer tokenizer = tokenizerFactory.create(words); List<String> list = new ArrayList<>(); while (tokenizer.hasMoreTokens()) { String token = tokenizer.nextToken(); // if we don't have UNK word defined - we have to skip this word if (vectors.getWordVectorMatrix(token) != null) list.add(token); } if (list.isEmpty()) throw new IllegalStateException("No tokens found for windows"); return windows(list, windowSize); }
Example 10
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test @Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912") public void testIndexPersistence() throws Exception { File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath()); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100) .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5) .iterate(iter).tokenizerFactory(t).build(); vec.fit(); VocabCache orig = vec.getVocab(); File tempFile = File.createTempFile("temp", "w2v"); tempFile.deleteOnExit(); WordVectorSerializer.writeWordVectors(vec, tempFile); WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile); VocabCache rest = vec2.vocab(); assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs()); for (VocabWord word : vec.getVocab().vocabWords()) { INDArray array1 = vec.getWordVectorMatrix(word.getLabel()); INDArray array2 = vec2.getWordVectorMatrix(word.getLabel()); assertEquals(array1, array2); } }
Example 11
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Verifies that a static model loaded from an input stream over
 * {@code binaryFile} returns the same vector as the regular model load.
 *
 * @throws Exception if opening the stream or loading either model fails
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderFromStream() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors vectorsLive = WordVectorSerializer.readWord2VecModel(binaryFile);

    // The test owns this stream, so close it deterministically instead of leaking the file handle.
    // NOTE(review): assumes loadStaticModel consumes the stream before returning - confirm.
    WordVectors vectorsStatic;
    try (FileInputStream modelStream = new FileInputStream(binaryFile)) {
        vectorsStatic = WordVectorSerializer.loadStaticModel(modelStream);
    }

    INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman");
    INDArray arrayStatic = vectorsStatic.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, arrayLive);
    assertEquals(arrayLive, arrayStatic);
}
Example 12
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Verifies CSV file loading as a static model: the vector returned by the
 * statically loaded model must equal the one from {@code loadTxtVectors}.
 *
 * @throws Exception if loading either model fails
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors liveModel = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors staticModel = WordVectorSerializer.loadStaticModel(textFile);

    final String probeWord = "Morgan_Freeman";
    INDArray liveVector = liveModel.getWordVectorMatrix(probeWord);
    INDArray staticVector = staticModel.getWordVectorMatrix(probeWord);

    assertNotEquals(null, liveVector);
    assertEquals(liveVector, staticVector);
}
Example 13
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Verifies binary file loading via the unified loader: the vector from
 * {@code readWord2VecModel(binaryFile, false)} must equal the one from
 * {@code readWord2VecModel(binaryFile)}.
 *
 * @throws Exception if loading either model fails
 */
@Test
public void testUnifiedLoaderBinary() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors liveModel = WordVectorSerializer.readWord2VecModel(binaryFile);
    WordVectors simplifiedModel = WordVectorSerializer.readWord2VecModel(binaryFile, false);

    final String probeWord = "Morgan_Freeman";
    INDArray liveVector = liveModel.getWordVectorMatrix(probeWord);
    INDArray simplifiedVector = simplifiedModel.getWordVectorMatrix(probeWord);

    assertNotEquals(null, liveVector);
    assertEquals(liveVector, simplifiedVector);
}
Example 14
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test @Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912") public void testOutputStream() throws Exception { File file = File.createTempFile("tmp_ser", "ssa"); file.deleteOnExit(); File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = new BasicLineIterator(inputFile); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); InMemoryLookupCache cache = new InMemoryLookupCache(false); WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0) .cache(cache).lr(0.025f).build(); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100) .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5) .vocabCache(cache).seed(42) // .workers(6) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); assertEquals(new ArrayList<String>(), vec.getStopWords()); vec.fit(); INDArray day1 = vec.getWordVectorMatrix("day"); WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file)); WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file); INDArray day2 = vec2.getWordVectorMatrix("day"); assertEquals(day1, day2); File tempFile = File.createTempFile("tetsts", "Fdfs"); tempFile.deleteOnExit(); WordVectorSerializer.writeWord2VecModel(vec, tempFile); Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile); }
Example 15
Source File: TestCnnSentenceDataSetIterator.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void testCnnSentenceDataSetIteratorUseUnknownVector() throws Exception { WordVectors w2v = WordVectorSerializer .readWord2VecModel(new ClassPathResource("word2vec/googleload/sample_vec.bin").getFile()); List<String> sentences = new ArrayList<>(); sentences.add("these balance Database model"); sentences.add("into same THISWORDDOESNTEXIST are"); //Last 2 sentences - no valid words sentences.add("NOVALID WORDSHERE"); sentences.add("!!!"); List<String> labelsForSentences = Arrays.asList("Positive", "Negative", "Positive", "Negative"); LabeledSentenceProvider p = new CollectionLabeledSentenceProvider(sentences, labelsForSentences, null); CnnSentenceDataSetIterator dsi = new CnnSentenceDataSetIterator.Builder(CnnSentenceDataSetIterator.Format.CNN1D) .unknownWordHandling(CnnSentenceDataSetIterator.UnknownWordHandling.UseUnknownVector) .sentenceProvider(p).wordVectors(w2v) .useNormalizedWordVectors(true) .maxSentenceLength(256).minibatchSize(4).sentencesAlongHeight(false).build(); assertTrue(dsi.hasNext()); DataSet ds = dsi.next(); assertFalse(dsi.hasNext()); INDArray f = ds.getFeatures(); assertEquals(4, f.size(0)); INDArray unknown = w2v.getWordVectorMatrix(w2v.getUNK()); if(unknown == null) unknown = Nd4j.create(DataType.FLOAT, f.size(1)); assertEquals(unknown, f.get(NDArrayIndex.point(2), NDArrayIndex.all(), NDArrayIndex.point(0))); assertEquals(unknown, f.get(NDArrayIndex.point(2), NDArrayIndex.all(), NDArrayIndex.point(1))); assertEquals(unknown.like(), f.get(NDArrayIndex.point(2), NDArrayIndex.all(), NDArrayIndex.point(3))); assertEquals(unknown, f.get(NDArrayIndex.point(3), NDArrayIndex.all(), NDArrayIndex.point(0))); assertEquals(unknown.like(), f.get(NDArrayIndex.point(2), NDArrayIndex.all(), NDArrayIndex.point(1))); //Sanity check on single sentence loading: INDArray allKnownWords = dsi.loadSingleSentence("these balance"); INDArray withUnknown = dsi.loadSingleSentence("these NOVALID"); INDArray allUnknown = dsi.loadSingleSentence("NOVALID 
AlsoNotInVocab"); assertNotNull(allKnownWords); assertNotNull(withUnknown); assertNotNull(allUnknown); }