Java Code Examples for org.deeplearning4j.models.embeddings.loader.WordVectorSerializer#loadTxtVectors()
The following examples show how to use
org.deeplearning4j.models.embeddings.loader.WordVectorSerializer#loadTxtVectors() .
You can vote up the examples you find useful or vote down those you don't,
and you can visit the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Round-trips a binary word2vec model through the text serializer: reads the binary
 * model, writes its lookup table/vocab as text, reloads via {@code loadTxtVectors},
 * and verifies vector length and first components of two known words.
 *
 * Fixes: JUnit's assertEquals takes (expected, actual) — expected values now come
 * first; length checks use assertEquals instead of assertTrue(x == y) for better
 * failure messages; Guava's Doubles.asList(...).get(0) replaced by plain array
 * indexing.
 *
 * @throws IOException if the model files cannot be read or written
 */
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectorSerializer.writeWordVectors(lookupTable, lookupCache, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");

    // Expected first, actual second (JUnit convention).
    assertEquals(300, wordVector1.length);
    assertEquals(300, wordVector2.length);
    assertEquals(0.044423, wordVector1[0], 1e-3);
    assertEquals(0.051964, wordVector2[0], 1e-3);
}
Example 2
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Writes a Word2Vec model to text form and reloads it with {@code loadTxtVectors},
 * asserting that the reloaded vectors match the originals exactly and that two known
 * words have the expected length and first component.
 *
 * Fixes: assertEquals argument order corrected to (expected, actual); length checks
 * use assertEquals rather than assertTrue(x == y) so failures report both values.
 *
 * @throws IOException if the model files cannot be read or written
 */
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
    INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");

    // Reloaded vectors must be identical to the in-memory originals.
    assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
    assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);

    // Expected first, actual second (JUnit convention).
    assertEquals(300, wordVector1.length());
    assertEquals(300, wordVector2.length());
    assertEquals(0.044423, wordVector1.getDouble(0), 1e-3);
    assertEquals(0.051964, wordVector2.getDouble(0), 1e-3);
}
Example 3
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * This method tests CSV file loading via unified loader * * @throws Exception */ @Test public void testUnifiedLoaderText() throws Exception { logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName()); WordVectors vectorsLive = WordVectorSerializer.loadTxtVectors(textFile); WordVectors vectorsUnified = WordVectorSerializer.readWord2VecModel(textFile, true); INDArray arrayLive = vectorsLive.getWordVectorMatrix("Morgan_Freeman"); INDArray arrayStatic = vectorsUnified.getWordVectorMatrix("Morgan_Freeman"); assertNotEquals(null, arrayLive); assertEquals(arrayLive, arrayStatic); // we're trying EXTENDED model, but file doesn't have syn1/huffman info, so it should be silently degraded to simplified model assertEquals(null, ((InMemoryLookupTable) vectorsUnified.lookupTable()).getSyn1()); }
Example 4
Source File: Word2VecTests.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test public void testLoadingWordVectors() throws Exception { String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend"); if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) { skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X } File modelFile = new File(pathToWriteto); if (!modelFile.exists()) { testRunWord2Vec(); } WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(modelFile); Collection<String> lst = wordVectors.wordsNearest("day", 10); System.out.println(Arrays.toString(lst.toArray())); }
Example 5
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test @Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912") public void testIndexPersistence() throws Exception { File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath()); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100) .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5) .iterate(iter).tokenizerFactory(t).build(); vec.fit(); VocabCache orig = vec.getVocab(); File tempFile = File.createTempFile("temp", "w2v"); tempFile.deleteOnExit(); WordVectorSerializer.writeWordVectors(vec, tempFile); WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile); VocabCache rest = vec2.vocab(); assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs()); for (VocabWord word : vec.getVocab().vocabWords()) { INDArray array1 = vec.getWordVectorMatrix(word.getLabel()); INDArray array2 = vec2.getWordVectorMatrix(word.getLabel()); assertEquals(array1, array2); } }
Example 6
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Ad-hoc loader check (ignored): loads a local vectors file and logs one
 * word's vector. Depends on a developer-machine path, hence {@code @Ignore}.
 *
 * @throws Exception if loading fails
 */
@Test
@Ignore
public void testLoader() throws Exception {
    File vectorsFile = new File("/home/raver119/Downloads/_vectors.txt");
    WordVectors vec = WordVectorSerializer.loadTxtVectors(vectorsFile);
    logger.info("Rewinding: " + Arrays.toString(vec.getWordVector("rewinding")));
}
Example 7
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Verifies that the static loader ({@code loadStaticModel}) produces the same
 * vector for a known word as the live text loader ({@code loadTxtVectors}).
 *
 * @throws Exception on any loading failure
 */
@Test
@Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912")
public void testStaticLoaderText() throws Exception {
    logger.info("Executor name: {}", Nd4j.getExecutioner().getClass().getSimpleName());

    WordVectors liveModel = WordVectorSerializer.loadTxtVectors(textFile);
    WordVectors staticModel = WordVectorSerializer.loadStaticModel(textFile);

    INDArray liveVector = liveModel.getWordVectorMatrix("Morgan_Freeman");
    INDArray staticVector = staticModel.getWordVectorMatrix("Morgan_Freeman");

    assertNotEquals(null, liveVector);
    assertEquals(liveVector, staticVector);
}
Example 8
Source File: Word2VecTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Ad-hoc Portuguese model check (ignored): loads vectors from a local file,
 * switches to flat (exact) similarity lookups, and prints nearest neighbours
 * for two query words. Depends on a developer-machine path.
 *
 * @throws Exception if loading fails
 */
@Test
@Ignore
public void testPortugeseW2V() throws Exception {
    WordVectors word2Vec = WordVectorSerializer.loadTxtVectors(new File("/ext/Temp/para.txt"));
    // Flat model utils = exhaustive (non-approximate) nearest-neighbour search.
    word2Vec.setModelUtils(new FlatModelUtils());

    Collection<String> neighbours = word2Vec.wordsNearest("carro", 10);
    printWords("carro", neighbours, word2Vec);

    neighbours = word2Vec.wordsNearest("davi", 10);
    printWords("davi", neighbours, word2Vec);
}
Example 9
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test @Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912") public void testOutputStream() throws Exception { File file = File.createTempFile("tmp_ser", "ssa"); file.deleteOnExit(); File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = new BasicLineIterator(inputFile); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); InMemoryLookupCache cache = new InMemoryLookupCache(false); WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0) .cache(cache).lr(0.025f).build(); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100) .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5) .vocabCache(cache).seed(42) // .workers(6) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); assertEquals(new ArrayList<String>(), vec.getStopWords()); vec.fit(); INDArray day1 = vec.getWordVectorMatrix("day"); WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file)); WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file); INDArray day2 = vec2.getWordVectorMatrix("day"); assertEquals(day1, day2); File tempFile = File.createTempFile("tetsts", "Fdfs"); tempFile.deleteOnExit(); WordVectorSerializer.writeWord2VecModel(vec, tempFile); Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile); }
Example 10
Source File: Word2VecTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void testConcepts() throws Exception { // These are all default values for word2vec SparkConf sparkConf = new SparkConf().setMaster("local[8]") .set("spark.driver.host", "localhost") .setAppName("sparktest"); // Set SparkContext JavaSparkContext sc = new JavaSparkContext(sparkConf); // Path of data part-00000 String dataPath = new ClassPathResource("big/raw_sentences.txt").getFile().getAbsolutePath(); // dataPath = "/ext/Temp/part-00000"; // String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath(); // Read in data JavaRDD<String> corpus = sc.textFile(dataPath); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1) // .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory") // .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor") // .setRemoveStop(false) .tokenizerFactory(t).seed(42L).negative(10).useAdaGrad(false).layerSize(150).windowSize(5) .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5) .stopWords(Arrays.asList("three")).useUnknown(true).build(); word2Vec.train(corpus); //word2Vec.setModelUtils(new FlatModelUtils()); System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK")); InMemoryLookupTable<VocabWord> table = (InMemoryLookupTable<VocabWord>) word2Vec.lookupTable(); double sim = word2Vec.similarity("day", "night"); System.out.println("day/night similarity: " + sim); /* System.out.println("Hornjo: " + word2Vec.getWordVectorMatrix("hornjoserbsce")); System.out.println("carro: " + word2Vec.getWordVectorMatrix("carro")); Collection<String> portu = word2Vec.wordsNearest("carro", 10); printWords("carro", portu, word2Vec); portu = word2Vec.wordsNearest("davi", 10); printWords("davi", portu, word2Vec); System.out.println("---------------------------------------"); */ 
Collection<String> words = word2Vec.wordsNearest("day", 10); printWords("day", words, word2Vec); assertTrue(words.contains("night")); assertTrue(words.contains("week")); assertTrue(words.contains("year")); sim = word2Vec.similarity("two", "four"); System.out.println("two/four similarity: " + sim); words = word2Vec.wordsNearest("two", 10); printWords("two", words, word2Vec); // three should be absent due to stopWords assertFalse(words.contains("three")); assertTrue(words.contains("five")); assertTrue(words.contains("four")); sc.stop(); // test serialization File tempFile = testDir.newFile("temp" + System.currentTimeMillis() + ".tmp"); int idx1 = word2Vec.vocab().wordFor("day").getIndex(); INDArray array1 = word2Vec.getWordVectorMatrix("day").dup(); VocabWord word1 = word2Vec.vocab().elementAtIndex(0); WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), tempFile); WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile); VocabWord word2 = ((VocabCache<VocabWord>) vectors.vocab()).elementAtIndex(0); VocabWord wordIT = ((VocabCache<VocabWord>) vectors.vocab()).wordFor("it"); int idx2 = vectors.vocab().wordFor("day").getIndex(); INDArray array2 = vectors.getWordVectorMatrix("day").dup(); System.out.println("word 'i': " + word2); System.out.println("word 'it': " + wordIT); assertEquals(idx1, idx2); assertEquals(word1, word2); assertEquals(array1, array2); }
Example 11
Source File: UITest.java From deeplearning4j with Apache License 2.0 | 3 votes |
@Test public void testPosting() throws Exception { // File inputFile = Resources.asFile("big/raw_sentences.txt"); File inputFile = new ClassPathResource("/basic/word2vec_advance.txt").getFile(); SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath()); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).epochs(1).layerSize(20) .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5) .iterate(iter).tokenizerFactory(t).build(); vec.fit(); File tempFile = File.createTempFile("temp", "w2v"); tempFile.deleteOnExit(); WordVectorSerializer.writeWordVectors(vec, tempFile); WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile); UIServer.getInstance(); //Initialize UiConnectionInfo uiConnectionInfo = new UiConnectionInfo.Builder().setAddress("localhost").setPort(9000).build(); BarnesHutTsne tsne = new BarnesHutTsne.Builder().normalize(false).setFinalMomentum(0.8f).numDimension(2) .setMaxIter(10).build(); vectors.lookupTable().plotVocab(tsne, vectors.lookupTable().getVocabCache().numWords(), uiConnectionInfo); Thread.sleep(100000); }