Java Code Examples for org.deeplearning4j.models.embeddings.loader.WordVectorSerializer#writeWordVectors()
The following examples show how to use org.deeplearning4j.models.embeddings.loader.WordVectorSerializer#writeWordVectors().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Round-trips word vectors through the text format: reads the model from {@code binaryFile},
 * writes its lookup table and vocab cache to {@code pathToWriteto}, reloads that file and
 * checks the length and first component of the vectors for two known words.
 *
 * @throws IOException if reading or writing the vectors fails
 */
@Test
@Ignore
public void testWriteWordVectors() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile);
    InMemoryLookupTable lookupTable = (InMemoryLookupTable) vec.lookupTable();
    InMemoryLookupCache lookupCache = (InMemoryLookupCache) vec.vocab();
    WordVectorSerializer.writeWordVectors(lookupTable, lookupCache, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    double[] wordVector1 = wordVectors.getWordVector("Morgan_Freeman");
    double[] wordVector2 = wordVectors.getWordVector("JA_Montalbano");

    // Expected value goes first so a failure message reports expected/actual the right way round;
    // assertEquals also prints the offending length, which assertTrue(a == b) would hide.
    assertEquals(300, wordVector1.length);
    assertEquals(300, wordVector2.length);
    assertEquals(0.044423, wordVector1[0], 1e-3);
    assertEquals(0.051964, wordVector2[0], 1e-3);
}
Example 2
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Round-trips a {@link Word2Vec} model through the text format: reads the model from
 * {@code binaryFile}, writes it to {@code pathToWriteto}, reloads that file and checks that
 * the vectors for two known words match the originals.
 *
 * @throws IOException if reading or writing the vectors fails
 */
@Test
@Ignore
public void testWriteWordVectorsFromWord2Vec() throws IOException {
    WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true);
    WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto);

    WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto));
    INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman");
    INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano");

    // The reloaded vectors must equal the ones held by the source model.
    assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1);
    assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2);

    // Expected value first, so failure messages are not inverted and show the real length.
    assertEquals(300, wordVector1.length());
    assertEquals(300, wordVector2.length());
    assertEquals(0.044423, wordVector1.getDouble(0), 1e-3);
    assertEquals(0.051964, wordVector2.getDouble(0), 1e-3);
}
Example 3
Source File: ChineseTokenizerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Ignore @Test public void testFindNamesFromText() throws IOException { SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt"); log.info("load is right!"); TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory(); //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer()); //Generates a word-vector from the dataset stored in resources folder Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42) .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build(); vec.fit(); WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt")); //trains a model that can find out all names from news(Suffix txt),It uses word vector generated // WordVectors wordVectors; //test model,Whether the model find out name from unknow text; }
Example 4
Source File: Word2VecModelExample.java From Java-Deep-Learning-Cookbook with MIT License | 5 votes |
public static void main(String[] args) throws Exception { final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile()); SentenceDataPreProcessor.setPreprocessor(iterator); final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor()); final Word2Vec model = new Word2Vec.Builder() .iterate(iterator) .tokenizerFactory(tokenizerFactory) .minWordFrequency(5) .layerSize(100) .seed(42) .epochs(50) .windowSize(5) .build(); log.info("Fitting Word2Vec model...."); model.fit(); final Collection<String> words = model.wordsNearest("season",10); for(final String word: words){ System.out.println(word+ " "); } final double cosSimilarity = model.similarity("season","program"); System.out.println(cosSimilarity); BarnesHutTsne tsne = new BarnesHutTsne.Builder() .setMaxIter(100) .theta(0.5) .normalize(false) .learningRate(500) .useAdaGrad(false) .build(); //save word vectors for tSNE visualization. WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt")); WordVectorSerializer.writeWord2VecModel(model, "model.zip"); }
Example 5
Source File: Word2VecModelExample.java From Java-Deep-Learning-Cookbook with MIT License | 5 votes |
public static void main(String[] args) throws Exception { final SentenceIterator iterator = new LineSentenceIterator(new ClassPathResource("raw_sentences_large.txt").getFile()); SentenceDataPreProcessor.setPreprocessor(iterator); final TokenizerFactory tokenizerFactory = new DefaultTokenizerFactory(); tokenizerFactory.setTokenPreProcessor(new EndingPreProcessor()); final Word2Vec model = new Word2Vec.Builder() .iterate(iterator) .tokenizerFactory(tokenizerFactory) .minWordFrequency(5) .layerSize(100) .seed(42) .epochs(50) .windowSize(5) .build(); log.info("Fitting Word2Vec model...."); model.fit(); final Collection<String> words = model.wordsNearest("season",10); for(final String word: words){ System.out.println(word+ " "); } final double cosSimilarity = model.similarity("season","program"); System.out.println(cosSimilarity); BarnesHutTsne tsne = new BarnesHutTsne.Builder() .setMaxIter(100) .theta(0.5) .normalize(false) .learningRate(500) .useAdaGrad(false) .build(); //save word vectors for tSNE visualization. WordVectorSerializer.writeWordVectors(model.lookupTable(),new File("words.txt")); WordVectorSerializer.writeWord2VecModel(model, "model.zip"); }
Example 6
Source File: Word2VecRawTextExample.java From Java-Data-Science-Cookbook with MIT License | 5 votes |
public static void main(String[] args) throws Exception { // Gets Path to Text file String filePath = "c:/raw_sentences.txt"; log.info("Load & Vectorize Sentences...."); // Strip white space before and after for each line SentenceIterator iter = UimaSentenceIterator.createWithPath(filePath); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); InMemoryLookupCache cache = new InMemoryLookupCache(); WeightLookupTable table = new InMemoryLookupTable.Builder() .vectorLength(100) .useAdaGrad(false) .cache(cache) .lr(0.025f).build(); log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder() .minWordFrequency(5).iterations(1) .layerSize(100).lookupTable(table) .stopWords(new ArrayList<String>()) .vocabCache(cache).seed(42) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); log.info("Fitting Word2Vec model...."); vec.fit(); log.info("Writing word vectors to text file...."); // Write word WordVectorSerializer.writeWordVectors(vec, "word2vec.txt"); log.info("Closest Words:"); Collection<String> lst = vec.wordsNearest("man", 5); System.out.println(lst); double cosSim = vec.similarity("cruise", "voyage"); System.out.println(cosSim); }
Example 7
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test @Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912") public void testIndexPersistence() throws Exception { File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath()); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100) .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5) .iterate(iter).tokenizerFactory(t).build(); vec.fit(); VocabCache orig = vec.getVocab(); File tempFile = File.createTempFile("temp", "w2v"); tempFile.deleteOnExit(); WordVectorSerializer.writeWordVectors(vec, tempFile); WordVectors vec2 = WordVectorSerializer.loadTxtVectors(tempFile); VocabCache rest = vec2.vocab(); assertEquals(orig.totalNumberOfDocs(), rest.totalNumberOfDocs()); for (VocabWord word : vec.getVocab().vocabWords()) { INDArray array1 = vec.getWordVectorMatrix(word.getLabel()); INDArray array2 = vec2.getWordVectorMatrix(word.getLabel()); assertEquals(array1, array2); } }
Example 8
Source File: Word2VecTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Ignore @Test public void testSparkW2VonBiggerCorpus() throws Exception { SparkConf sparkConf = new SparkConf().setMaster("local[8]").setAppName("sparktest") .set("spark.driver.host", "localhost") .set("spark.driver.maxResultSize", "4g").set("spark.driver.memory", "8g") .set("spark.executor.memory", "8g"); // Set SparkContext JavaSparkContext sc = new JavaSparkContext(sparkConf); // Path of data part-00000 //String dataPath = Resources.asFile("big/raw_sentences.txt").getAbsolutePath(); // String dataPath = "/ext/Temp/SampleRussianCorpus.txt"; String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath(); // Read in data JavaRDD<String> corpus = sc.textFile(dataPath); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new LowCasePreProcessor()); Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1) // .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory") // .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor") // .setRemoveStop(false) .tokenizerFactory(t).seed(42L).negative(3).useAdaGrad(false).layerSize(100).windowSize(5) .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5) .useUnknown(true).build(); word2Vec.train(corpus); sc.stop(); WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), "/ext/Temp/sparkRuModel.txt"); }
Example 9
Source File: Word2VecTests.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void testRunWord2Vec() throws Exception { String backend = Nd4j.getExecutioner().getEnvironmentInformation().getProperty("backend"); if(!isIntegrationTests() && "CUDA".equalsIgnoreCase(backend)) { skipUnlessIntegrationTests(); //AB 2020/02/06 Skip CUDA except for integration tests due to very slow test speed - > 5 minutes on Titan X } // Strip white space before and after for each line /*val shakespear = new ClassPathResource("big/rnj.txt"); SentenceIterator iter = new BasicLineIterator(shakespear.getFile());*/ SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath()); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(1).batchSize(8192).layerSize(100) .stopWords(new ArrayList<String>()).seed(42).learningRate(0.025).minLearningRate(0.001) .sampling(0).elementsLearningAlgorithm(new SkipGram<VocabWord>()) //.negativeSample(10) .epochs(1).windowSize(5).allowParallelTokenization(true) .workers(6) .usePreciseMode(true) .modelUtils(new BasicModelUtils<VocabWord>()).iterate(iter).tokenizerFactory(t).build(); assertEquals(new ArrayList<String>(), vec.getStopWords()); vec.fit(); File tempFile = File.createTempFile("temp", "temp"); tempFile.deleteOnExit(); WordVectorSerializer.writeFullModel(vec, tempFile.getAbsolutePath()); Collection<String> lst = vec.wordsNearest("day", 10); //log.info(Arrays.toString(lst.toArray())); printWords("day", lst, vec); assertEquals(10, lst.size()); double sim = vec.similarity("day", "night"); log.info("Day/night similarity: " + sim); assertTrue(sim < 1.0); assertTrue(sim > 0.4); assertTrue(lst.contains("week")); assertTrue(lst.contains("night")); assertTrue(lst.contains("year")); assertFalse(lst.contains(null)); lst = vec.wordsNearest("day", 10); //log.info(Arrays.toString(lst.toArray())); printWords("day", lst, vec); 
assertTrue(lst.contains("week")); assertTrue(lst.contains("night")); assertTrue(lst.contains("year")); new File("cache.ser").delete(); ArrayList<String> labels = new ArrayList<>(); labels.add("day"); labels.add("night"); labels.add("week"); INDArray matrix = vec.getWordVectors(labels); assertEquals(matrix.getRow(0, true), vec.getWordVectorMatrix("day")); assertEquals(matrix.getRow(1, true), vec.getWordVectorMatrix("night")); assertEquals(matrix.getRow(2, true), vec.getWordVectorMatrix("week")); WordVectorSerializer.writeWordVectors(vec, pathToWriteto); }
Example 10
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test @Ignore("AB 2019/06/24 - Failing: Ignored to get to all passing baseline to prevent regressions via CI - see issue #7912") public void testOutputStream() throws Exception { File file = File.createTempFile("tmp_ser", "ssa"); file.deleteOnExit(); File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = new BasicLineIterator(inputFile); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); InMemoryLookupCache cache = new InMemoryLookupCache(false); WeightLookupTable table = new InMemoryLookupTable.Builder().vectorLength(100).useAdaGrad(false).negative(5.0) .cache(cache).lr(0.025f).build(); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(1).epochs(1).layerSize(100) .lookupTable(table).stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5) .vocabCache(cache).seed(42) // .workers(6) .windowSize(5).iterate(iter).tokenizerFactory(t).build(); assertEquals(new ArrayList<String>(), vec.getStopWords()); vec.fit(); INDArray day1 = vec.getWordVectorMatrix("day"); WordVectorSerializer.writeWordVectors(vec, new FileOutputStream(file)); WordVectors vec2 = WordVectorSerializer.loadTxtVectors(file); INDArray day2 = vec2.getWordVectorMatrix("day"); assertEquals(day1, day2); File tempFile = File.createTempFile("tetsts", "Fdfs"); tempFile.deleteOnExit(); WordVectorSerializer.writeWord2VecModel(vec, tempFile); Word2Vec vec3 = WordVectorSerializer.readWord2VecModel(tempFile); }
Example 11
Source File: Word2VecTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
@Test public void testConcepts() throws Exception { // These are all default values for word2vec SparkConf sparkConf = new SparkConf().setMaster("local[8]") .set("spark.driver.host", "localhost") .setAppName("sparktest"); // Set SparkContext JavaSparkContext sc = new JavaSparkContext(sparkConf); // Path of data part-00000 String dataPath = new ClassPathResource("big/raw_sentences.txt").getFile().getAbsolutePath(); // dataPath = "/ext/Temp/part-00000"; // String dataPath = new ClassPathResource("spark_word2vec_test.txt").getFile().getAbsolutePath(); // Read in data JavaRDD<String> corpus = sc.textFile(dataPath); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec word2Vec = new Word2Vec.Builder().setNGrams(1) // .setTokenizer("org.deeplearning4j.text.tokenization.tokenizerfactory.DefaultTokenizerFactory") // .setTokenPreprocessor("org.deeplearning4j.text.tokenization.tokenizer.preprocessor.CommonPreprocessor") // .setRemoveStop(false) .tokenizerFactory(t).seed(42L).negative(10).useAdaGrad(false).layerSize(150).windowSize(5) .learningRate(0.025).minLearningRate(0.0001).iterations(1).batchSize(100).minWordFrequency(5) .stopWords(Arrays.asList("three")).useUnknown(true).build(); word2Vec.train(corpus); //word2Vec.setModelUtils(new FlatModelUtils()); System.out.println("UNK: " + word2Vec.getWordVectorMatrix("UNK")); InMemoryLookupTable<VocabWord> table = (InMemoryLookupTable<VocabWord>) word2Vec.lookupTable(); double sim = word2Vec.similarity("day", "night"); System.out.println("day/night similarity: " + sim); /* System.out.println("Hornjo: " + word2Vec.getWordVectorMatrix("hornjoserbsce")); System.out.println("carro: " + word2Vec.getWordVectorMatrix("carro")); Collection<String> portu = word2Vec.wordsNearest("carro", 10); printWords("carro", portu, word2Vec); portu = word2Vec.wordsNearest("davi", 10); printWords("davi", portu, word2Vec); System.out.println("---------------------------------------"); */ 
Collection<String> words = word2Vec.wordsNearest("day", 10); printWords("day", words, word2Vec); assertTrue(words.contains("night")); assertTrue(words.contains("week")); assertTrue(words.contains("year")); sim = word2Vec.similarity("two", "four"); System.out.println("two/four similarity: " + sim); words = word2Vec.wordsNearest("two", 10); printWords("two", words, word2Vec); // three should be absent due to stopWords assertFalse(words.contains("three")); assertTrue(words.contains("five")); assertTrue(words.contains("four")); sc.stop(); // test serialization File tempFile = testDir.newFile("temp" + System.currentTimeMillis() + ".tmp"); int idx1 = word2Vec.vocab().wordFor("day").getIndex(); INDArray array1 = word2Vec.getWordVectorMatrix("day").dup(); VocabWord word1 = word2Vec.vocab().elementAtIndex(0); WordVectorSerializer.writeWordVectors(word2Vec.getLookupTable(), tempFile); WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile); VocabWord word2 = ((VocabCache<VocabWord>) vectors.vocab()).elementAtIndex(0); VocabWord wordIT = ((VocabCache<VocabWord>) vectors.vocab()).wordFor("it"); int idx2 = vectors.vocab().wordFor("day").getIndex(); INDArray array2 = vectors.getWordVectorMatrix("day").dup(); System.out.println("word 'i': " + word2); System.out.println("word 'it': " + wordIT); assertEquals(idx1, idx2); assertEquals(word1, word2); assertEquals(array1, array2); }
Example 12
Source File: UITest.java From deeplearning4j with Apache License 2.0 | 3 votes |
@Test public void testPosting() throws Exception { // File inputFile = Resources.asFile("big/raw_sentences.txt"); File inputFile = new ClassPathResource("/basic/word2vec_advance.txt").getFile(); SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath()); // Split on white spaces in the line to get words TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).epochs(1).layerSize(20) .stopWords(new ArrayList<String>()).useAdaGrad(false).negativeSample(5).seed(42).windowSize(5) .iterate(iter).tokenizerFactory(t).build(); vec.fit(); File tempFile = File.createTempFile("temp", "w2v"); tempFile.deleteOnExit(); WordVectorSerializer.writeWordVectors(vec, tempFile); WordVectors vectors = WordVectorSerializer.loadTxtVectors(tempFile); UIServer.getInstance(); //Initialize UiConnectionInfo uiConnectionInfo = new UiConnectionInfo.Builder().setAddress("localhost").setPort(9000).build(); BarnesHutTsne tsne = new BarnesHutTsne.Builder().normalize(false).setFinalMomentum(0.8f).numDimension(2) .setMaxIter(10).build(); vectors.lookupTable().plotVocab(tsne, vectors.lookupTable().getVocabCache().numWords(), uiConnectionInfo); Thread.sleep(100000); }