org.deeplearning4j.models.word2vec.Word2Vec Java Examples
The following examples show how to use
org.deeplearning4j.models.word2vec.Word2Vec.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: ManualTests.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test(timeout = 300000) public void testWord2VecPlot() throws Exception { File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath()); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(5).iterations(2).batchSize(1000).learningRate(0.025) .layerSize(100).seed(42).sampling(0).negativeSample(0).windowSize(5) .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(10) .tokenizerFactory(t).build(); vec.fit(); // UiConnectionInfo connectionInfo = UiServer.getInstance().getConnectionInfo(); // vec.getLookupTable().plotVocab(100, connectionInfo); Thread.sleep(10000000000L); fail("Not implemented"); }
Example #2
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 6 votes |
public static Word2Vec readAsBinaryNoLineBreaks(@NonNull InputStream inputStream) { boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency(); // try to load without linebreaks try { if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(true); } Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq); return readBinaryModel(inputStream, false, false); } catch (Exception readModelException) { log.error("Cannot read binary model", readModelException); throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly"); } }
Example #3
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * This method loads Word2Vec model from binary input stream. * * @param inputStream binary input stream * @return Word2Vec */ public static Word2Vec readAsBinary(@NonNull InputStream inputStream) { boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency(); // we fallback to trying binary model instead try { log.debug("Trying binary model restoration..."); if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(true); } Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq); return readBinaryModel(inputStream, true, false); } catch (Exception readModelException) { throw new RuntimeException(readModelException); } }
Example #4
Source File: ChineseTokenizerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Ignore @Test public void testFindNamesFromText() throws IOException { SentenceIterator iter = new BasicLineIterator("src/test/resources/chineseName.txt"); log.info("load is right!"); TokenizerFactory tokenizerFactory = new ChineseTokenizerFactory(); //tokenizerFactory.setTokenPreProcessor(new ChineseTokenizer()); //Generates a word-vector from the dataset stored in resources folder Word2Vec vec = new Word2Vec.Builder().minWordFrequency(2).iterations(5).layerSize(100).seed(42) .learningRate(0.1).windowSize(20).iterate(iter).tokenizerFactory(tokenizerFactory).build(); vec.fit(); WordVectorSerializer.writeWordVectors(vec, new File("src/test/resources/chineseNameWordVector.txt")); //trains a model that can find out all names from news(Suffix txt),It uses word vector generated // WordVectors wordVectors; //test model,Whether the model find out name from unknow text; }
Example #5
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * This method loads Word2Vec model from input stream * * @param stream InputStream * @param readExtendedTable boolean * @return Word2Vec */ public static Word2Vec readWord2Vec( @NonNull InputStream stream, boolean readExtendedTable) throws IOException { SequenceVectors<VocabWord> vectors = readSequenceVectors(stream, readExtendedTable); Word2Vec word2Vec = new Word2Vec .Builder(vectors.getConfiguration()) .layerSize(vectors.getLayerSize()) .build(); word2Vec.setVocab(vectors.getVocab()); word2Vec.setLookupTable(vectors.lookupTable()); word2Vec.setModelUtils(vectors.getModelUtils()); return word2Vec; }
Example #6
Source File: WordVectorSerializerTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test @Ignore public void testWriteWordVectorsFromWord2Vec() throws IOException { WordVectors vec = WordVectorSerializer.readWord2VecModel(binaryFile, true); WordVectorSerializer.writeWordVectors((Word2Vec) vec, pathToWriteto); WordVectors wordVectors = WordVectorSerializer.loadTxtVectors(new File(pathToWriteto)); INDArray wordVector1 = wordVectors.getWordVectorMatrix("Morgan_Freeman"); INDArray wordVector2 = wordVectors.getWordVectorMatrix("JA_Montalbano"); assertEquals(vec.getWordVectorMatrix("Morgan_Freeman"), wordVector1); assertEquals(vec.getWordVectorMatrix("JA_Montalbano"), wordVector2); assertTrue(wordVector1.length() == 300); assertTrue(wordVector2.length() == 300); assertEquals(wordVector1.getDouble(0), 0.044423, 1e-3); assertEquals(wordVector2.getDouble(0), 0.051964, 1e-3); }
Example #7
Source File: Word2VecIteratorTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testLabeledExample() throws Exception { INDArray unk = vec.getWordVectorMatrix(Word2Vec.DEFAULT_UNK); assertNotEquals(null, unk); unk = vec.getWordVectorMatrix("2131241sdasdas"); assertNotEquals(null, unk); ClassPathResource resource = new ClassPathResource("/labeled/"); File dir = testDir.newFolder(); resource.copyDirectory(dir); Word2VecDataSetIterator iter = new Word2VecDataSetIterator(vec, new LabelAwareFileSentenceIterator(null, dir), Arrays.asList("negative", "positive", "neutral")); DataSet next = iter.next(); }
Example #8
Source File: Word2VecIteratorTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Before public void before() throws Exception { if (vec == null) { ClassPathResource resource = new ClassPathResource("/labeled/"); File dir = testDir.newFolder(); resource.copyDirectory(dir); SentenceIterator iter = UimaSentenceIterator.createWithPath(dir.getAbsolutePath()); new File("cache.ser").delete(); TokenizerFactory t = new UimaTokenizerFactory(); vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).layerSize(100) .stopWords(new ArrayList<String>()).useUnknown(true).windowSize(5).iterate(iter) .tokenizerFactory(t).build(); vec.fit(); } }
Example #9
Source File: VectorsConfigurationTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test(timeout = 300000) public void testFromW2V() throws Exception { VectorsConfiguration configuration = new VectorsConfiguration(); configuration.setHugeModelExpected(true); configuration.setWindow(5); configuration.setIterations(3); configuration.setLayersSize(200); configuration.setLearningRate(1.4d); configuration.setSampling(0.0005d); configuration.setMinLearningRate(0.25d); configuration.setEpochs(1); File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = UimaSentenceIterator.createWithPath(inputFile.getAbsolutePath()); Word2Vec vec = new Word2Vec.Builder(configuration).iterate(iter).build(); VectorsConfiguration configuration2 = vec.getConfiguration(); assertEquals(configuration, configuration2); }
Example #10
Source File: PerformanceTests.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Ignore @Test public void testWord2VecCBOWBig() throws Exception { SentenceIterator iter = new BasicLineIterator("/home/raver119/Downloads/corpus/namuwiki_raw.txt"); //iter = new BasicLineIterator("/home/raver119/Downloads/corpus/ru_sentences.txt"); //SentenceIterator iter = new BasicLineIterator("/ext/DATASETS/ru/Socials/ru_sentences.txt"); TokenizerFactory t = new KoreanTokenizerFactory(); //t = new DefaultTokenizerFactory(); //t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(1).iterations(5).learningRate(0.025).layerSize(150) .seed(42).sampling(0).negativeSample(0).useHierarchicSoftmax(true).windowSize(5) .modelUtils(new BasicModelUtils<VocabWord>()).useAdaGrad(false).iterate(iter).workers(8) .allowParallelTokenization(true).tokenizerFactory(t) .elementsLearningAlgorithm(new CBOW<VocabWord>()).build(); long time1 = System.currentTimeMillis(); vec.fit(); long time2 = System.currentTimeMillis(); log.info("Total execution time: {}", (time2 - time1)); }
Example #11
Source File: Word2VecCN.java From word2vec with Apache License 2.0 | 6 votes |
public Word2Vec fit() { log.info("Building model...."); Word2Vec vec = new Word2Vec.Builder() .minWordFrequency(minWordFrequency) .iterations(iterations) .layerSize(layerSize) .seed(seed) .windowSize(windowSize) .iterate(sentenceIterator) .tokenizerFactory(tokenizerFactory) .build(); log.info("Fitting Word2Vec model...."); vec.fit(); return vec; }
Example #12
Source File: WindowConverter.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * Converts a window (each word in the window) * * in to a vector. * * Keep in mind each window is a multi word context. * * From there, each word uses the passed in model * as a lookup table to get what vectors are relevant * to the passed in windows * @param window the window to take in. * @param vec the model to use as a lookup table * @return a concacneated 1 row array * containing all of the numbers for each word in the window */ public static INDArray asExampleArray(Window window, Word2Vec vec, boolean normalize) { int length = vec.lookupTable().layerSize(); List<String> words = window.getWords(); int windowSize = vec.getWindow(); Preconditions.checkState(words.size() == vec.getWindow()); INDArray ret = Nd4j.create(1, length * windowSize); for (int i = 0; i < words.size(); i++) { String word = words.get(i); INDArray n = normalize ? vec.getWordVectorMatrixNormalized(word) : vec.getWordVectorMatrix(word); ret.put(new INDArrayIndex[] {NDArrayIndex.interval(i * vec.lookupTable().layerSize(), i * vec.lookupTable().layerSize() + vec.lookupTable().layerSize())}, n); } return ret; }
Example #13
Source File: FastTextTest.java From deeplearning4j with Apache License 2.0 | 6 votes |
@Test public void testWordsStatistics() throws IOException { File output = testDir.newFile(); FastText fastText = FastText .builder() .supervised(true) .inputFile(inputFile.getAbsolutePath()) .outputFile(output.getAbsolutePath()) .build(); log.info("\nTraining supervised model ...\n"); fastText.fit(); File file = new File(output.getAbsolutePath() + ".vec"); Word2Vec word2Vec = WordVectorSerializer.readAsCsv(file); assertEquals(48, word2Vec.getVocab().numWords()); assertEquals("", 0.1667751520872116, word2Vec.similarity("Football", "teams"), 2e-3); assertEquals("", 0.10083991289138794, word2Vec.similarity("professional", "minutes"), 2e-3); assertEquals("", Double.NaN, word2Vec.similarity("java","cpp"), 0.0); assertThat(word2Vec.wordsNearest("association", 3), hasItems("Football", "Soccer", "men's")); }
Example #14
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method just loads full compressed model. */ private static Word2Vec readAsExtendedModel(@NonNull File file) throws IOException { int originalFreq = Nd4j.getMemoryManager().getOccasionalGcFrequency(); boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); log.debug("Trying full model restoration..."); if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(true); } Nd4j.getMemoryManager().setOccasionalGcFrequency(originalFreq); return readWord2Vec(file); }
Example #15
Source File: GoogleNewsVectorExample.java From Java-Deep-Learning-Cookbook with MIT License | 5 votes |
public static void main(String[] args) { try{ File file = new File("{PATH-TO-GOOGLE-WORD-VECTOR}"); Word2Vec model = WordVectorSerializer.readWord2VecModel(file); System.out.println(Arrays.asList(model.wordsNearest("season",10))); } catch(ND4JIllegalStateException e){ System.out.println("Please provide proper directory path in place of: PATH-TO-GOOGLE-WORD-VECTOR"); } }
Example #16
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method * 1) Binary model, either compressed or not. Like well-known Google Model * 2) Popular CSV word2vec text format * 3) DL4j compressed format * <p> * Please note: if extended data isn't available, only weights will be loaded instead. * * @param file model file * @param extendedModel if TRUE, we'll try to load HS states & Huffman tree info, if FALSE, only weights will be loaded * @return word2vec model */ public static Word2Vec readWord2VecModel(File file, boolean extendedModel) { if (!file.exists() || !file.isFile()) { throw new ND4JIllegalStateException("File [" + file.getAbsolutePath() + "] doesn't exist"); } boolean originalPeriodic = Nd4j.getMemoryManager().isPeriodicGcActive(); if (originalPeriodic) { Nd4j.getMemoryManager().togglePeriodicGc(false); } Nd4j.getMemoryManager().setOccasionalGcFrequency(50000); try { return readWord2Vec(file, extendedModel); } catch (Exception readSequenceVectors) { try { return extendedModel ? readAsExtendedModel(file) : readAsSimplifiedModel(file); } catch (Exception loadFromFileException) { try { return readAsCsv(file); } catch (Exception readCsvException) { try { return readAsBinary(file); } catch (Exception readBinaryException) { try { return readAsBinaryNoLineBreaks(file); } catch (Exception readModelException) { log.error("Unable to guess input file format", readModelException); throw new RuntimeException("Unable to guess input file format. Please use corresponding loader directly"); } } } } } }
Example #17
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static Word2Vec readAsBinaryNoLineBreaks(@NonNull File file) { try (InputStream inputStream = fileStream(file)) { return readAsBinaryNoLineBreaks(inputStream); } catch (IOException readCsvException) { throw new RuntimeException(readCsvException); } }
Example #18
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * Write the tsne format * * @param vec the word vectors to use for labeling * @param tsne the tsne array to write * @param csv the file to use * @throws Exception */ public static void writeTsneFormat(Word2Vec vec, INDArray tsne, File csv) throws Exception { try (BufferedWriter write = new BufferedWriter(new OutputStreamWriter(new FileOutputStream(csv), StandardCharsets.UTF_8))) { int words = 0; InMemoryLookupCache l = (InMemoryLookupCache) vec.vocab(); for (String word : vec.vocab().words()) { if (word == null) { continue; } StringBuilder sb = new StringBuilder(); INDArray wordVector = tsne.getRow(l.wordFor(word).getIndex()); for (int j = 0; j < wordVector.length(); j++) { sb.append(wordVector.getDouble(j)); if (j < wordVector.length() - 1) { sb.append(","); } } sb.append(","); sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT)); sb.append(" "); sb.append("\n"); write.write(sb.toString()); } log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize()); } }
Example #19
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static Word2Vec readAsBinary(@NonNull File file) { try (InputStream inputStream = fileStream(file)) { return readAsBinary(inputStream); } catch (IOException readCsvException) { throw new RuntimeException(readCsvException); } }
Example #20
Source File: Word2VecDataSetIteratorTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * Basically all we want from this test - being able to finish without exceptions. */ @Test public void testIterator1() throws Exception { File inputFile = Resources.asFile("big/raw_sentences.txt"); SentenceIterator iter = ParagraphVectorsTest.getIterator(isIntegrationTests(), inputFile); // SentenceIterator iter = new BasicLineIterator(inputFile.getAbsolutePath()); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec vec = new Word2Vec.Builder().minWordFrequency(10) // we make sure we'll have some missing words .iterations(1).learningRate(0.025).layerSize(150).seed(42).sampling(0).negativeSample(0) .useHierarchicSoftmax(true).windowSize(5).modelUtils(new BasicModelUtils<VocabWord>()) .useAdaGrad(false).iterate(iter).workers(8).tokenizerFactory(t) .elementsLearningAlgorithm(new CBOW<VocabWord>()).build(); vec.fit(); List<String> labels = new ArrayList<>(); labels.add("positive"); labels.add("negative"); Word2VecDataSetIterator iterator = new Word2VecDataSetIterator(vec, getLASI(iter, labels), labels, 1); INDArray array = iterator.next().getFeatures(); int count = 0; while (iterator.hasNext()) { DataSet ds = iterator.next(); assertArrayEquals(array.shape(), ds.getFeatures().shape()); if(!isIntegrationTests() && count++ > 20) break; //raw_sentences.txt is 2.81 MB, takes quite some time to process. We'll only first 20 minibatches when doing unit tests } }
Example #21
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static Word2Vec readAsCsv(@NonNull File file) { try (InputStream inputStream = fileStream(file)) { return readAsCsv(inputStream); } catch (IOException readCsvException) { throw new RuntimeException(readCsvException); } }
Example #22
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test(timeout = 300000) public void testJSONSerialization() { ParagraphVectors paragraphVectors = new ParagraphVectors.Builder().build(); AbstractCache<VocabWord> cache = new AbstractCache.Builder<VocabWord>().build(); val words = new VocabWord[3]; words[0] = new VocabWord(1.0, "word"); words[1] = new VocabWord(2.0, "test"); words[2] = new VocabWord(3.0, "tester"); for (int i = 0; i < words.length; ++i) { cache.addToken(words[i]); cache.addWordToIndex(i, words[i].getLabel()); } paragraphVectors.setVocab(cache); String json = null; Word2Vec unserialized = null; try { json = paragraphVectors.toJson(); log.info("{}", json.toString()); unserialized = ParagraphVectors.fromJson(json); } catch (Exception e) { log.error("",e); fail(); } assertEquals(cache.totalWordOccurrences(), ((ParagraphVectors) unserialized).getVocab().totalWordOccurrences()); assertEquals(cache.totalNumberOfDocs(), ((ParagraphVectors) unserialized).getVocab().totalNumberOfDocs()); for (int i = 0; i < words.length; ++i) { val cached = cache.wordAtIndex(i); val restored = ((ParagraphVectors) unserialized).getVocab().wordAtIndex(i); assertNotNull(cached); assertEquals(cached, restored); } }
Example #23
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method loads Word2Vec model from csv file * * @param inputStream input stream * @return Word2Vec model */ public static Word2Vec readAsCsv(@NonNull InputStream inputStream) { VectorsConfiguration configuration = new VectorsConfiguration(); // let's try to load this file as csv file try { log.debug("Trying CSV model restoration..."); Pair<InMemoryLookupTable, VocabCache> pair = loadTxt(inputStream); Word2Vec.Builder builder = new Word2Vec .Builder() .lookupTable(pair.getFirst()) .useAdaGrad(false) .vocabCache(pair.getSecond()) .layerSize(pair.getFirst().layerSize()) // we don't use hs here, because model is incomplete .useHierarchicSoftmax(false) .resetModel(false); TokenizerFactory factory = getTokenizerFactory(configuration); if (factory != null) { builder.tokenizerFactory(factory); } return builder.build(); } catch (Exception ex) { throw new RuntimeException("Unable to load model in CSV format"); } }
Example #24
Source File: Word2VecDataFetcher.java From deeplearning4j with Apache License 2.0 | 5 votes |
public Word2VecDataFetcher(String path, Word2Vec vec, List<String> labels) { if (vec == null || labels == null || labels.isEmpty()) throw new IllegalArgumentException( "Unable to initialize due to missing argument or empty label applyTransformToDestination"); this.vec = vec; this.labels = labels; this.path = path; }
Example #25
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method saves Word2Vec model to output stream * * @param word2Vec Word2Vec * @param stream OutputStream */ public static void writeWord2Vec(@NonNull Word2Vec word2Vec, @NonNull OutputStream stream) throws IOException { SequenceVectors<VocabWord> vectors = new SequenceVectors.Builder<VocabWord>(word2Vec.getConfiguration()) .layerSize(word2Vec.getLayerSize()).build(); vectors.setVocab(word2Vec.getVocab()); vectors.setLookupTable(word2Vec.getLookupTable()); vectors.setModelUtils(word2Vec.getModelUtils()); writeSequenceVectors(vectors, stream); }
Example #26
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * This method loads Word2Vec model from file * * @param file File * @param readExtendedTables boolean * @return Word2Vec */ public static Word2Vec readWord2Vec(@NonNull File file, boolean readExtendedTables) { try (InputStream inputStream = fileStream(file)) { return readWord2Vec(inputStream, readExtendedTables); } catch (Exception readSequenceVectors) { throw new RuntimeException(readSequenceVectors); } }
Example #27
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Test @Ignore //AB 2020/02/06 - https://github.com/eclipse/deeplearning4j/issues/8677 public void testDirectInference() throws Exception { boolean isIntegration = isIntegrationTests(); File resource = Resources.asFile("/big/raw_sentences.txt"); SentenceIterator sentencesIter = getIterator(isIntegration, resource); ClassPathResource resource_mixed = new ClassPathResource("paravec/"); File local_resource_mixed = testDir.newFolder(); resource_mixed.copyDirectory(local_resource_mixed); SentenceIterator iter = new AggregatingSentenceIterator.Builder() .addSentenceIterator(sentencesIter) .addSentenceIterator(new FileSentenceIterator(local_resource_mixed)).build(); TokenizerFactory t = new DefaultTokenizerFactory(); t.setTokenPreProcessor(new CommonPreprocessor()); Word2Vec wordVectors = new Word2Vec.Builder().minWordFrequency(1).batchSize(250).iterations(1).epochs(1) .learningRate(0.025).layerSize(150).minLearningRate(0.001) .elementsLearningAlgorithm(new SkipGram<VocabWord>()).useHierarchicSoftmax(true).windowSize(5) .iterate(iter).tokenizerFactory(t).build(); wordVectors.fit(); ParagraphVectors pv = new ParagraphVectors.Builder().tokenizerFactory(t).iterations(10) .useHierarchicSoftmax(true).trainWordVectors(true).useExistingWordVectors(wordVectors) .negativeSample(0).sequenceLearningAlgorithm(new DM<VocabWord>()).build(); INDArray vec1 = pv.inferVector("This text is pretty awesome"); INDArray vec2 = pv.inferVector("Fantastic process of crazy things happening inside just for history purposes"); log.info("vec1/vec2: {}", Transforms.cosineSim(vec1, vec2)); }
Example #28
Source File: WordConverter.java From deeplearning4j with Apache License 2.0 | 5 votes |
public static INDArray toInputMatrix(List<Window> windows, Word2Vec vec) { int columns = vec.lookupTable().layerSize() * vec.getWindow(); int rows = windows.size(); INDArray ret = Nd4j.create(rows, columns); for (int i = 0; i < rows; i++) { ret.putRow(i, WindowConverter.asExampleMatrix(windows.get(i), vec)); } return ret; }
Example #29
Source File: InMemoryLookupTable.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * @param word * @return */ @Override public INDArray vector(String word) { if (word == null) return null; int idx = vocab.indexOf(word); if (idx < 0) { idx = vocab.indexOf(Word2Vec.DEFAULT_UNK); if (idx < 0) return null; } return syn0.getRow(idx, true); }
Example #30
Source File: WordVectorSerializer.java From deeplearning4j with Apache License 2.0 | 5 votes |
/** * Writes the word vectors to the given BufferedWriter. Note that this assumes an in memory cache. * BufferedWriter can be writer to local file, or hdfs file, or any compatible to java target. * * @param vec the word2vec to write * @param writer - BufferedWriter, where all data should be written to * the path to write * @deprecated Use {@link #writeWord2Vec(Word2Vec, OutputStream)} */ @Deprecated public static void writeWordVectors(@NonNull Word2Vec vec, @NonNull BufferedWriter writer) throws IOException { int words = 0; String str = vec.getVocab().numWords() + " " + vec.getLayerSize() + " " + vec.getVocab().totalNumberOfDocs(); log.debug("Saving header: {}", str); writer.write(str + "\n"); for (String word : vec.vocab().words()) { if (word == null) { continue; } StringBuilder sb = new StringBuilder(); sb.append(word.replaceAll(" ", WHITESPACE_REPLACEMENT)); sb.append(" "); INDArray wordVector = vec.getWordVectorMatrix(word); for (int j = 0; j < wordVector.length(); j++) { sb.append(wordVector.getDouble(j)); if (j < wordVector.length() - 1) { sb.append(" "); } } sb.append("\n"); writer.write(sb.toString()); words++; } try { writer.flush(); } catch (Exception e) { } log.info("Wrote " + words + " with size of " + vec.lookupTable().layerSize()); }