org.deeplearning4j.text.documentiterator.LabelledDocument Java Examples
The following examples show how to use
org.deeplearning4j.text.documentiterator.LabelledDocument.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage in the sidebar.
Example #1
Source File: BasicTransformerIterator.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Pulls the next document from the backing iterator and converts it into a
 * {@code Sequence<VocabWord>} via the sentence transformer.
 *
 * @return the transformed sequence with one sequence label per non-empty document
 *         label; an empty sequence if the iterator yields no document or the
 *         document has no content
 */
@Override
public Sequence<VocabWord> next() {
    LabelledDocument document = iterator.nextDocument();

    // Guard: exhausted iterator or content-less document -> empty sequence,
    // so callers never see null.
    if (document == null || document.getContent() == null) {
        return new Sequence<>();
    }

    Sequence<VocabWord> sequence = sentenceTransformer.transformToSequence(document.getContent());

    // Attach each non-empty label as a sequence label.
    if (document.getLabels() != null) {
        for (String label : document.getLabels()) {
            if (label != null && !label.isEmpty()) {
                sequence.addSequenceLabel(new VocabWord(1.0, label));
            }
        }
    }
    return sequence;
}
Example #2
Source File: LabelAwareConverter.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Adapts the label-aware document iterator to the sentence-pair interface:
 * returns (document content, first label).
 *
 * NOTE(review): there is no guard here — a null document from
 * {@code nextDocument()} or an empty label list would throw; confirm the
 * backing iterator guarantees both are always present.
 */
@Override
public Pair<String, String> nextSentence() {
    LabelledDocument document = backingIterator.nextDocument();

    // TODO: probably worth to allow more than one label? i.e. pass same document twice, sequentially
    return Pair.makePair(document.getContent(), document.getLabels().get(0));
}
Example #3
Source File: TfidfVectorizerTest.java From deeplearning4j with Apache License 2.0 | 5 votes |
/**
 * Verifies that TfidfVectorizer driven by a LabelAwareIterator over two labelled
 * documents vectorizes text and registers labels in insertion order.
 */
public void testTfIdfVectorizerFromLabelAwareIterator() throws Exception {
    // Two single-sentence documents, one per label.
    LabelledDocument dogDoc = new LabelledDocument();
    dogDoc.addLabel("dog");
    dogDoc.setContent("it barks like a dog");

    LabelledDocument catDoc = new LabelledDocument();
    catDoc.addLabel("cat");
    catDoc.setContent("it meows like a cat");

    List<LabelledDocument> documents = new ArrayList<>(2);
    documents.add(dogDoc);
    documents.add(catDoc);

    LabelAwareIterator labelIterator = new SimpleLabelAwareIterator(documents);
    TokenizerFactory tokenizer = new DefaultTokenizerFactory();

    TfidfVectorizer vectorizer = new TfidfVectorizer.Builder()
            .setMinWordFrequency(1)
            .setStopWords(new ArrayList<String>())
            .setTokenizerFactory(tokenizer)
            .setIterator(labelIterator)
            .allowParallelTokenization(false)
            .build();

    vectorizer.fit();

    DataSet dataset = vectorizer.vectorize("it meows like a cat", "cat");
    assertNotNull(dataset);

    // Labels must be tracked in the order the documents were seen.
    LabelsSource labelsSource = vectorizer.getLabelsSource();
    assertEquals(2, labelsSource.getNumberOfLabelsUsed());
    List<String> registeredLabels = labelsSource.getLabels();
    assertEquals("dog", registeredLabels.get(0));
    assertEquals("cat", registeredLabels.get(1));
}
Example #4
Source File: DocumentSequenceConvertFunction.java From deeplearning4j with Apache License 2.0 | 5 votes |
@Override public Sequence<VocabWord> call(LabelledDocument document) throws Exception { Sequence<VocabWord> sequence = new Sequence<>(); // get elements if (document.getReferencedContent() != null && !document.getReferencedContent().isEmpty()) { sequence.addElements(document.getReferencedContent()); } else { if (tokenizerFactory == null) instantiateTokenizerFactory(); List<String> tokens = tokenizerFactory.create(document.getContent()).getTokens(); for (String token : tokens) { if (token == null || token.isEmpty()) continue; VocabWord word = new VocabWord(1.0, token); sequence.addElement(word); } } // get labels for (String label : document.getLabels()) { if (label == null || label.isEmpty()) continue; VocabWord labelElement = new VocabWord(1.0, label); labelElement.markAsLabel(true); sequence.addSequenceLabel(labelElement); } return sequence; }
Example #5
Source File: ParagraphVectorsClassifierExample.java From Java-for-Data-Science with MIT License | 4 votes |
// Trains a ParagraphVectors classifier on labelled documents, then scores each
// unlabeled document against the learned label vectors.
// NOTE(review): iter, tFact and pVect are assigned without local declarations,
// so they are presumably static fields of the enclosing class — confirm.
public static void main(String[] args) throws Exception {
    // Labelled training corpus: sub-folders under paravec/labeled.
    ClassPathResource resource = new ClassPathResource("paravec/labeled");
    iter = new FileLabelAwareIterator.Builder()
            .addSourceFolder(resource.getFile())
            .build();

    tFact = new DefaultTokenizerFactory();
    tFact.setTokenPreProcessor(new CommonPreprocessor());

    // Train ParagraphVectors (with word vectors) over the labelled corpus.
    pVect = new ParagraphVectors.Builder()
            .learningRate(0.025)
            .minLearningRate(0.001)
            .batchSize(1000)
            .epochs(20)
            .iterate(iter)
            .trainWordVectors(true)
            .tokenizerFactory(tFact)
            .build();
    pVect.fit();

    // Unlabeled documents to classify.
    ClassPathResource unlabeledText = new ClassPathResource("paravec/unlabeled");
    FileLabelAwareIterator unlabeledIter = new FileLabelAwareIterator.Builder()
            .addSourceFolder(unlabeledText.getFile())
            .build();

    // Centroid builder and label scorer share the trained lookup table.
    MeansBuilder mBuilder = new MeansBuilder(
            (InMemoryLookupTable<VocabWord>) pVect.getLookupTable(), tFact);
    LabelSeeker lSeeker = new LabelSeeker(iter.getLabelsSource().getLabels(),
            (InMemoryLookupTable<VocabWord>) pVect.getLookupTable());

    while (unlabeledIter.hasNextDocument()) {
        LabelledDocument doc = unlabeledIter.nextDocument();
        // Document vector built from its words, then scored against each label.
        INDArray docCentroid = mBuilder.documentAsVector(doc);
        List<Pair<String, Double>> scores = lSeeker.getScores(docCentroid);
        out.println("Document '" + doc.getLabel() + "' falls into the following categories: ");
        for (Pair<String, Double> score : scores) {
            out.println(" " + score.getFirst() + ": " + score.getSecond());
        }
    }
}
Example #6
Source File: ParallelTransformerIterator.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Pairs a single document with the transformer that will convert it, for
 * deferred execution.
 *
 * @param document    the document to be transformed
 * @param transformer the sentence transformer to apply
 */
public CallableTransformer(LabelledDocument document, SentenceTransformer transformer) {
    this.document = document;
    this.transformer = transformer;
}
Example #7
Source File: SentenceIteratorConverter.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * {@code Iterator#next()} view of this converter: simply delegates to
 * {@link #nextDocument()}.
 */
@Override
public LabelledDocument next() {
    LabelledDocument document = nextDocument();
    return document;
}
Example #8
Source File: ParagraphVectorsTest.java From deeplearning4j with Apache License 2.0 | 4 votes |
/**
 * Checks that FileLabelAwareIterator and BasicLineIterator each produce the
 * same counts on every pass after reset(): 30 documents / 30 labels / 0
 * referenced words for the labelled folder, and 97162 sentences for the
 * raw-sentences file, across ten full passes.
 */
@Test
public void testIterator() throws IOException {
    val folder_labeled = testDir.newFolder();
    val folder_unlabeled = testDir.newFolder();
    new ClassPathResource("/paravec/labeled/").copyDirectory(folder_labeled);
    new ClassPathResource("/paravec/unlabeled/").copyDirectory(folder_unlabeled);

    FileLabelAwareIterator labelAwareIterator = new FileLabelAwareIterator.Builder()
            .addSourceFolder(folder_labeled).build();

    File resource_sentences = Resources.asFile("/big/raw_sentences.txt");
    SentenceIterator iter = new BasicLineIterator(resource_sentences);

    int i = 0;
    // Ten passes: reset() must restore both iterators to a full pass each time.
    for (; i < 10; ++i) {
        int j = 0;
        int labels = 0;
        int words = 0;
        while (labelAwareIterator.hasNextDocument()) {
            ++j;
            LabelledDocument document = labelAwareIterator.nextDocument();
            labels += document.getLabels().size();
            List<VocabWord> lst = document.getReferencedContent();
            if (!CollectionUtils.isEmpty(lst))
                words += lst.size();
        }
        labelAwareIterator.reset();
        //System.out.println(words + " " + labels + " " + j);
        // File-backed documents carry no pre-tokenized referenced content,
        // hence zero words.
        assertEquals(0, words);
        assertEquals(30, labels);
        assertEquals(30, j);

        j = 0;
        while (iter.hasNext()) {
            ++j;
            iter.nextSentence();
        }
        assertEquals(97162, j);
        iter.reset();
    }
}
Example #9
Source File: SparkParagraphVectors.java From deeplearning4j with Apache License 2.0 | 3 votes |
/**
 * This method builds ParagraphVectors model, expecting {@code JavaRDD<LabelledDocument>}.
 * It can be either non-tokenized documents, or tokenized.
 *
 * @param documentsRdd RDD of labelled documents to fit the model on
 */
public void fitLabelledDocuments(JavaRDD<LabelledDocument> documentsRdd) {
    validateConfiguration();
    broadcastEnvironment(new JavaSparkContext(documentsRdd.context()));

    // Convert each document into a Sequence<VocabWord> on the executors,
    // then delegate to the shared sequence-fitting path.
    JavaRDD<Sequence<VocabWord>> sequenceRDD =
            documentsRdd.map(new DocumentSequenceConvertFunction(configurationBroadcast));

    super.fitSequences(sequenceRDD);
}