edu.stanford.nlp.process.DocumentPreprocessor Java Examples
The following examples show how to use
edu.stanford.nlp.process.DocumentPreprocessor.
The source project, source file, and license for each example are noted above its code.
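As a quick orientation before the examples, here is a minimal, self-contained sketch of the core API: build a DocumentPreprocessor over a Reader (plain text by default) and iterate over it sentence by sentence. The sample text is a placeholder, and the class name DocumentPreprocessorSketch is ours, not from any of the projects below.

import edu.stanford.nlp.ling.HasWord;
import edu.stanford.nlp.process.DocumentPreprocessor;

import java.io.StringReader;
import java.util.List;

public class DocumentPreprocessorSketch {
    public static void main(String[] args) {
        // Plain-text mode: sentence boundaries are detected automatically
        String text = "This is one sentence. This is another.";
        DocumentPreprocessor dp = new DocumentPreprocessor(new StringReader(text));
        for (List<HasWord> sentence : dp) {
            // Each sentence is delivered as a list of tokens
            System.out.println(sentence);
        }
    }
}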
Example #1
Source File: XMLProcessingDemo.java (from Natural-Language-Processing-with-Java-Second-Edition, MIT License)
public static void main(String[] args) {
    try {
        Reader reader = new FileReader(getResourcePath());
        // Treat the input as XML; emit one sentence per <sentence> element
        DocumentPreprocessor dp = new DocumentPreprocessor(reader, DocumentPreprocessor.DocType.XML);
        dp.setElementDelimiter("sentence");
        for (List<HasWord> sentence : dp) {
            ListIterator<HasWord> list = sentence.listIterator();
            while (list.hasNext()) {
                System.out.print(list.next() + " ");
            }
            System.out.println();
        }
    } catch (FileNotFoundException ex) {
        Logger.getLogger(XMLProcessingDemo.class.getName()).log(Level.SEVERE, null, ex);
    }
}
Example #2
Source File: DocumentFrequencyCounter.java (from wiseowl, MIT License)
/**
 * Get an IDF map for the given document string.
 *
 * @param document the document text
 * @return a counter of document frequencies for the nouns seen
 */
private static Counter<String> getIDFMapForDocument(String document) {
    // Clean up -- remove some Gigaword patterns that slow things down
    // and don't help anything
    document = headingSeparator.matcher(document).replaceAll("");

    DocumentPreprocessor preprocessor = new DocumentPreprocessor(new StringReader(document));
    preprocessor.setTokenizerFactory(tokenizerFactory);

    Counter<String> idfMap = new ClassicCounter<String>();
    for (List<HasWord> sentence : preprocessor) {
        if (sentence.size() > MAX_SENTENCE_LENGTH)
            continue;

        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        for (TaggedWord w : tagged) {
            if (w.tag().startsWith("n"))
                idfMap.incrementCount(w.word());
        }
    }
    return idfMap;
}
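Note that this snippet depends on several members defined elsewhere in DocumentFrequencyCounter: the headingSeparator pattern, the tokenizerFactory, the tagger, and the MAX_SENTENCE_LENGTH limit.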
Example #3
Source File: TokenizerDemo.java (from blog-codes, Apache License 2.0)
public static void main(String[] args) throws IOException {
    for (String arg : args) {
        // option #1: By sentence.
        DocumentPreprocessor dp = new DocumentPreprocessor(arg);
        for (List<HasWord> sentence : dp) {
            System.out.println(sentence);
        }
        // option #2: By token
        PTBTokenizer<CoreLabel> ptbt = new PTBTokenizer<>(
                new FileReader(arg), new CoreLabelTokenFactory(), "");
        while (ptbt.hasNext()) {
            CoreLabel label = ptbt.next();
            System.out.println(label);
        }
    }
}
Example #4
Source File: Chapter3.java (from Natural-Language-Processing-with-Java-Second-Edition, MIT License)
private static void usingStanfordDocumentPreprocessor() {
    // option #1: By sentence.
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    for (List<HasWord> sentence : dp) {
        System.out.println(sentence);
    }

    // try {
    //     Reader reader = new FileReader("XMLText.xml");
    //     DocumentPreprocessor dp = new DocumentPreprocessor(
    //             reader, DocumentPreprocessor.DocType.XML);
    //     dp.setElementDelimiter("sentence");
    //     for (List sentence : dp) {
    //         System.out.println(sentence);
    //     }
    // } catch (FileNotFoundException ex) {
    //     // Handle exception
    // }

    // option #2: By token
    // PTBTokenizer ptbt = new PTBTokenizer(reader,
    //         new CoreLabelTokenFactory(), "");
    // CoreLabel label;
    // while (ptbt.hasNext()) {
    //     System.out.println(ptbt.next());
    // }
}
Example #5
Source File: CoreNLP.java (from Criteria2Query, Apache License 2.0)
public List<String> splitParagraph(String paragraph) {
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor dp = new DocumentPreprocessor(reader);
    List<String> sentenceList = new ArrayList<String>();
    for (List<HasWord> sentence : dp) {
        // Join the tokens of each detected sentence back into a string
        String sentenceString = SentenceUtils.listToString(sentence);
        sentenceList.add(sentenceString);
    }
    return sentenceList;
}
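A hypothetical call site for splitParagraph might look like the following; it assumes the CoreNLP class above (the Criteria2Query class, not Stanford's StanfordCoreNLP pipeline class) has an accessible no-argument constructor, which the snippet does not show.

CoreNLP coreNLP = new CoreNLP();
List<String> sentences = coreNLP.splitParagraph("Patients must be 18 or older. Patients must not be pregnant.");
for (String sentence : sentences) {
    System.out.println(sentence);
}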
Example #6
Source File: ParseTree.java (from NLIDB, Apache License 2.0)
/**
 * Construct a parse tree using the Stanford NLP parser. Only one sentence.
 * Here we are omitting the information of dependency labels (tags).
 * @param text input text.
 */
public ParseTree(String text, NLParser parser) {
    // pre-processing the input text
    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    List<HasWord> sentence = null;
    for (List<HasWord> sentenceHasWord : tokenizer) {
        sentence = sentenceHasWord;
        break;
    }
    // part-of-speech tagging
    List<TaggedWord> tagged = parser.tagger.tagSentence(sentence);
    // dependency syntax parsing
    GrammaticalStructure gs = parser.parser.predict(tagged);

    // Reading the parsed sentence into ParseTree
    int N = sentence.size() + 1;
    Node[] nodes = new Node[N];
    root = new Node(0, "ROOT", "ROOT");
    nodes[0] = root;
    for (int i = 0; i < N - 1; i++) {
        nodes[i + 1] = new Node(i + 1, sentence.get(i).word(), tagged.get(i).tag());
    }
    for (TypedDependency typedDep : gs.allTypedDependencies()) {
        int from = typedDep.gov().index();
        int to = typedDep.dep().index();
        // String label = typedDep.reln().getShortName(); // omitting the label
        nodes[to].parent = nodes[from];
        nodes[from].children.add(nodes[to]);
    }
}
Example #7
Source File: ParserDemo.java (from NLIDB, Apache License 2.0)
public static void main(String[] args) {
    String modelPath = DependencyParser.DEFAULT_MODEL;
    String taggerPath = "edu/stanford/nlp/models/pos-tagger/english-left3words/english-left3words-distsim.tagger";

    for (int argIndex = 0; argIndex < args.length;) {
        switch (args[argIndex]) {
            case "-tagger":
                taggerPath = args[argIndex + 1];
                argIndex += 2;
                break;
            case "-com.dukenlidb.nlidb.model":
                modelPath = args[argIndex + 1];
                argIndex += 2;
                break;
            default:
                throw new RuntimeException("Unknown argument " + args[argIndex]);
        }
    }

    String text = "Return authors who have more papers than Bob in VLDB after 2000";

    MaxentTagger tagger = new MaxentTagger(taggerPath);
    DependencyParser parser = DependencyParser.loadFromModelFile(modelPath);

    DocumentPreprocessor tokenizer = new DocumentPreprocessor(new StringReader(text));
    for (List<HasWord> sentence : tokenizer) {
        List<TaggedWord> tagged = tagger.tagSentence(sentence);
        GrammaticalStructure gs = parser.predict(tagged);
        // Print typed dependencies
        log.info(gs);
    }
}
Example #8
Source File: Chapter2.java (from Natural-Language-Processing-with-Java-Second-Edition, MIT License)
private static void usingTheStanfordTokenizer() {
    // Using PTBTokenizer
    System.out.println("----PTBTokenizer Example");

    // First example
    // PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph),
    //         new CoreLabelTokenFactory(), null);
    // while (ptb.hasNext()) {
    //     System.out.println(ptb.next());
    // }

    // CoreLabel example
    CoreLabelTokenFactory ctf = new CoreLabelTokenFactory();
    PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph), ctf, "invertible=true");
    // PTBTokenizer ptb = new PTBTokenizer(new StringReader(paragraph),
    //         new WordTokenFactory(), null);
    while (ptb.hasNext()) {
        CoreLabel cl = (CoreLabel) ptb.next();
        System.out.println(cl.originalText() + " ("
                + cl.beginPosition() + "-" + cl.endPosition() + ")");
    }

    // Using a DocumentPreprocessor
    System.out.println("----DocumentPreprocessor Example");
    Reader reader = new StringReader(paragraph);
    DocumentPreprocessor documentPreprocessor = new DocumentPreprocessor(reader);
    Iterator<List<HasWord>> it = documentPreprocessor.iterator();
    while (it.hasNext()) {
        List<HasWord> sentence = it.next();
        for (HasWord token : sentence) {
            System.out.println(token);
        }
    }
    // for (List<HasWord> sentence : documentPreprocessor) {
    //     for (HasWord token : sentence) {
    //         System.out.println(token);
    //     }
    // }

    // Using a pipeline
    System.out.println("----pipeline Example");
    Properties properties = new Properties();
    properties.put("annotators", "tokenize, ssplit");
    StanfordCoreNLP pipeline = new StanfordCoreNLP(properties);
    Annotation annotation = new Annotation(paragraph);
    pipeline.annotate(annotation);
    pipeline.prettyPrint(annotation, System.out);
}
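This last example contrasts three ways of doing the same job: tokenizing directly with PTBTokenizer, splitting sentences with DocumentPreprocessor, and running a StanfordCoreNLP pipeline restricted to the tokenize and ssplit annotators.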