edu.stanford.nlp.process.Tokenizer Java Examples
The following examples show how to use edu.stanford.nlp.process.Tokenizer.
Each example notes the project it comes from, the source file, and the license.
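Before the project-specific examples, here is a minimal, self-contained sketch of the basic usage pattern they all build on: obtain a Tokenizer from a TokenizerFactory and call tokenize() on a Reader over the input text. The class and package names follow the Stanford CoreNLP API used below; the wrapper class and sample sentence are only illustrative.

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

import java.io.StringReader;
import java.util.List;

public class TokenizerBasics {
    public static void main(String[] args) {
        // A PTBTokenizer factory that produces CoreLabel tokens; the empty
        // string means default tokenizer options.
        TokenizerFactory<CoreLabel> factory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

        // Tokenizers read from a Reader, so wrap the input String.
        Tokenizer<CoreLabel> tokenizer =
                factory.getTokenizer(new StringReader("The cow jumped over the moon."));

        // tokenize() consumes the whole input and returns the token list.
        List<CoreLabel> tokens = tokenizer.tokenize();
        for (CoreLabel token : tokens) {
            System.out.println(token.word());
        }
    }
}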
Example #1
Source File: TypeClassifier.java From winter with Apache License 2.0
/**
 * Initializes the tokenizer to detect date columns.
 */
public void initialize() {
    Properties props = new Properties();

    pipeline.addAnnotator(new TokenizerAnnotator(false) {
        @Override
        public Tokenizer<CoreLabel> getTokenizer(Reader r) {
            // Use a Penn Treebank tokenizer that produces CoreLabel tokens
            return new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), "");
        }
    });
    pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
    pipeline.addAnnotator(new POSTaggerAnnotator(false));
    pipeline.addAnnotator(new TimeAnnotator("sutime", props));
}
Example #2
Source File: StanfordParser.java From OpenEphyra with GNU General Public License v2.0
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 *
 * @param sentence a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }
    return score;
}
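This method relies on static tlp and parser fields that StanfordParser sets up elsewhere. As a rough sketch only, the initialization could look like the following, assuming parser is a LexicalizedParser loaded from a serialized model and tlp is taken from it, as in the other examples on this page; OpenEphyra's actual initialization code and model path may differ.

// Illustrative initialization only; the real StanfordParser class may differ.
private static LexicalizedParser parser;
private static TreebankLanguagePack tlp;

public static void initialize(String modelPath) {
    // e.g. a serialized englishPCFG.ser.gz model, as used in Examples #3 and #6
    parser = LexicalizedParser.loadModel(modelPath);
    // The language pack supplies the tokenizer factory used by getPCFGScore() and parse()
    tlp = parser.treebankLanguagePack();
}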
Example #3
Source File: StanfordLexicalDemo.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
public static void main(String args[]) {
    String parseModel = getResourcePath() + "englishPCFG.ser.gz";
    LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parseModel);

    // Parse a pre-tokenized sentence
    String[] sentenceArray = {"The", "cow", "jumped", "over", "the", "moon", "."};
    List<CoreLabel> words = SentenceUtils.toCoreLabelList(sentenceArray);
    Tree parseTree = lexicalizedParser.apply(words);
    parseTree.pennPrint();

    TreePrint treePrint = new TreePrint("typedDependenciesCollapsed");
    treePrint.printTree(parseTree);

    // Tokenize a raw sentence with a PTBTokenizer, then parse the token list
    String sentence = "The cow jumped over the moon.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer =
            tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> wordList = tokenizer.tokenize();
    parseTree = lexicalizedParser.apply(wordList);

    // Extract typed dependencies from the parse tree
    TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    for (TypedDependency dependency : tdl) {
        System.out.println("Governor Word: [" + dependency.gov()
                + "] Relation: [" + dependency.reln().getLongName()
                + "] Dependent Word: [" + dependency.dep() + "]");
    }
}
Example #4
Source File: CoreNLPPreprocessor.java From phrasal with GNU General Public License v3.0
@Override
public Sequence<IString> process(String input) {
    String tokenizerInput = toUncased(input.trim());
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(tokenizerInput));
    List<String> outputStrings = new ArrayList<>();
    while (tokenizer.hasNext()) {
        String string = tokenizer.next().get(TextAnnotation.class);
        outputStrings.add(string);
    }
    Sequence<IString> rv = IStrings.toIStringSequence(outputStrings);
    if (compoundSplitter != null)
        rv = compoundSplitter.process(rv);
    return rv;
}
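The process() method above assumes a tokenizer factory field tf that the class configures elsewhere. A minimal sketch of one way such a field could be created, reusing the PTBTokenizer factory pattern from the other examples on this page; the field name, modifiers, and empty options string are assumptions, and phrasal's real CoreNLPPreprocessor likely passes language- and task-specific options.

// Hypothetical field setup; the real CoreNLPPreprocessor configures its factory differently.
private final TokenizerFactory<CoreLabel> tf =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");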
Example #5
Source File: StanfordParser.java From OpenEphyra with GNU General Public License v2.0
/**
 * Parses a sentence and returns a string representation of the parse tree.
 *
 * @param sentence a sentence
 * @return Tree whose Label is a MapLabel containing correct begin and end
 *         character offsets in keys BEGIN_KEY and END_KEY
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree = null;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }

    // label tree with character extents
    // log.debug("Setting character extents");
    // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1));
    // log.debug("Creating offset mapping");
    // List<RangeMap> mapping = createMapping(sentence);
    // log.debug(mapping.toString());
    // log.debug("Applying offset mapping");
    // mapOffsets(tree, mapping);

    return tree.toString().replaceAll(" \\[[\\S]+\\]", "");
}
Example #6
Source File: Chapter7.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordLexicalizedParser() {
    String parserModel = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);

    // This option shows parsing a list of correctly tokenized words
    System.out.println("---First option");
    String[] sentenceArray = {"The", "cow", "jumped", "over", "the", "moon", "."};
    List<CoreLabel> words = Sentence.toCoreLabelList(sentenceArray);
    Tree parseTree = lexicalizedParser.apply(words);
    parseTree.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    System.out.println("---Second option");
    String sentence = "The cow jumped over the moon.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer =
            tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> wordList = tokenizer.tokenize();
    parseTree = lexicalizedParser.apply(wordList);

    TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); // PennTreebankLanguagePack for English
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    for (TypedDependency dependency : tdl) {
        System.out.println("Governor Word: [" + dependency.gov()
                + "] Relation: [" + dependency.reln().getLongName()
                + "] Dependent Word: [" + dependency.dep() + "]");
    }
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
    // System.out.println("---Using TreePrint");
    // TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
    // treePrint.printTree(parseTree);
    // System.out.println("TreePrint Formats");
    // for (String format : TreePrint.outputTreeFormats) {
    //     System.out.println(format);
    // }
    // System.out.println();
}