edu.stanford.nlp.process.Tokenizer Java Examples
The following examples show how to use edu.stanford.nlp.process.Tokenizer.
Each example notes the project it comes from, the source file, and the license.
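Before the project-specific examples, here is a minimal, self-contained sketch of the basic usage pattern they all build on: obtain a Tokenizer from a TokenizerFactory and call tokenize() on a Reader over the input text. The class and package names follow the Stanford CoreNLP API used below; the wrapper class and sample sentence are only illustrative.

import edu.stanford.nlp.ling.CoreLabel;
import edu.stanford.nlp.process.CoreLabelTokenFactory;
import edu.stanford.nlp.process.PTBTokenizer;
import edu.stanford.nlp.process.Tokenizer;
import edu.stanford.nlp.process.TokenizerFactory;

import java.io.StringReader;
import java.util.List;

public class TokenizerBasics {
    public static void main(String[] args) {
        // A PTBTokenizer factory that produces CoreLabel tokens; the empty
        // string means default tokenizer options.
        TokenizerFactory<CoreLabel> factory =
                PTBTokenizer.factory(new CoreLabelTokenFactory(), "");

        // Tokenizers read from a Reader, so wrap the input String.
        Tokenizer<CoreLabel> tokenizer =
                factory.getTokenizer(new StringReader("The cow jumped over the moon."));

        // tokenize() consumes the whole input and returns the token list.
        List<CoreLabel> tokens = tokenizer.tokenize();
        for (CoreLabel token : tokens) {
            System.out.println(token.word());
        }
    }
}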
Example #1
Source File: TypeClassifier.java From winter with Apache License 2.0
/**
 * Initializes the tokenizer to detect date columns.
 */
public void initialize() {
    Properties props = new Properties();

    pipeline.addAnnotator(new TokenizerAnnotator(false) {
        @Override
        public Tokenizer<CoreLabel> getTokenizer(Reader r) {
            // Use a Penn Treebank tokenizer that produces CoreLabel tokens
            return new PTBTokenizer<CoreLabel>(r, new CoreLabelTokenFactory(), "");
        }
    });
    pipeline.addAnnotator(new WordsToSentencesAnnotator(false));
    pipeline.addAnnotator(new POSTaggerAnnotator(false));
    pipeline.addAnnotator(new TimeAnnotator("sutime", props));
}
Example #2
Source File: StanfordParser.java From OpenEphyra with GNU General Public License v2.0
/**
 * Parses a sentence and returns the PCFG score as a confidence measure.
 *
 * @param sentence a sentence
 * @return PCFG score
 */
@SuppressWarnings("unchecked")
public static double getPCFGScore(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce PCFG score
    log.debug("Parsing sentence");
    double score;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        score = parser.getPCFGScore();
    }
    return score;
}
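This method relies on static tlp and parser fields that StanfordParser sets up elsewhere. As a rough sketch only, the initialization could look like the following, assuming parser is a LexicalizedParser loaded from a serialized model and tlp is taken from it, as in the other examples on this page; OpenEphyra's actual initialization code and model path may differ.

// Illustrative initialization only; the real StanfordParser class may differ.
private static LexicalizedParser parser;
private static TreebankLanguagePack tlp;

public static void initialize(String modelPath) {
    // e.g. a serialized englishPCFG.ser.gz model, as used in Examples #3 and #6
    parser = LexicalizedParser.loadModel(modelPath);
    // The language pack supplies the tokenizer factory used by getPCFGScore() and parse()
    tlp = parser.treebankLanguagePack();
}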
Example #3
Source File: StanfordLexicalDemo.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
public static void main(String args[]) {
    String parseModel = getResourcePath() + "englishPCFG.ser.gz";
    LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parseModel);

    // Parse a pre-tokenized sentence
    String[] sentenceArray = {"The", "cow", "jumped", "over", "the", "moon", "."};
    List<CoreLabel> words = SentenceUtils.toCoreLabelList(sentenceArray);
    Tree parseTree = lexicalizedParser.apply(words);
    parseTree.pennPrint();

    TreePrint treePrint = new TreePrint("typedDependenciesCollapsed");
    treePrint.printTree(parseTree);

    // Tokenize a raw sentence with a PTBTokenizer, then parse the token list
    String sentence = "The cow jumped over the moon.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer =
            tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> wordList = tokenizer.tokenize();
    parseTree = lexicalizedParser.apply(wordList);

    // Extract typed dependencies from the parse tree
    TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    for (TypedDependency dependency : tdl) {
        System.out.println("Governor Word: [" + dependency.gov()
                + "] Relation: [" + dependency.reln().getLongName()
                + "] Dependent Word: [" + dependency.dep() + "]");
    }
}
Example #4
Source File: CoreNLPPreprocessor.java From phrasal with GNU General Public License v3.0
@Override
public Sequence<IString> process(String input) {
    String tokenizerInput = toUncased(input.trim());
    Tokenizer<CoreLabel> tokenizer = tf.getTokenizer(new StringReader(tokenizerInput));
    List<String> outputStrings = new ArrayList<>();
    while (tokenizer.hasNext()) {
        String string = tokenizer.next().get(TextAnnotation.class);
        outputStrings.add(string);
    }
    Sequence<IString> rv = IStrings.toIStringSequence(outputStrings);
    if (compoundSplitter != null)
        rv = compoundSplitter.process(rv);
    return rv;
}
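The process() method above assumes a tokenizer factory field tf that the class configures elsewhere. A minimal sketch of one way such a field could be created, reusing the PTBTokenizer factory pattern from the other examples on this page; the field name, modifiers, and empty options string are assumptions, and phrasal's real CoreNLPPreprocessor likely passes language- and task-specific options.

// Hypothetical field setup; the real CoreNLPPreprocessor configures its factory differently.
private final TokenizerFactory<CoreLabel> tf =
        PTBTokenizer.factory(new CoreLabelTokenFactory(), "");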
Example #5
Source File: StanfordParser.java From OpenEphyra with GNU General Public License v2.0
/**
 * Parses a sentence and returns a string representation of the parse tree.
 *
 * @param sentence a sentence
 * @return Tree whose Label is a MapLabel containing correct begin and end
 *         character offsets in keys BEGIN_KEY and END_KEY
 */
@SuppressWarnings("unchecked")
public static String parse(String sentence) {
    if (tlp == null || parser == null)
        throw new RuntimeException("Parser has not been initialized");

    // parse the sentence to produce stanford Tree
    log.debug("Parsing sentence");
    Tree tree = null;
    synchronized (parser) {
        Tokenizer tokenizer = tlp.getTokenizerFactory().getTokenizer(
                new StringReader(sentence));
        List<Word> words = tokenizer.tokenize();
        log.debug("Tokenization: " + words);
        parser.parse(new Sentence(words));
        tree = parser.getBestParse();
    }

    // label tree with character extents
    // log.debug("Setting character extents");
    // updateTreeLabels(tree, tree, new MutableInteger(), new MutableInteger(-1));
    // log.debug("Creating offset mapping");
    // List<RangeMap> mapping = createMapping(sentence);
    // log.debug(mapping.toString());
    // log.debug("Applying offset mapping");
    // mapOffsets(tree, mapping);

    return tree.toString().replaceAll(" \\[[\\S]+\\]", "");
}
Example #6
Source File: Chapter7.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingStanfordLexicalizedParser() {
    String parserModel = "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz";
    LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(parserModel);

    // This option shows parsing a list of correctly tokenized words
    System.out.println("---First option");
    String[] sentenceArray = {"The", "cow", "jumped", "over", "the", "moon", "."};
    List<CoreLabel> words = Sentence.toCoreLabelList(sentenceArray);
    Tree parseTree = lexicalizedParser.apply(words);
    parseTree.pennPrint();
    System.out.println();

    // This option shows loading and using an explicit tokenizer
    System.out.println("---Second option");
    String sentence = "The cow jumped over the moon.";
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer =
            tokenizerFactory.getTokenizer(new StringReader(sentence));
    List<CoreLabel> wordList = tokenizer.tokenize();
    parseTree = lexicalizedParser.apply(wordList);

    TreebankLanguagePack tlp = lexicalizedParser.treebankLanguagePack(); // PennTreebankLanguagePack for English
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    GrammaticalStructure gs = gsf.newGrammaticalStructure(parseTree);
    List<TypedDependency> tdl = gs.typedDependenciesCCprocessed();
    System.out.println(tdl);
    for (TypedDependency dependency : tdl) {
        System.out.println("Governor Word: [" + dependency.gov()
                + "] Relation: [" + dependency.reln().getLongName()
                + "] Dependent Word: [" + dependency.dep() + "]");
    }
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
    // System.out.println("---Using TreePrint");
    // TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
    // treePrint.printTree(parseTree);
    // System.out.println("TreePrint Formats");
    // for (String format : TreePrint.outputTreeFormats) {
    //     System.out.println(format);
    // }
    // System.out.println();
}