edu.stanford.nlp.process.TokenizerFactory Java Examples

Source File: Main.java From dependensee with GNU General Public License v2.0

6 votes

/**
 * Tokenizes and parses {@code sentence} with the bundled English PCFG model and
 * passes the resulting parse tree to the tree-based {@code writeImage} overload.
 *
 * <p>Loading the model is best-effort: if it fails, the problem (including the
 * underlying cause) is reported on {@code System.err} and the method returns
 * without producing any output.
 *
 * @param sentence the raw text to tokenize and parse
 * @param outFile  forwarded unchanged to {@code writeImage(tree, outFile, scale)}
 * @param scale    forwarded unchanged to {@code writeImage(tree, outFile, scale)}
 * @throws Exception if tokenizing, parsing or the delegated call fails
 */
public static void writeImage(String sentence, String outFile, int scale) throws Exception {

    LexicalizedParser lp;
    try {
        lp = LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    } catch (Exception e) {
        System.err.println("Could not load file englishPCFG.ser.gz. Try placing this file in the same directory as Dependencee.jar");
        // Report the real failure too; the hint above is only a guess at the cause.
        System.err.println("Cause: " + e);
        return;
    }

    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree tree = lp.apply(wordList);
    writeImage(tree, outFile, scale);

}

Source File: MainTest.java From dependensee with GNU General Public License v2.0

6 votes

/**
 * Test of writeImage method, of class Main.
 *
 * <p>Parses a fixed sentence, derives its collapsed typed dependencies, asks
 * {@code Main.writeImage} to render them to {@code image.png}, and verifies
 * that the file exists afterwards.
 */
@Test
public void testWriteImage() throws Exception {
    String text = "A quick brown fox jumped over the lazy dog.";

    // Remove any leftover from a previous run so the existence check below
    // proves that THIS run produced the file.
    File image = new File("image.png");
    if (image.exists() && !image.delete()) {
        throw new AssertionError("Could not remove stale image.png before the test");
    }

    TreebankLanguagePack tlp = new PennTreebankLanguagePack();
    GrammaticalStructureFactory gsf = tlp.grammaticalStructureFactory();
    LexicalizedParser lp = LexicalizedParser.loadModel();
    lp.setOptionFlags(new String[]{"-maxLength", "500", "-retainTmpSubcategories"});
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(text)).tokenize();
    Tree tree = lp.apply(wordList);
    GrammaticalStructure gs = gsf.newGrammaticalStructure(tree);
    Collection<TypedDependency> tdl = gs.typedDependenciesCollapsed();
    Main.writeImage(tdl, "image.png", 3);

    // The Java `assert` keyword is skipped unless the JVM runs with -ea, so
    // fail explicitly instead of relying on it.
    if (!image.exists()) {
        throw new AssertionError("Main.writeImage did not create image.png");
    }
}

Source File: ClausIE.java From ambiverse-nlu with Apache License 2.0

5 votes

/**
 * Creates an instance from already-built parsing components.
 *
 * <p>Delegates to the {@code Options}-taking constructor with a fresh
 * {@code new Options()}, then stores the supplied parser, tokenizer factory
 * and parser query in the corresponding fields.
 *
 * @param lp               parser stored in {@code this.lp}
 * @param tokenizerFactory tokenizer factory stored in {@code this.tokenizerFactory}
 * @param lpq              parser query stored in {@code this.lpq}
 * @throws IOException        declared by this constructor; presumably raised by the delegated constructor
 * @throws URISyntaxException declared by this constructor; presumably raised by the delegated constructor
 */
public ClausIE(LexicalizedParser lp, TokenizerFactory<CoreLabel> tokenizerFactory, LexicalizedParserQuery lpq)
    throws IOException, URISyntaxException {
  this(new Options());
  // Replace whatever the delegated constructor set up with the caller's components.
  this.lpq = lpq;
  this.tokenizerFactory = tokenizerFactory;
  this.lp = lp;
}

Source File: StanfordLexicalDemo.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

/**
 * Demonstrates two ways of feeding the lexicalized parser: first with a
 * pre-tokenized word array, then with a raw sentence run through a PTB
 * tokenizer. Prints the parse tree and the typed dependencies.
 *
 * @param args unused
 */
public static void main(String[] args) {
    LexicalizedParser lexicalizedParser =
            LexicalizedParser.loadModel(getResourcePath() + "englishPCFG.ser.gz");

    // Variant 1: the caller supplies the tokens directly.
    String[] tokens = {"The", "cow", "jumped", "over", "the", "moon", "."};
    Tree parseTree = lexicalizedParser.apply(SentenceUtils.toCoreLabelList(tokens));
    parseTree.pennPrint();
    new TreePrint("typedDependenciesCollapsed").printTree(parseTree);

    // Variant 2: tokenize a raw sentence with an explicit tokenizer.
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer =
            tokenizerFactory.getTokenizer(new StringReader("The cow jumped over the moon."));
    parseTree = lexicalizedParser.apply(tokenizer.tokenize());

    // Turn the parse tree into typed dependencies.
    GrammaticalStructure structure = lexicalizedParser
            .treebankLanguagePack()
            .grammaticalStructureFactory()
            .newGrammaticalStructure(parseTree);
    List<TypedDependency> dependencies = structure.typedDependenciesCCprocessed();
    System.out.println(dependencies);

    for (TypedDependency dependency : dependencies) {
        Object governor = dependency.gov();
        String relation = dependency.reln().getLongName();
        Object dependent = dependency.dep();
        System.out.println("Governor Word: [" + governor
                + "] Relation: [" + relation
                + "] Dependent Word: [" + dependent + "]");
    }
}

Source File: CoreNLPTokenAnnotator.java From modernmt with Apache License 2.0

5 votes

/**
 * Builds an annotator for {@code language} from the tokenizer factory
 * registered for it in {@code FACTORIES}.
 *
 * @param language the language to look up
 * @return a new annotator wrapping the registered factory
 * @throws UnsupportedLanguageException if {@code FACTORIES} has no entry for {@code language}
 */
public static CoreNLPTokenAnnotator forLanguage(Language language) throws UnsupportedLanguageException {
    TokenizerFactory<?> tokenizerFactory = FACTORIES.get(language);
    if (tokenizerFactory == null) {
        throw new UnsupportedLanguageException(language);
    }

    // English gets dedicated tokenizer options; matching is done on the language code only.
    boolean isEnglish = Language.ENGLISH.getLanguage().equals(language.getLanguage());
    if (isEnglish) {
        tokenizerFactory.setOptions("ptb3Escaping=false,asciiQuotes=true,normalizeSpace=false");
    }

    return new CoreNLPTokenAnnotator(tokenizerFactory);
}

Source File: Main.java From dependensee with GNU General Public License v2.0

5 votes

/**
 * Loads the English PCFG model, parses {@code sentence} and builds a graph
 * from the parse tree together with its typed dependencies.
 *
 * @param sentence the raw text to tokenize and parse
 * @return the result of {@code getGraph(tree, dependencies)}
 * @throws Exception if loading the model, parsing or graph construction fails
 */
public static Graph getGraph(String sentence) throws Exception {
    LexicalizedParser lp =
            LexicalizedParser.loadModel("edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");
    String[] parserFlags = {"-maxLength", "500", "-retainTmpSubcategories"};
    lp.setOptionFlags(parserFlags);

    // Tokenize the raw text with default PTB tokenizer options.
    StringReader input = new StringReader(sentence);
    List<CoreLabel> tokens =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "").getTokenizer(input).tokenize();

    Tree parse = lp.apply(tokens);
    // gsf is not declared in this method; it must come from the enclosing class.
    Collection<TypedDependency> dependencies = gsf.newGrammaticalStructure(parse).typedDependencies();
    return getGraph(parse, dependencies);
}

Source File: Main.java From dependensee with GNU General Public License v2.0

5 votes

/**
 * Parses {@code sentence} with the supplied parser and builds a graph from
 * the parse tree together with its typed dependencies.
 *
 * @param sentence the raw text to tokenize and parse
 * @param lp       the parser applied to the tokenized sentence
 * @return the result of {@code getGraph(tree, dependencies)}
 * @throws Exception if parsing or graph construction fails
 */
public static Graph getGraph(String sentence, LexicalizedParser lp) throws Exception {
    // Tokenize the raw text with default PTB tokenizer options.
    StringReader input = new StringReader(sentence);
    List<CoreLabel> tokens =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "").getTokenizer(input).tokenize();

    Tree parse = lp.apply(tokens);
    // gsf is not declared in this method; it must come from the enclosing class.
    Collection<TypedDependency> dependencies = gsf.newGrammaticalStructure(parse).typedDependencies();
    return getGraph(parse, dependencies);
}

Source File: Main.java From dependensee with GNU General Public License v2.0

5 votes

/**
 * Tokenizes and parses {@code sentence} with the supplied parser and passes
 * the parse tree to the {@code writeImage(tree, outFile)} overload.
 *
 * @param sentence the raw text to tokenize and parse
 * @param outFile  forwarded unchanged to {@code writeImage(tree, outFile)}
 * @param lp       the parser applied to the tokenized sentence
 * @throws Exception if tokenizing, parsing or the delegated call fails
 */
public static void writeImage(String sentence, String outFile, LexicalizedParser lp) throws Exception {
    // No try/catch here: the previous handler only rethrew the same exception,
    // so letting it propagate directly is equivalent.
    TokenizerFactory<CoreLabel> tokenizerFactory =
            PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    List<CoreLabel> wordList = tokenizerFactory.getTokenizer(new StringReader(sentence)).tokenize();
    Tree parse = lp.apply(wordList);
    writeImage(parse, outFile);
}

Source File: Chapter7.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

4 votes

/**
 * Demonstrates the Stanford lexicalized parser in two ways: parsing a list of
 * already-tokenized words, and parsing a raw sentence through an explicit
 * PTB tokenizer followed by typed-dependency extraction. All results are
 * printed to standard output.
 */
private static void usingStanfordLexicalizedParser() {
    LexicalizedParser lexicalizedParser = LexicalizedParser.loadModel(
            "C:/Current Books in Progress/NLP and Java/Models/edu/stanford/nlp/models/lexparser/englishPCFG.ser.gz");

    // Option 1: the words are already correctly tokenized.
    System.out.println("---First option");
    String[] sentenceTokens = {"The", "cow", "jumped", "over", "the", "moon", "."};
    Tree parseTree = lexicalizedParser.apply(Sentence.toCoreLabelList(sentenceTokens));
    parseTree.pennPrint();
    System.out.println();

    // Option 2: tokenize a raw sentence with an explicitly created tokenizer.
    System.out.println("---Second option");
    TokenizerFactory<CoreLabel> tokenizerFactory
            = PTBTokenizer.factory(new CoreLabelTokenFactory(), "");
    Tokenizer<CoreLabel> tokenizer
            = tokenizerFactory.getTokenizer(new StringReader("The cow jumped over the moon."));
    parseTree = lexicalizedParser.apply(tokenizer.tokenize());

    // The language pack comes from the parser itself (PennTreebankLanguagePack for English).
    GrammaticalStructure structure = lexicalizedParser
            .treebankLanguagePack()
            .grammaticalStructureFactory()
            .newGrammaticalStructure(parseTree);
    List<TypedDependency> dependencies = structure.typedDependenciesCCprocessed();
    System.out.println(dependencies);
    for (TypedDependency dependency : dependencies) {
        Object governor = dependency.gov();
        String relation = dependency.reln().getLongName();
        Object dependent = dependency.dep();
        System.out.println("Governor Word: [" + governor + "] Relation: [" + relation
                + "] Dependent Word: [" + dependent + "]");
    }
    System.out.println();

    // You can also use a TreePrint object to print trees and dependencies
//    System.out.println("---Using TreePrint");
//    TreePrint treePrint = new TreePrint("penn,typedDependenciesCollapsed");
//    treePrint.printTree(parseTree);
//    System.out.println("TreePrint Formats");
//    for (String format : TreePrint.outputTreeFormats) {
//        System.out.println(format);
//    }
//    System.out.println();
}

Source File: CoreNLPTokenAnnotator.java From modernmt with Apache License 2.0

4 votes

/**
 * Creates an annotator around the given tokenizer factory.
 *
 * @param factory the tokenizer factory stored in {@code this.factory}
 */
private CoreNLPTokenAnnotator(TokenizerFactory<?> factory) {
    this.factory = factory;
}

Source File: CoreNLPPreprocessor.java From phrasal with GNU General Public License v3.0

4 votes

/**
 * Creates a preprocessor without a compound splitter by delegating to the
 * two-argument constructor with {@code null} as the second argument.
 *
 * @param tf the tokenizer factory passed on to the two-argument constructor
 */
public CoreNLPPreprocessor(TokenizerFactory<CoreLabel> tf) {
  this(tf, null);
}

Source File: CoreNLPPreprocessor.java From phrasal with GNU General Public License v3.0

4 votes

/**
 * Creates a preprocessor from a tokenizer factory and a compound splitter.
 *
 * @param tf               the tokenizer factory stored in {@code this.tf}
 * @param compoundSplitter the splitter stored in {@code this.compoundSplitter};
 *                         the single-argument constructor passes {@code null} here
 */
public CoreNLPPreprocessor(TokenizerFactory<CoreLabel> tf, MosesCompoundSplitter compoundSplitter) {
  this.compoundSplitter = compoundSplitter;
  this.tf = tf;
}

Source File: CoreNLPPreprocessor.java From phrasal with GNU General Public License v3.0

2 votes

/**
 * Get the underlying tokenizer factory.
 *
 * @return the tokenizer factory held in the {@code tf} field
 */
public TokenizerFactory<CoreLabel> getTokenizerFactory() { return tf; }

edu.stanford.nlp.process.TokenizerFactory Java Examples