opennlp.tools.tokenize.Tokenizer Java Examples
The following examples show how to use opennlp.tools.tokenize.Tokenizer. Each example is taken from an open-source project; the project, source file, and license are noted above the code, along with the number of community votes the example received.
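Before the project examples, here is a minimal usage sketch (not taken from any of the projects below) showing the two most common ways to obtain a Tokenizer: the rule-based SimpleTokenizer singleton, which needs no model, and a maximum-entropy TokenizerME loaded from a trained model. The model path "en-token.bin" is an assumption; point it at your own copy of the OpenNLP English token model.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.SimpleTokenizer;
import opennlp.tools.tokenize.Tokenizer;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class TokenizerQuickStart {
    public static void main(String[] args) throws IOException {
        String text = "Mr. Smith paid $2.45 for an ale in Boston.";

        // Rule-based tokenizer: no model required.
        Tokenizer simple = SimpleTokenizer.INSTANCE;
        System.out.println(String.join("|", simple.tokenize(text)));

        // Statistical tokenizer: requires a trained model such as en-token.bin (path assumed).
        try (InputStream modelIn = new FileInputStream("en-token.bin")) {
            TokenizerModel model = new TokenizerModel(modelIn);
            Tokenizer me = new TokenizerME(model);
            System.out.println(String.join("|", me.tokenize(text)));
        }
    }
}

Both tokenizers return a String[] of tokens; TokenizerME additionally offers tokenizePos(String), which returns Span objects with each token's character offsets, a method several of the examples below rely on.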
Example #1
Source File: Chapter1.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
private static void nameFinderExample() {
    try {
        String[] sentences = {
            "Tim was a good neighbor. Perhaps not as good a Bob "
            + "Haywood, but still pretty good. Of course Mr. Adam "
            + "took the cake!"};
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
        TokenNameFinderModel model = new TokenNameFinderModel(new File(
            "C:\\OpenNLP Models", "en-ner-person.bin"));
        NameFinderME finder = new NameFinderME(model);

        for (String sentence : sentences) {
            // Split the sentence into tokens
            String[] tokens = tokenizer.tokenize(sentence);

            // Find the names in the tokens and return Span objects
            Span[] nameSpans = finder.find(tokens);

            // Print the names extracted from the tokens using the Span data
            System.out.println(Arrays.toString(
                Span.spansToStrings(nameSpans, tokens)));
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #2
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
private static void trainingATokenizer() {
    createOpenNLPModel();
    try {
        paragraph = "A demonstration of how to train a tokenizer.";
        InputStream modelInputStream = new FileInputStream(new File(
            ".", "mymodel.bin"));
        TokenizerModel model = new TokenizerModel(modelInputStream);
        Tokenizer tokenizer = new TokenizerME(model);
        String tokens[] = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #3
Source File: OpenNLPAnnotator.java From Stargraph with MIT License | 6 votes |
@Override
public List<Word> doRun(Language language, String sentence) {
    Tokenizer tokenizer = new TokenizerME(getTokenizerModel(language));
    POSTaggerME tagger = new POSTaggerME(getPOSModel(language));

    String[] tokens = tokenizer.tokenize(sentence);
    String[] tags = tagger.tag(tokens);

    PartOfSpeechSet posSet = PartOfSpeechSet.getPOSSet(language);

    List<Word> words = new ArrayList<>();
    for (int i = 0; i < tokens.length; i++) {
        words.add(new Word(posSet.valueOf(tags[i]), tokens[i]));
    }

    return words;
}
Example #4
Source File: JM_Scorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 6 votes |
public double scoreStructure(String ca, String q, String passage, boolean verbose)
        throws InvalidFormatException, IOException {
    POSTaggerME parserModel = new POSTaggerME(new POSModel(
        new FileInputStream(new File("en-pos-model.bin"))));
    Tokenizer tokenizer = new TokenizerME(new TokenizerModel(
        new FileInputStream(new File("en-token.bin"))));
    Parser parser = ParserFactory.create(new ParserModel(
        new FileInputStream(new File("en-parser.bin"))));

    double score = 0;
    // Parse both the question and the passage so their structures can be compared
    Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
    Parse[] passageParse = ParserTool.parseLine(passage, parser, 1);

    if (passage.contains(ca)) {
        for (int i = 0; i < questionParse.length; i++) {
            score += matchChildren(questionParse[i], passageParse[i]);
        }
    }
    return score;
}
Example #5
Source File: Chapter4.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
private static void usingMultipleNERModels() {
    // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin
    // en-ner-organization.bin en-ner-time.bin
    try {
        InputStream tokenStream = new FileInputStream(
            new File(getModelDir(), "en-token.bin"));
        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        String modelNames[] = {"en-ner-person.bin",
            "en-ner-location.bin", "en-ner-organization.bin"};
        ArrayList<String> list = new ArrayList();
        for (String name : modelNames) {
            TokenNameFinderModel entityModel = new TokenNameFinderModel(
                new FileInputStream(new File(getModelDir(), name)));
            NameFinderME nameFinder = new NameFinderME(entityModel);
            for (int index = 0; index < sentences.length; index++) {
                String tokens[] = tokenizer.tokenize(sentences[index]);
                Span nameSpans[] = nameFinder.find(tokens);
                for (Span span : nameSpans) {
                    list.add("Sentence: " + index
                        + " Span: " + span.toString()
                        + " Entity: " + tokens[span.getStart()]);
                }
            }
        }
        System.out.println("Multiple Entities");
        for (String element : list) {
            System.out.println(element);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
Example #6
Source File: Chapter1.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
private static void lingpipeExamples() {
    List<String> tokenList = new ArrayList<>();
    List<String> whiteList = new ArrayList<>();
    String text = "A sample sentence processed \nby \tthe "
        + "LingPipe tokenizer.";
    com.aliasi.tokenizer.Tokenizer tokenizer =
        IndoEuropeanTokenizerFactory.INSTANCE.tokenizer(
            text.toCharArray(), 0, text.length());
    tokenizer.tokenize(tokenList, whiteList);

    for (String element : tokenList) {
        System.out.print(element + " ");
    }
    System.out.println();
}
Example #7
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
private static void usingTheTokenizerMEClass() {
    try {
        InputStream modelIn = new FileInputStream(new File(
            getModelDir(), "en-token.bin"));
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);

        String tokens[] = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #8
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
private static void usingLingPipeTokenizers() {
    // String paragraph = "sample text string";
    char text[] = paragraph.toCharArray();
    TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
    com.aliasi.tokenizer.Tokenizer tokenizer = tokenizerFactory.tokenizer(
        text, 0, text.length);
    for (String token : tokenizer) {
        System.out.println(token);
    }
}
Example #9
Source File: BasicActions.java From knowledge-extraction with Apache License 2.0 | 5 votes |
public String[] testTokenizer() {
    String[] tokens = {};
    try (InputStream modelIn = BasicActions.class.getClassLoader()
            .getResourceAsStream(Consts.EN_TOKEN_MODEL)) {
        TokenizerModel tokenModel = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        tokens = tokenizer.tokenize(TEST_PHRASE);
        System.out.println(Arrays.toString(tokens));
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
Example #10
Source File: NERScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException {
    if (!modelsAreInitialized) init(); // initialize
    SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
    NameFinderME nameFinder = new NameFinderME(this.nerModel);
    Parser parser = ParserFactory.create(
        this.parserModel,
        20,    // beam size
        0.95); // advance percentage

    // find sentences, tokenize each, parse each, return top parse for each
    String[] sentences = sentenceDetector.sentDetect(p);
    Parse[] results = new Parse[sentences.length];
    for (int i = 0; i < sentences.length; i++) {
        //String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]);
        //StringTokenizer st = new StringTokenizer(tks[i]);
        //There are several tokenizers available. SimpleTokenizer works best
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
        for (int si = 0; si < sentences.length; si++) {
            Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]);
            String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]);
            Span[] names = nameFinder.find(tokens);
            for (int ni = 0; ni < names.length; ni++) {
                Span startSpan = tokenSpans[names[ni].getStart()];
                int nameStart = startSpan.getStart();
                Span endSpan = tokenSpans[names[ni].getEnd() - 1];
                int nameEnd = endSpan.getEnd();
                String name = sentences[si].substring(nameStart, nameEnd);
                System.out.println(name);
            }
        }
        // Rebuild the current sentence from its tokens before handing it to the parser
        String sent = StringUtils.join(tokenizer.tokenize(sentences[i]), " ");
        System.out.println("Found sentence " + sent);
        Parse[] sentResults = ParserTool.parseLine(sent, parser, 1);
        results[i] = sentResults[0];
    }
    return results;
}
Example #11
Source File: OpenNlpModule.java From SciGraph with Apache License 2.0 | 5 votes |
@CheckedProvides(TokenizerProvider.class)
Tokenizer getTokenizer() throws IOException {
    try (InputStream is = getClass().getResourceAsStream("/opennlp/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        return new TokenizerME(model);
    }
}
Example #12
Source File: NERDemo.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 4 votes |
public static void main(String args[]) {
    String sentences[] = {"Joe was the last person to see Fred. ",
        "He saw him in Boston at McKenzie's pub at 3:00 where he "
        + " paid $2.45 for an ale. ",
        "Joe wanted to go to Vermont for the day to visit a cousin who "
        + "works at IBM, but Sally and he had to look for Fred"};
    String sentence = "He was the last person to see Fred.";
    try {
        InputStream tokenStream = new FileInputStream(
            new File(getResourcePath() + "en-token.bin"));
        InputStream modelStream = new FileInputStream(
            new File(getResourcePath() + "en-ner-person.bin"));

        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        TokenNameFinderModel entityModel = new TokenNameFinderModel(modelStream);
        NameFinderME nameFinder = new NameFinderME(entityModel);

        String tokens1[] = tokenizer.tokenize(sentence);
        Span nameSpans1[] = nameFinder.find(tokens1);
        for (int i = 0; i < nameSpans1.length; i++) {
            System.out.println("Span: " + nameSpans1[i].toString());
            System.out.println("Entity: " + tokens1[nameSpans1[i].getStart()]);
        }

        System.out.println("---------- Multiple Sentences -----------");
        for (String sentence1 : sentences) {
            String tokens[] = tokenizer.tokenize(sentence1);
            Span nameSpans[] = nameFinder.find(tokens);
            for (int i = 0; i < nameSpans.length; i++) {
                System.out.println("Span: " + nameSpans[i].toString());
                System.out.println("Entity: " + tokens[nameSpans[i].getStart()]);
            }
            System.out.println();
        }
    } catch (Exception e) {
        System.out.println(e);
    }
}
Example #13
Source File: OpenNlpModule.java From SciGraph with Apache License 2.0 | 4 votes |
@Override Tokenizer get() throws IOException;
Example #14
Source File: Model.java From DataDefender with Apache License 2.0 | 4 votes |
public Model(final Tokenizer tokenizer, final NameFinderME nameFinder, final String name) {
    this.name = name;
    this.tokenizer = tokenizer;
    this.nameFinder = nameFinder;
}
Example #15
Source File: Model.java From DataDefender with Apache License 2.0 | 4 votes |
public Tokenizer getTokenizer() {
    return this.tokenizer;
}