opennlp.tools.tokenize.SimpleTokenizer#INSTANCE

Source File: Chapter1.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

6 votes

/**
 * Runs the name finder loaded from {@code en-ner-person.bin} over a fixed
 * sample text and prints, for each entry of the sample array, the array of
 * name strings recovered from the detected spans.
 *
 * An {@link IOException} raised while loading the model is reported with
 * {@code printStackTrace()} and otherwise ignored.
 */
private static void nameFinderExample() {
    final String[] sentences = {
        "Tim was a good neighbor. Perhaps not as good a Bob "
        + "Haywood, but still pretty good. Of course Mr. Adam "
        + "took the cake!"};
    try {
        Tokenizer tokenizer = SimpleTokenizer.INSTANCE;
        File modelFile = new File("C:\\OpenNLP Models", "en-ner-person.bin");
        NameFinderME finder = new NameFinderME(new TokenNameFinderModel(modelFile));

        for (int idx = 0; idx < sentences.length; idx++) {
            // Tokenize first: the finder works on token arrays, not raw text.
            String[] tokens = tokenizer.tokenize(sentences[idx]);

            // Each Span indexes into the token array.
            Span[] nameSpans = finder.find(tokens);

            // Turn the spans back into text and print them as one array.
            String[] names = Span.spansToStrings(nameSpans, tokens);
            System.out.println(Arrays.toString(names));
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}

Source File: OpenNlpTartarus.java From scava with Eclipse Public License 2.0

6 votes

/**
 * Sets up the logger, the PoS tagger, the tokenizer, the sentence detector
 * and the English stemmer.
 *
 * Models are loaded through this class's class loader from
 * {@code models/en-pos-maxent.bin} and {@code models/en-sent.bin}. If loading
 * throws an {@link IOException} it is logged and printed, and construction
 * continues; fields not yet assigned at that point are left unset.
 */
public OpenNlpTartarus() {

		logger = (OssmeterLogger) OssmeterLogger.getLogger("uk.ac.nactem.posstemmer");

		final ClassLoader loader = getClass().getClassLoader();
		try {
			posTaggerME = loadPoSME(loader, "models/en-pos-maxent.bin");
			simpleTokenizer = SimpleTokenizer.INSTANCE;
			sentenceDetector = new SentenceDetectorME(
					loadSentenceModel(loader, "models/en-sent.bin"));
			logger.info("Models have been sucessfully loaded");
		} catch (IOException loadFailure) {
			logger.error("Error while loading the model:", loadFailure);
			loadFailure.printStackTrace();
		}

		// A model-based tokenizer (models/en-token.bin via TokenizerME) was
		// previously sketched here; SimpleTokenizer is used instead.

		stemmer = new englishStemmer();
	}

Source File: LemmetizerUnitTest.java From tutorials with MIT License

6 votes

/**
 * Tokenizes a sample sentence, tags it with the POS model from
 * {@code /models/en-pos-maxent.bin}, lemmatizes the tokens with the dictionary
 * from {@code /models/en-lemmatizer.dict}, and checks the resulting lemmas.
 *
 * @throws Exception if a model or dictionary cannot be read
 */
@Test
public void givenEnglishDictionary_whenLemmatize_thenLemmasAreDetected() throws Exception {

    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

    // try-with-resources: both classpath streams are closed even when model
    // loading or the assertion fails (the original never closed them).
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin");
         InputStream dictLemmatizer = getClass().getResourceAsStream("/models/en-lemmatizer.dict")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);

        DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
        String[] lemmas = lemmatizer.lemmatize(tokens, tags);

        assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
    }
}

Source File: NamedEntityRecognitionUnitTest.java From tutorials with MIT License

6 votes

/**
 * Tokenizes a short passage, runs the person name finder loaded from
 * {@code /models/en-ner-person.bin}, and checks both the reported spans and
 * the names rebuilt from the tokens each span covers.
 *
 * @throws Exception if the model cannot be read
 */
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {

    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");

    // try-with-resources: the model stream is closed on every path
    // (the original left it open).
    try (InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin")) {
        TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
        NameFinderME nameFinderME = new NameFinderME(model);
        List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
        assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");

        List<String> names = new ArrayList<>();
        for (Span s : spans) {
            // Concatenate the tokens covered by the span (no separator, as before).
            StringBuilder name = new StringBuilder();
            for (int index = s.getStart(); index < s.getEnd(); index++) {
                name.append(tokens[index]);
            }
            names.add(name.toString());
        }
        assertThat(names).contains("John","Leonard","Penny");
    }
}

Source File: ChunkerUnitTest.java From tutorials with MIT License

6 votes

/**
 * Tokenizes a sample sentence, POS-tags it with the model from
 * {@code /models/en-pos-maxent.bin}, chunks it with the model read from
 * {@code src/main/resources/models/en-chunker.bin}, and checks the chunk tags.
 *
 * @throws Exception if either model cannot be read
 */
@Test
public void givenChunkerModel_whenChunk_thenChunksAreDetected() throws Exception {

    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");

    // Each stream is scoped to a try-with-resources block so it is closed
    // even if loading fails (the original closed neither stream).
    String[] tags;
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        tags = posTagger.tag(tokens);
    }

    String[] chunks;
    try (InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin")) {
        ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
        ChunkerME chunker = new ChunkerME(chunkerModel);
        chunks = chunker.chunk(tokens, tags);
    }

    assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
}

Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

/**
 * Prints a banner line, then tokenizes {@code paragraph} with
 * {@link SimpleTokenizer} and prints every token on its own line.
 */
private static void usingTheSimpleTokenizerClass() {
    System.out.println("--- SimpleTokenizer");
    final String[] pieces = SimpleTokenizer.INSTANCE.tokenize(paragraph);
    for (int idx = 0; idx < pieces.length; idx++) {
        System.out.println(pieces[idx]);
    }
}

Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

/**
 * Builds a {@code StopWords} instance from {@code stopwords.txt}, replaces
 * {@code paragraph} with a fixed sample sentence, tokenizes it, prints the
 * tokens returned by {@code removeStopWords}, and finally asks the
 * {@code StopWords} instance to display its stop words.
 */
private static void usingStopWordsClassExample() {
    final StopWords stopWords = new StopWords("stopwords.txt");
    // Note: this overwrites the shared paragraph field.
    paragraph = "A simple approach is to create a class "
            + "to hold and remove stopwords.";
    final String[] allTokens = SimpleTokenizer.INSTANCE.tokenize(paragraph);
    final String[] kept = stopWords.removeStopWords(allTokens);
    for (int idx = 0; idx < kept.length; idx++) {
        System.out.println(kept[idx]);
    }
    stopWords.displayStopWords();
}

Source File: Chapter7.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

/**
 * Reads the file {@code PresidentList} line by line and builds one
 * {@code President} per well-formed line.
 *
 * For each line, every token before the first {@code "("} token is appended
 * to the name (each followed by a single space). The token right after
 * {@code "("} is the start year and the token three positions after
 * {@code "("} is the end year; an end of {@code "present"} (case-insensitive)
 * is replaced by the start year.
 *
 * Lines that do not contain {@code "("} followed by at least three more
 * tokens, or whose years are not integers, are reported on
 * {@code System.err} and skipped instead of aborting the whole load.
 *
 * @return the presidents parsed so far; empty if the file cannot be read
 */
private static List<President> createPresidentList() {
    List<President> list = new ArrayList<>();
    // The tokenizer is a shared singleton; no need to fetch it per line.
    SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
    try (FileReader reader = new FileReader("PresidentList");
            BufferedReader br = new BufferedReader(reader)) {
        String line;
        while ((line = br.readLine()) != null) {
            String[] tokens = simpleTokenizer.tokenize(line);
            StringBuilder name = new StringBuilder();
            int i = 0;
            // Bounds check prevents running off the array on lines without "(".
            while (i < tokens.length && !"(".equals(tokens[i])) {
                name.append(tokens[i]).append(' ');
                i++;
            }
            // Need tokens[i + 1] (start) and tokens[i + 3] (end) to exist.
            if (i + 3 >= tokens.length) {
                System.err.println("Skipping malformed president entry: " + line);
                continue;
            }
            String start = tokens[i + 1];
            String end = tokens[i + 3];
            if (end.equalsIgnoreCase("present")) {
                end = start;
            }
            try {
                list.add(new President(name.toString(), Integer.parseInt(start),
                        Integer.parseInt(end)));
            } catch (NumberFormatException ex) {
                System.err.println("Skipping president entry with non-numeric year: " + line);
            }
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return list;
}

Source File: NameFilter.java From wiseowl with MIT License

5 votes

public NameFilter(TokenStream in,String[] modelNames, NameFinderME[] finders) {
  super(in);
  this.tokenizer = SimpleTokenizer.INSTANCE;
  this.finders = finders;
  this.tokenTypeNames = new String[modelNames.length];
  for (int i=0; i < modelNames.length; i++) {
    tokenTypeNames[i] = NE_PREFIX + modelNames[i];
  }
}

Source File: SlotMatcherTests.java From Mutters with Apache License 2.0

5 votes

/**
 * Matches the utterance {@code "testUtterance"} against an intent holding a
 * single {@code TestSlot} and expects the slot match to carry
 * {@code "Default value"}.
 */
@Test
public void when_default_slot_does_not_match_return_default_value()
{
  // Matcher backed by SimpleTokenizer with one slot model registered.
  OpenNLPTokenizer openNlpTokenizer = new OpenNLPTokenizer(SimpleTokenizer.INSTANCE);
  OpenNLPSlotMatcher slotMatcher = new OpenNLPSlotMatcher(openNlpTokenizer);
  slotMatcher.addSlotModel("testSlot", "models/en-ner-persons.bin");

  // Intent with the single slot under test.
  Intent testIntent = new Intent("testIntent");
  TestSlot testSlot = new TestSlot("testSlot");
  testIntent.addSlot(testSlot);

  Context context = new Context();
  Map<Slot<?>, SlotMatch<?>> matches = slotMatcher.match(context, testIntent, "testUtterance");

  SlotMatch<?> matchForSlot = matches.get(testSlot);
  assertThat(matchForSlot.getValue(), is("Default value"));
}

Source File: NERScorer.java From uncc2014watsonsim with GNU General Public License v2.0

5 votes

/**
 * Splits a passage into sentences and returns the top parse of each one.
 *
 * Each sentence is tokenized with {@link SimpleTokenizer}; any names found
 * in it by the name finder are printed to standard output; the tokens are
 * then joined with single spaces and handed to the parser.
 *
 * Fixes relative to the previous version: the text passed to the parser is
 * now the sentence's tokens (previously the tokenizer object itself was
 * joined), and each sentence is name-scanned once rather than once per
 * sentence in the passage.
 *
 * @param p the passage text
 * @return one parse per detected sentence, in sentence order
 * @throws InvalidFormatException declared by the original signature
 */
public Parse[] parsePassageText(String p) throws InvalidFormatException{
	if (!modelsAreInitialized) {
		init();
	}
	// Per-call tool instances built from the already-loaded models.
	SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel);
	NameFinderME nameFinder = new NameFinderME(this.nerModel);
	Parser parser = ParserFactory.create(
			this.parserModel,
			20, // beam size
			0.95); // advance percentage
	// There are several tokenizers available. SimpleTokenizer works best
	Tokenizer tokenizer = SimpleTokenizer.INSTANCE;

	// Find sentences, tokenize each, parse each, keep the top parse for each.
	String[] sentences = sentenceDetector.sentDetect(p);
	Parse[] results = new Parse[sentences.length];
	for (int i = 0; i < sentences.length; i++) {
		String sentence = sentences[i];
		Span[] tokenSpans = tokenizer.tokenizePos(sentence);
		String[] tokens = Span.spansToStrings(tokenSpans, sentence);

		// Name spans index tokens; map them back to character offsets
		// so the name is printed exactly as it appears in the sentence.
		Span[] names = nameFinder.find(tokens);
		for (int ni = 0; ni < names.length; ni++) {
			int nameStart = tokenSpans[names[ni].getStart()].getStart();
			int nameEnd = tokenSpans[names[ni].getEnd() - 1].getEnd();
			System.out.println(sentence.substring(nameStart, nameEnd));
		}

		// The parser expects whitespace-separated tokens.
		String sent = String.join(" ", tokens);
		System.out.println("Found sentence " + sent);
		Parse[] sentResults = ParserTool.parseLine(sent, parser, 1);
		results[i] = sentResults[0];
	}
	return results;
}

Source File: POSTaggerUnitTest.java From tutorials with MIT License

5 votes

/**
 * Tokenizes a sample sentence, tags it with the POS model from
 * {@code /models/en-pos-maxent.bin}, and checks the produced tags.
 *
 * @throws Exception if the model cannot be read
 */
@Test
public void givenPOSModel_whenPOSTagging_thenPOSAreDetected() throws Exception {

    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

    // try-with-resources: the model stream is closed on every path
    // (the original left it open).
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);
        assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
    }
}

Source File: TokenizerUnitTest.java From tutorials with MIT License

4 votes

/**
 * Checks that {@link SimpleTokenizer} splits a sample sentence into its words
 * and the trailing period.
 *
 * @throws Exception declared by the test signature
 */
@Test
public void givenSimpleTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
    final String sentence = "Baeldung is a Spring Resource.";
    final String[] tokens = SimpleTokenizer.INSTANCE.tokenize(sentence);
    assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}

Java Code Examples for opennlp.tools.tokenize.SimpleTokenizer#INSTANCE