opennlp.tools.tokenize.SimpleTokenizer Java Examples
The following examples show how to use
opennlp.tools.tokenize.SimpleTokenizer.
You can vote up the examples you like or vote down the ones you don't like,
and you can go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: Chapter1.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
private static void nameFinderExample() { try { String[] sentences = { "Tim was a good neighbor. Perhaps not as good a Bob " + "Haywood, but still pretty good. Of course Mr. Adam " + "took the cake!"}; Tokenizer tokenizer = SimpleTokenizer.INSTANCE; TokenNameFinderModel model = new TokenNameFinderModel(new File( "C:\\OpenNLP Models", "en-ner-person.bin")); NameFinderME finder = new NameFinderME(model); for (String sentence : sentences) { // Split the sentence into tokens String[] tokens = tokenizer.tokenize(sentence); // Find the names in the tokens and return Span objects Span[] nameSpans = finder.find(tokens); // Print the names extracted from the tokens using the Span data System.out.println(Arrays.toString( Span.spansToStrings(nameSpans, tokens))); } } catch (IOException ex) { ex.printStackTrace(); } }
Example #2
Source File: POSStructureScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 6 votes |
public static Parse[] parsePassageText(String p) throws InvalidFormatException{ //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel); Parser parser = ParserFactory.create( parserModel, 20, // beam size 0.95); // advance percentage String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); String sent= StringUtils.join(tks," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example #3
Source File: OpenNlpTests.java From uncc2014watsonsim with GNU General Public License v2.0 | 6 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException{ if (!modelsAreInitialized)init(); //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); Parser parser = ParserFactory.create( this.parserModel, 20, // beam size 0.95); // advance percentage //find sentences, tokenize each, parse each, return top parse for each String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); //StringTokenizer st = new StringTokenizer(tks[i]); //There are several tokenizers available. SimpleTokenizer works best String sent= StringUtils.join(tks," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example #4
Source File: DocumentType.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Categorizes the document text with the doccat model and records the best
 * category as the document type — but only when no doc type is present yet
 * and the top outcome score clears the configured threshold.
 */
@Override
public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    DocumentAnnotation documentAnnotation = getDocumentAnnotation(jCas);
    if (!Strings.isNullOrEmpty(documentAnnotation.getDocType())) {
        getMonitor()
            .warn("A DocType annotation already exists. A second annotation will not be added.");
        return;
    }
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize(jCas.getDocumentText());
    double[] outcomes = doccat.categorize(tokens);
    String bestCategory = doccat.getBestCategory(outcomes);
    // Track the highest outcome score so it can be compared to the threshold.
    double best = -Double.MAX_VALUE;
    for (double outcome : outcomes) {
        best = Math.max(best, outcome);
    }
    if (threshold != null && best > threshold) {
        documentAnnotation.setDocType(bestCategory);
    }
}
Example #5
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
/** Verifies that the custom address NER model extracts a street address span. */
@Test
public void testAddressNER() throws Exception {
    URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-address.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel addressModel = new TokenNameFinderModel(modelUrl);
    assertThat(addressModel, is(notNullValue()));

    NameFinderME addressFinder = new NameFinderME(addressModel);
    String[] utteranceTokens = SimpleTokenizer.INSTANCE.tokenize("Send a taxi to 12 Pleasent Street");
    Span[] addressSpans = addressFinder.find(utteranceTokens);
    assertThat(addressSpans.length, is(1));

    String[] addresses = Span.spansToStrings(addressSpans, utteranceTokens);
    assertThat(addresses.length, is(1));
    assertThat(addresses[0], is("12 Pleasent Street"));
}
Example #6
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
/** Verifies that the dates NER model detects the single date mention ("today"). */
@Test
public void testDateNER() throws Exception {
    URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-dates.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel datesModel = new TokenNameFinderModel(modelUrl);
    assertThat(datesModel, is(notNullValue()));

    NameFinderME datesFinder = new NameFinderME(datesModel);
    String[] sentenceTokens = SimpleTokenizer.INSTANCE
        .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
    assertThat(sentenceTokens.length, is(15));

    Span[] dateSpans = datesFinder.find(sentenceTokens);
    assertThat(dateSpans.length, is(1));

    String[] dates = Span.spansToStrings(dateSpans, sentenceTokens);
    assertThat(dates.length, is(1));
    assertThat(dates[0], is("today"));
}
Example #7
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
/** Verifies that the locations NER model detects both place mentions. */
@Test
public void testLocationNER() throws Exception {
    URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-locations.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel locationsModel = new TokenNameFinderModel(modelUrl);
    assertThat(locationsModel, is(notNullValue()));

    NameFinderME locationsFinder = new NameFinderME(locationsModel);
    String[] sentenceTokens = SimpleTokenizer.INSTANCE
        .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
    assertThat(sentenceTokens.length, is(15));

    Span[] locationSpans = locationsFinder.find(sentenceTokens);
    assertThat(locationSpans.length, is(2));

    String[] locations = Span.spansToStrings(locationSpans, sentenceTokens);
    assertThat(locations.length, is(2));
    assertThat(locations[0], is("New York"));
    assertThat(locations[1], is("London"));
}
Example #8
Source File: TestNER.java From Mutters with Apache License 2.0 | 6 votes |
/** Verifies that the persons NER model detects both full person names. */
@Test
public void testPersonNER() throws Exception {
    URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-ner-persons.bin");
    assertThat(modelUrl, is(notNullValue()));

    TokenNameFinderModel personsModel = new TokenNameFinderModel(modelUrl);
    assertThat(personsModel, is(notNullValue()));

    NameFinderME personsFinder = new NameFinderME(personsModel);
    String[] sentenceTokens = SimpleTokenizer.INSTANCE
        .tokenize("Mr. John Smith of New York, married Anne Green of London today.");
    assertThat(sentenceTokens.length, is(15));

    Span[] personSpans = personsFinder.find(sentenceTokens);
    assertThat(personSpans.length, is(2));

    String[] names = Span.spansToStrings(personSpans, sentenceTokens);
    assertThat(names.length, is(2));
    assertThat(names[0], is("John Smith"));
    assertThat(names[1], is("Anne Green"));
}
Example #9
Source File: OpenNlpService.java From elasticsearch-ingest-opennlp with Apache License 2.0 | 6 votes |
/**
 * Runs the NER model registered for {@code field} over {@code content} and
 * returns the tokens together with the detected entity spans.
 *
 * @param content text to analyze
 * @param field   name of the field whose model should be used
 * @throws ElasticsearchException if no model is registered for the field
 */
public ExtractedEntities find(String content, String field) {
    try {
        if (!nameFinderModels.containsKey(field)) {
            // Fixed a mis-encoded character in the message ("fieldĀ" -> "field").
            throw new ElasticsearchException("Could not find field [{}], possible values {}", field,
                    nameFinderModels.keySet());
        }
        TokenNameFinderModel finderModel = nameFinderModels.get(field);
        // NOTE(review): the thread-local is set here but always cleared in the
        // finally block below, so this per-thread caching appears ineffective —
        // confirm the intended lifecycle.
        if (threadLocal.get() == null || !threadLocal.get().equals(finderModel)) {
            threadLocal.set(finderModel);
        }
        String[] tokens = SimpleTokenizer.INSTANCE.tokenize(content);
        Span[] spans = new NameFinderME(finderModel).find(tokens);
        return new ExtractedEntities(tokens, spans);
    } finally {
        threadLocal.remove();
    }
}
Example #10
Source File: OpenNlpTartarus.java From scava with Eclipse Public License 2.0 | 6 votes |
/**
 * Loads the POS tagger, tokenizer, and sentence-detector models and
 * initializes the Tartarus English stemmer. Model-loading failures are
 * logged rather than propagated.
 */
public OpenNlpTartarus() {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("uk.ac.nactem.posstemmer");
    ClassLoader cl = getClass().getClassLoader();
    try {
        posTaggerME = loadPoSME(cl, "models/en-pos-maxent.bin");
        simpleTokenizer = SimpleTokenizer.INSTANCE;
        SentenceModel sentenceModel = loadSentenceModel(cl, "models/en-sent.bin");
        sentenceDetector = new SentenceDetectorME(sentenceModel);
        // Fixed log-message typo: "sucessfully" -> "successfully".
        logger.info("Models have been successfully loaded");
    } catch (IOException e) {
        logger.error("Error while loading the model:", e);
        e.printStackTrace();
    }
    stemmer = new englishStemmer();
}
Example #11
Source File: LemmetizerUnitTest.java From tutorials with MIT License | 6 votes |
/**
 * Tokenizes a sentence, POS-tags it, and lemmatizes it with the dictionary
 * lemmatizer; out-of-dictionary tokens come back as "O".
 * Fix: the model InputStreams were never closed — now try-with-resources.
 */
@Test
public void givenEnglishDictionary_whenLemmatize_thenLemmasAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

    String[] tags;
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        tags = posTagger.tag(tokens);
    }

    try (InputStream dictLemmatizer = getClass().getResourceAsStream("/models/en-lemmatizer.dict")) {
        DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
        String[] lemmas = lemmatizer.lemmatize(tokens, tags);
        assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
    }
}
Example #12
Source File: NamedEntityRecognitionUnitTest.java From tutorials with MIT License | 6 votes |
/**
 * Runs the person NER model over a multi-sentence text, checks the exact
 * spans found, and rebuilds each detected name from its tokens.
 * Fixes: the model InputStream was never closed (try-with-resources), and
 * name assembly now uses StringBuilder instead of repeated String concat.
 */
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize(
        "John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");

    List<Span> spans;
    try (InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin")) {
        TokenNameFinderModel model = new TokenNameFinderModel(inputStreamNameFinder);
        NameFinderME nameFinderME = new NameFinderME(model);
        spans = Arrays.asList(nameFinderME.find(tokens));
    }
    assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");

    // Rebuild each detected name by concatenating its tokens (no separator,
    // matching the original behavior).
    List<String> names = new ArrayList<String>();
    for (Span span : spans) {
        StringBuilder name = new StringBuilder();
        for (int index = span.getStart(); index < span.getEnd(); index++) {
            name.append(tokens[index]);
        }
        names.add(name.toString());
    }
    assertThat(names).contains("John", "Leonard", "Penny");
}
Example #13
Source File: ChunkerUnitTest.java From tutorials with MIT License | 6 votes |
/**
 * Tokenizes and POS-tags a sentence, then chunks it and checks the expected
 * IOB chunk labels.
 * Fix: both model streams (classpath POS model and the FileInputStream for
 * the chunker model) were leaked — now closed via try-with-resources.
 */
@Test
public void givenChunkerModel_whenChunk_thenChunksAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize(
        "He reckons the current account deficit will narrow to only 8 billion.");

    String[] tags;
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        tags = posTagger.tag(tokens);
    }

    try (InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin")) {
        ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
        ChunkerME chunker = new ChunkerME(chunkerModel);
        String[] chunks = chunker.chunk(tokens, tags);
        assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
    }
}
Example #14
Source File: OpenNlpTests.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
/**
 * Tags a sample sentence with the instance's POS tagger and prints each
 * word/tag pair to stderr, followed by a blank line.
 */
public void taggerTest() {
    String[] words = SimpleTokenizer.INSTANCE.tokenize(
        "The quick, red fox jumped over the lazy, brown dogs.");
    String[] result = tagger.tag(words);
    for (int i = 0; i < words.length; i++) {
        System.err.print(words[i] + "/" + result[i] + " ");
    }
    // BUG FIX: the original printed the literal letter "n" — a mangled "\n".
    System.err.println("\n");
}
Example #15
Source File: POSTaggerUnitTest.java From tutorials with MIT License | 5 votes |
/**
 * Tokenizes a sentence and checks the POS tags produced by the maxent model.
 * Fix: the model InputStream was never closed — now try-with-resources.
 */
@Test
public void givenPOSModel_whenPOSTagging_thenPOSAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        String[] tags = posTagger.tag(tokens);
        assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
    }
}
Example #16
Source File: POSStructureScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
private static int[] POSScoreSentece(String sampleQACombined) { // TODO Auto-generated method stub int[] scorerModel = { 0, 0, 0, 0, 0, 0 }; String[] words = SimpleTokenizer.INSTANCE.tokenize(sampleQACombined); String[] result = tagger.tag(words); for (int i = 0; i < result.length; i++) { System.out.println(result[i]); } for (int i=0 ; i < words.length; i++) { if(result[i].equals("CD")){ scorerModel[0]++; }else if(result[i].equals("EX")){ scorerModel[1]++; }else if(result[i].equals("JJ") || result[i].equals("JJR") || result[i].equals("JJS")){ scorerModel[2]++; }else if(result[i].equals("NN") || result[i].equals("NNS") || result[i].equals("NNP") || result[i].equals("NNPS")){ scorerModel[3]++; }else if(result[i].equals("RB") || result[i].equals("RBR") || result[i].equals("RBS")){ scorerModel[4]++; }else if(result[i].equals("VB") || result[i].equals("VBD") || result[i].equals("VBG") || result[i].equals("VBN") || result[i].equals("VBP") || result[i].equals("VBZ")){ scorerModel[5]++; } } return scorerModel; }
Example #17
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/** Demonstrates SimpleTokenizer: prints each token of the shared paragraph. */
private static void usingTheSimpleTokenizerClass() {
    System.out.println("--- SimpleTokenizer");
    SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
    for (String token : simpleTokenizer.tokenize(paragraph)) {
        System.out.println(token);
    }
}
Example #18
Source File: NERScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
/**
 * Tags a sample sentence with the instance's POS tagger and prints each
 * word/tag pair to stderr, followed by a blank line.
 */
public void taggerTest() {
    String[] words = SimpleTokenizer.INSTANCE.tokenize(
        "The quick, red fox jumped over the lazy, brown dogs.");
    String[] result = tagger.tag(words);
    for (int i = 0; i < words.length; i++) {
        System.err.print(words[i] + "/" + result[i] + " ");
    }
    // BUG FIX: the original printed the literal letter "n" — a mangled "\n".
    System.err.println("\n");
}
Example #19
Source File: NERScorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException{ if (!modelsAreInitialized)init(); //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); NameFinderME nameFinder = new NameFinderME(this.nerModel); Parser parser = ParserFactory.create( this.parserModel, 20, // beam size 0.95); // advance percentage //find sentences, tokenize each, parse each, return top parse for each String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ //String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); //StringTokenizer st = new StringTokenizer(tks[i]); //There are several tokenizers available. SimpleTokenizer works best Tokenizer tokenizer = SimpleTokenizer.INSTANCE; for (int si = 0; si < sentences.length; si++) { Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); Span[] names = nameFinder.find(tokens); for (int ni = 0; ni < names.length; ni++) { Span startSpan = tokenSpans[names[ni].getStart()]; int nameStart = startSpan.getStart(); Span endSpan = tokenSpans[names[ni].getEnd() - 1]; int nameEnd = endSpan.getEnd(); String name = sentences[si].substring(nameStart, nameEnd); System.out.println(name); } } String sent= StringUtils.join(tokenizer," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example #20
Source File: SentenceSimilarity.java From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
/** Tokenize a paragraph into sentences, then into words. */ public List<List<String>> tokenizeParagraph(String paragraph) { List<List<String>> results = new ArrayList<>(); // Find sentences, tokenize each, parse each, return top parse for each for (String unsplit_sentence : sentenceDetector.sentDetect(paragraph)) { results.add(Arrays.asList( SimpleTokenizer.INSTANCE.tokenize(unsplit_sentence) )); } return results; }
Example #21
Source File: SlotMatcherTests.java From Mutters with Apache License 2.0 | 5 votes |
/** When the person model matches nothing in the utterance, the slot's default value is returned. */
@Test
public void when_default_slot_does_not_match_return_default_value() {
    OpenNLPSlotMatcher slotMatcher = new OpenNLPSlotMatcher(new OpenNLPTokenizer(SimpleTokenizer.INSTANCE));
    slotMatcher.addSlotModel("testSlot", "models/en-ner-persons.bin");

    Intent testIntent = new Intent("testIntent");
    TestSlot testSlot = new TestSlot("testSlot");
    testIntent.addSlot(testSlot);

    Map<Slot<?>, SlotMatch<?>> matches = slotMatcher.match(new Context(), testIntent, "testUtterance");
    SlotMatch<?> slotMatch = matches.get(testSlot);
    assertThat(slotMatch.getValue(), is("Default value"));
}
Example #22
Source File: NameFilter.java From wiseowl with MIT License | 5 votes |
public NameFilter(TokenStream in,String[] modelNames, NameFinderME[] finders) { super(in); this.tokenizer = SimpleTokenizer.INSTANCE; this.finders = finders; this.tokenTypeNames = new String[modelNames.length]; for (int i=0; i < modelNames.length; i++) { tokenTypeNames[i] = NE_PREFIX + modelNames[i]; } }
Example #23
Source File: Chapter7.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/**
 * Reads the "PresidentList" file and builds President entries.
 * Expected line shape: "&lt;name tokens&gt; ( &lt;start&gt; - &lt;end&gt; )"; an end of
 * "present" is mapped to the start year.
 * Fix: the original threw ArrayIndexOutOfBoundsException on lines missing
 * the "(start - end)" section — such lines are now skipped.
 */
private static List<President> createPresidentList() {
    ArrayList<President> list = new ArrayList<>();
    String line = null;
    try (FileReader reader = new FileReader("PresidentList");
            BufferedReader br = new BufferedReader(reader)) {
        while ((line = br.readLine()) != null) {
            SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
            String[] tokens = simpleTokenizer.tokenize(line);
            String name = "";
            int i = 0;
            // Accumulate name tokens up to the opening parenthesis.
            while (i < tokens.length && !"(".equals(tokens[i])) {
                name += tokens[i] + " ";
                i++;
            }
            // Guard against malformed lines: need "(", start, "-", end.
            if (i + 3 >= tokens.length) {
                continue;
            }
            String start = tokens[i + 1];
            String end = tokens[i + 3];
            if (end.equalsIgnoreCase("present")) {
                end = start;
            }
            list.add(new President(name, Integer.parseInt(start), Integer.parseInt(end)));
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return list;
}
Example #24
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/** Demonstrates the StopWords helper: tokenizes a sample paragraph, removes stopwords, and prints the rest. */
private static void usingStopWordsClassExample() {
    StopWords stopWords = new StopWords("stopwords.txt");
    SimpleTokenizer simpleTokenizer = SimpleTokenizer.INSTANCE;
    paragraph = "A simple approach is to create a class " + "to hold and remove stopwords.";
    String[] tokens = simpleTokenizer.tokenize(paragraph);
    String[] filtered = stopWords.removeStopWords(tokens);
    for (String word : filtered) {
        System.out.println(word);
    }
    stopWords.displayStopWords();
}
Example #25
Source File: OpenNlpService.java From elasticsearch-ingest-opennlp with Apache License 2.0 | 4 votes |
/**
 * Rewrites {@code content} so that every detected entity is annotated inline
 * as {@code [entity text](Type_entity text)}, preserving the original word
 * spacing where token offsets show the words were adjacent.
 *
 * @param content           the original text that was tokenized
 * @param extractedEntities NER results; spans here are TOKEN-index based,
 *                          and all entries are assumed to share one tokenization
 *                          (only element 0's tokens are used) — TODO confirm
 * @return the annotated text, or {@code content} unchanged if no spans exist
 */
static String createAnnotatedText(String content, List<ExtractedEntities> extractedEntities) {
    // these spans contain the real offset of each word in start/end variables!
    // the spans of the method argument contain the offset of each token, as mentioned in tokens!
    Span[] spansWithRealOffsets = SimpleTokenizer.INSTANCE.tokenizePos(content);

    // Collect the token-index spans from every extraction result, then drop
    // overlaps so each token belongs to at most one annotated entity.
    List<Span> spansList = new ArrayList<>();
    extractedEntities.stream()
        .map(ExtractedEntities::getSpans)
        .forEach(s -> spansList.addAll(Arrays.asList(s)));

    Span[] spans = NameFinderME.dropOverlappingSpans(spansList.toArray(new Span[0]));
    String[] tokens = extractedEntities.get(0).getTokens();

    // shortcut if there is no enrichment to be done
    if (spans.length == 0) {
        return content;
    }

    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < tokens.length; i++) {
        // Effectively-final copy of the loop index for use in the lambda below.
        final int idx = i;
        String token = tokens[i];
        // Does an entity span begin at this token?
        final Optional<Span> optionalSpan = Arrays.stream(spans).filter(s -> s.getStart() == idx).findFirst();
        if (optionalSpan.isPresent()) {
            Span span = optionalSpan.get();
            int start = span.getStart();
            int end = span.getEnd();
            String type = span.getType();

            // Copy the entity's tokens and join them with single spaces.
            String[] spanTokens = new String[end - start];
            int spanPosition = 0;
            for (int tokenPosition = start ; tokenPosition < end; tokenPosition++) {
                spanTokens[spanPosition++] = tokens[tokenPosition];
            }
            String entityString = Strings.arrayToDelimitedString(spanTokens, " ");

            // Emit "[entity](Type_entity)".
            builder.append("[");
            builder.append(entityString);
            builder.append("](");
            builder.append(Strings.capitalize(type));
            builder.append("_");
            builder.append(entityString);
            builder.append(")");
            // Skip past the tokens consumed by this entity.
            i = end - 1;
        } else {
            builder.append(token);
        }

        // only append a whitespace, if the offsets actually differ
        if (i < tokens.length - 1) {
            if (spansWithRealOffsets[i].getEnd() != spansWithRealOffsets[i+1].getStart()) {
                builder.append(" ");
            }
        }
    }
    return builder.toString();
}
Example #26
Source File: TokenizerUnitTest.java From tutorials with MIT License | 4 votes |
/** SimpleTokenizer should split words and trailing punctuation into separate tokens. */
@Test
public void givenSimpleTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
    String[] tokens = SimpleTokenizer.INSTANCE.tokenize("Baeldung is a Spring Resource.");
    assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}