opennlp.tools.tokenize.WhitespaceTokenizer Java Exaples

Source File: Chapter1.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

6 votes

private static void detectingPartsOfSpeechExample() {
    String sentence = "POS processing is useful for enhancing the "
            + "quality of data sent to other elements of a pipeline.";

    POSModel model = new POSModelLoader()
            .load(new File("C:/Current Books/NLP and Java/Models/", "en-pos-maxent.bin"));
    POSTaggerME tagger = new POSTaggerME(model);

    String tokens[] = WhitespaceTokenizer.INSTANCE
            .tokenize(sentence);
    String[] tags = tagger.tag(tokens);

    POSSample sample = new POSSample(tokens, tags);
    String posTokens[] = sample.getSentence();
    String posTags[] = sample.getTags();
    for (int i = 0; i < posTokens.length; i++) {
        System.out.print(posTokens[i] + " - " + posTags[i]);
    }
    System.out.println();

    for (int i = 0; i < tokens.length; i++) {
        System.out.print(tokens[i] + "[" + tags[i] + "] ");
    }
}

Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

6 votes

private static void usingTheOpenNLPLemmatizer() {
    // dictionary files downloaded from: https://code.google.com/p/xssm/downloads/detail?name=SimilarityUtils.zip&can=2&q=

    System.out.println("Starting the OpenNLP Lemmatizer");
    try {
        JWNLDictionary dictionary = new JWNLDictionary(
                "C:\\Downloads\\xssm\\SimilarityUtils\\WordNet-2.0\\dict\\");
        paragraph = "Eat, drink, and be merry, for life is but a dream";
        String tokens[] = WhitespaceTokenizer.INSTANCE.tokenize(paragraph);
        for (String token : tokens) {
            String[] lemmas = dictionary.getLemmas(token, "");
            for (String lemma : lemmas) {
                System.out.println("Token: " + token + "  Lemma: " + lemma);
            }
        }
    } catch (IOException | JWNLException ex) {
        Logger.getLogger(Chapter2.class.getName()).log(Level.SEVERE, null, ex);
    }
}

Source File: NGramTest.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

public static void main(String args[]){
    String sampletext = "This is n-gram model";
    System.out.println(sampletext);
    
    StringList tokens = new StringList(WhitespaceTokenizer.INSTANCE.tokenize(sampletext));
    System.out.println("Tokens " + tokens);
    
    NGramModel nGramModel = new NGramModel();
    nGramModel.add(tokens,2,4); // minlength and maxlength
    
    System.out.println("Total ngrams: " + nGramModel.numberOfGrams());
    for (StringList ngram : nGramModel) {
        System.out.println(nGramModel.getCount(ngram) + " - " + ngram);
    }
}

Source File: StopWords.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

public String removeStopWords(String words) {
        String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words);
        StringBuilder sb = new StringBuilder();
//        ArrayList<String> tokens = new ArrayList<String>(Arrays.asList(arr));
        for (int i = 0; i < arr.length; i++) {
//            System.out.println(tokens.get(i) + "-");
            if (stopWords.contains(arr[i])) {
//                tokens.remove(i);
//                System.out.println("Removing: [" + arr[i] + "]");
            } else {
                sb.append(arr[i]+" ");
            }
        }
        return sb.toString();
    }

Source File: Chapter5.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

private static String[] tokenizeSentence(String sentence) {
    //First techniquue
    String words[] = sentence.split("S+");

    // Second technique
    words = WhitespaceTokenizer.INSTANCE.tokenize(sentence);

    return words;
}

Source File: SearchText.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

5 votes

public String removeStopWords(String words) {
    String arr[] = WhitespaceTokenizer.INSTANCE.tokenize(words);
    StringBuilder sb = new StringBuilder();
    for (int i = 0; i < arr.length; i++) {
        if (words.contains(arr[i])) {
            // Do nothing
        } else {
            sb.append(arr[i]+" ");
        }
    }
    return sb.toString();
}

Source File: TaxiInkBotConfiguration.java From Mutters with Apache License 2.0

5 votes

@Override
public IntentMatcher getIntentMatcher()
{
  // model was built with OpenNLP whitespace tokenizer
  OpenNLPTokenizer tokenizer = new OpenNLPTokenizer(WhitespaceTokenizer.INSTANCE);

  // use Open NLP NER for slot matching
  OpenNLPSlotMatcher slotMatcher = new OpenNLPSlotMatcher(tokenizer);
  slotMatcher.addSlotModel("Address", "models/en-ner-address.bin");

  // create intent matcher
  OpenNLPIntentMatcher matcher = new OpenNLPIntentMatcher("models/en-cat-taxi-intents.bin", tokenizer, slotMatcher);

  Intent intent = new Intent("OrderTaxi");
  intent.addSlot(new LiteralSlot("Address"));
  matcher.addIntent(intent);

  intent = new Intent("CancelTaxi");
  matcher.addIntent(intent);

  intent = new Intent("WhereTaxi");
  matcher.addIntent(intent);

  intent = new Intent("GaveAddress");
  intent.addSlot(new LiteralSlot("Address"));
  matcher.addIntent(intent);

  intent = new Intent("Stop");
  matcher.addIntent(intent);

  intent = new Intent("Help");
  matcher.addIntent(intent);

  intent = new Intent("FavColor");
  matcher.addIntent(intent);

  return matcher;
}

Source File: TaxiStateMachineBotConfiguration.java From Mutters with Apache License 2.0

5 votes

@Override
public IntentMatcher getIntentMatcher()
{
  // model was built with OpenNLP whitespace tokenizer
  OpenNLPTokenizer tokenizer = new OpenNLPTokenizer(WhitespaceTokenizer.INSTANCE);

  // use Open NLP NER for slot matching
  OpenNLPSlotMatcher slotMatcher = new OpenNLPSlotMatcher(tokenizer);
  slotMatcher.addSlotModel("Address", "models/en-ner-address.bin");

  // create intent matcher
  OpenNLPIntentMatcher matcher = new OpenNLPIntentMatcher("models/en-cat-taxi-intents.bin", tokenizer, slotMatcher);

  Intent intent = new Intent("OrderTaxi");
  intent.addSlot(new LiteralSlot("Address"));
  matcher.addIntent(intent);

  intent = new Intent("CancelTaxi");
  matcher.addIntent(intent);

  intent = new Intent("WhereTaxi");
  matcher.addIntent(intent);

  intent = new Intent("GaveAddress");
  intent.addSlot(new LiteralSlot("Address"));
  matcher.addIntent(intent);

  return matcher;
}

Source File: TestCategorization.java From Mutters with Apache License 2.0

5 votes

@Test
public void testCategorization() throws Exception
{
  URL modelUrl = Thread.currentThread().getContextClassLoader().getResource("models/en-cat-taxi-intents.bin");
  assertThat(modelUrl, is(notNullValue()));

  DoccatModel model = new DoccatModel(modelUrl);
  assertThat(model, is(notNullValue()));

  DocumentCategorizerME myCategorizer = new DocumentCategorizerME(model);
  // model was built with OpenNLP whitespace tokenizer
  OpenNLPTokenizer tokenizer = new OpenNLPTokenizer(WhitespaceTokenizer.INSTANCE);

  String category = myCategorizer.getBestCategory(myCategorizer.categorize(tokenizer.tokenize("Order me a taxi")));
  assertThat(category, is(notNullValue()));
  assertThat(category, is("OrderTaxi"));

  category = myCategorizer.getBestCategory(myCategorizer.categorize(tokenizer.tokenize("Send me a taxi")));
  assertThat(category, is(notNullValue()));
  assertThat(category, is("OrderTaxi"));

  category = myCategorizer
      .getBestCategory(myCategorizer.categorize(tokenizer.tokenize("Send a taxi to 12 Pleasent Street")));
  assertThat(category, is(notNullValue()));
  assertThat(category, is("OrderTaxi"));

  category = myCategorizer.getBestCategory(myCategorizer.categorize(tokenizer.tokenize("Cancel my cab")));
  assertThat(category, is(notNullValue()));
  assertThat(category, is("CancelTaxi"));

  category = myCategorizer.getBestCategory(myCategorizer.categorize(tokenizer.tokenize("Where is my taxi ?")));
  assertThat(category, is(notNullValue()));
  assertThat(category, is("WhereTaxi"));

  category = myCategorizer
      .getBestCategory(myCategorizer.categorize(tokenizer.tokenize("The address is 136 River Road")));
  assertThat(category, is(notNullValue()));
  assertThat(category, is("GaveAddress"));
}

Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License

4 votes

private static void usingTheWhitespaceTokenizer() {
    String tokens[] = WhitespaceTokenizer.INSTANCE.tokenize(paragraph);
    for (String token : tokens) {
        System.out.println(token);
    }
}

Source File: TokenizerUnitTest.java From tutorials with MIT License

4 votes

@Test
public void givenWhitespaceTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
    WhitespaceTokenizer tokenizer = WhitespaceTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
    assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource.");
}

opennlp.tools.tokenize.WhitespaceTokenizer Java Examples