Java Examples
The following examples show how to use OpenNLP's SimpleTokenizer (opennlp.tools.tokenize.SimpleTokenizer).
Example #1
Source File: From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
private static void nameFinderExample() { try { String[] sentences = { "Tim was a good neighbor. Perhaps not as good a Bob " + "Haywood, but still pretty good. Of course Mr. Adam " + "took the cake!"}; Tokenizer tokenizer = SimpleTokenizer.INSTANCE; TokenNameFinderModel model = new TokenNameFinderModel(new File( "C:\\OpenNLP Models", "en-ner-person.bin")); NameFinderME finder = new NameFinderME(model); for (String sentence : sentences) { // Split the sentence into tokens String[] tokens = tokenizer.tokenize(sentence); // Find the names in the tokens and return Span objects Span[] nameSpans = finder.find(tokens); // Print the names extracted from the tokens using the Span data System.out.println(Arrays.toString( Span.spansToStrings(nameSpans, tokens))); } } catch (IOException ex) { ex.printStackTrace(); } }
Example #2
Source File: From uncc2014watsonsim with GNU General Public License v2.0 | 6 votes |
public static Parse[] parsePassageText(String p) throws InvalidFormatException{ //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(sentenceModel); Parser parser = ParserFactory.create( parserModel, 20, // beam size 0.95); // advance percentage String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); String sent= StringUtils.join(tks," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example #3
Source File: From uncc2014watsonsim with GNU General Public License v2.0 | 6 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException{ if (!modelsAreInitialized)init(); //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); Parser parser = ParserFactory.create( this.parserModel, 20, // beam size 0.95); // advance percentage //find sentences, tokenize each, parse each, return top parse for each String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); //StringTokenizer st = new StringTokenizer(tks[i]); //There are several tokenizers available. SimpleTokenizer works best String sent= StringUtils.join(tks," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example #4
Source File: From baleen with Apache License 2.0 | 6 votes |
/**
 * Assigns a document type to the document annotation when none is set yet.
 *
 * <p>The document text is tokenized and categorized with {@code doccat}. The
 * best category is stored only when {@code threshold} is non-null and the
 * highest outcome score is strictly greater than it. If a document type is
 * already present, a warning is logged and nothing is changed.
 *
 * @param jCas the CAS holding the document text and annotation
 * @throws AnalysisEngineProcessException declared by the signature
 */
@Override
public void doProcess(JCas jCas) throws AnalysisEngineProcessException {
    DocumentAnnotation docAnnotation = getDocumentAnnotation(jCas);

    if (!Strings.isNullOrEmpty(docAnnotation.getDocType())) {
        getMonitor()
            .warn("A DocType annotation already exists. A second annotation will not be added.");
        return;
    }

    String[] words = SimpleTokenizer.INSTANCE.tokenize(jCas.getDocumentText());
    double[] scores = doccat.categorize(words);
    String bestCategory = doccat.getBestCategory(scores);

    // Highest score among the outcomes; stays at the sentinel for an empty array.
    double highest = -Double.MAX_VALUE;
    for (int k = 0; k < scores.length; k++) {
        if (scores[k] > highest) {
            highest = scores[k];
        }
    }

    if (threshold != null && highest > threshold) {
        docAnnotation.setDocType(bestCategory);
    }
}
Example #5
Source File: From Mutters with Apache License 2.0 | 6 votes |
/**
 * Checks that the model at {@code models/en-ner-address.bin} finds exactly one
 * entity, "12 Pleasent Street", in a short utterance.
 */
@Test
public void testAddressNER() throws Exception {
    ClassLoader loader = Thread.currentThread().getContextClassLoader();
    URL addressModelUrl = loader.getResource("models/en-ner-address.bin");
    assertThat(addressModelUrl, is(notNullValue()));

    TokenNameFinderModel addressModel = new TokenNameFinderModel(addressModelUrl);
    assertThat(addressModel, is(notNullValue()));

    NameFinderME finder = new NameFinderME(addressModel);
    String[] words = SimpleTokenizer.INSTANCE.tokenize("Send a taxi to 12 Pleasent Street");

    Span[] found = finder.find(words);
    assertThat(found.length, is(1));

    String[] addresses = Span.spansToStrings(found, words);
    assertThat(addresses.length, is(1));
    assertThat(addresses[0], is("12 Pleasent Street"));
}
Example #6
Source File: From Mutters with Apache License 2.0 | 6 votes |
/**
 * Checks that the model at {@code models/en-ner-dates.bin} finds exactly one
 * entity, "today", in the 15-token sample sentence.
 */
@Test
public void testDateNER() throws Exception {
    ClassLoader loader = Thread.currentThread().getContextClassLoader();
    URL dateModelUrl = loader.getResource("models/en-ner-dates.bin");
    assertThat(dateModelUrl, is(notNullValue()));

    TokenNameFinderModel dateModel = new TokenNameFinderModel(dateModelUrl);
    assertThat(dateModel, is(notNullValue()));

    NameFinderME finder = new NameFinderME(dateModel);
    String sentence = "Mr. John Smith of New York, married Anne Green of London today.";
    String[] words = SimpleTokenizer.INSTANCE.tokenize(sentence);
    assertThat(words.length, is(15));

    Span[] found = finder.find(words);
    assertThat(found.length, is(1));

    String[] dates = Span.spansToStrings(found, words);
    assertThat(dates.length, is(1));
    assertThat(dates[0], is("today"));
}
Example #7
Source File: From Mutters with Apache License 2.0 | 6 votes |
/**
 * Checks that the model at {@code models/en-ner-locations.bin} finds "New York"
 * and "London", in that order, in the 15-token sample sentence.
 */
@Test
public void testLocationNER() throws Exception {
    ClassLoader loader = Thread.currentThread().getContextClassLoader();
    URL locationModelUrl = loader.getResource("models/en-ner-locations.bin");
    assertThat(locationModelUrl, is(notNullValue()));

    TokenNameFinderModel locationModel = new TokenNameFinderModel(locationModelUrl);
    assertThat(locationModel, is(notNullValue()));

    NameFinderME finder = new NameFinderME(locationModel);
    String sentence = "Mr. John Smith of New York, married Anne Green of London today.";
    String[] words = SimpleTokenizer.INSTANCE.tokenize(sentence);
    assertThat(words.length, is(15));

    Span[] found = finder.find(words);
    assertThat(found.length, is(2));

    String[] places = Span.spansToStrings(found, words);
    assertThat(places.length, is(2));
    assertThat(places[0], is("New York"));
    assertThat(places[1], is("London"));
}
Example #8
Source File: From Mutters with Apache License 2.0 | 6 votes |
/**
 * Checks that the model at {@code models/en-ner-persons.bin} finds "John Smith"
 * and "Anne Green", in that order, in the 15-token sample sentence.
 */
@Test
public void testPersonNER() throws Exception {
    ClassLoader loader = Thread.currentThread().getContextClassLoader();
    URL personModelUrl = loader.getResource("models/en-ner-persons.bin");
    assertThat(personModelUrl, is(notNullValue()));

    TokenNameFinderModel personModel = new TokenNameFinderModel(personModelUrl);
    assertThat(personModel, is(notNullValue()));

    NameFinderME finder = new NameFinderME(personModel);
    String sentence = "Mr. John Smith of New York, married Anne Green of London today.";
    String[] words = SimpleTokenizer.INSTANCE.tokenize(sentence);
    assertThat(words.length, is(15));

    Span[] found = finder.find(words);
    assertThat(found.length, is(2));

    String[] people = Span.spansToStrings(found, words);
    assertThat(people.length, is(2));
    assertThat(people[0], is("John Smith"));
    assertThat(people[1], is("Anne Green"));
}
Example #9
Source File: From elasticsearch-ingest-opennlp with Apache License 2.0 | 6 votes |
/**
 * Tokenizes {@code content} and runs the name finder model registered under
 * {@code field} over the tokens.
 *
 * <p>The selected model is stored in {@code threadLocal} when it differs from
 * the value currently held there; {@code threadLocal} is always cleared again
 * before the method returns or throws.
 *
 * @param content the text to analyse
 * @param field   key of the model in {@code nameFinderModels}
 * @return the tokens together with the spans found over them
 * @throws ElasticsearchException if no model is registered for {@code field}
 */
public ExtractedEntities find(String content, String field) {
    try {
        boolean knownField = nameFinderModels.containsKey(field);
        if (!knownField) {
            throw new ElasticsearchException("Could not find field [{}], possible values {}",
                    field, nameFinderModels.keySet());
        }

        TokenNameFinderModel selectedModel = nameFinderModels.get(field);
        boolean needsUpdate = threadLocal.get() == null || !threadLocal.get().equals(selectedModel);
        if (needsUpdate) {
            threadLocal.set(selectedModel);
        }

        String[] words = SimpleTokenizer.INSTANCE.tokenize(content);
        NameFinderME finder = new NameFinderME(selectedModel);
        Span[] entitySpans = finder.find(words);
        return new ExtractedEntities(words, entitySpans);
    } finally {
        threadLocal.remove();
    }
}
Example #10
Source File: From scava with Eclipse Public License 2.0 | 6 votes |
/**
 * Creates the tool set: loads the POS tagger and the sentence detector from
 * the class loader's {@code models/} resources, and sets up the simple
 * tokenizer and the English stemmer.
 *
 * <p>An {@link IOException} raised while loading a model is logged and its
 * stack trace printed; construction then continues, so the components assigned
 * after the failure point inside the {@code try} block are left unset.
 */
public OpenNlpTartarus() {
    logger = (OssmeterLogger) OssmeterLogger.getLogger("");
    ClassLoader cl = getClass().getClassLoader();
    try {
        posTaggerME = loadPoSME(cl, "models/en-pos-maxent.bin");
        simpleTokenizer = SimpleTokenizer.INSTANCE;
        SentenceModel sentenceModel = loadSentenceModel(cl, "models/en-sent.bin");
        sentenceDetector = new SentenceDetectorME(sentenceModel);
        // The snippet had a dangling string literal here (the call prefix was lost),
        // which does not compile; restored as an info-level log statement.
        logger.info("Models have been sucessfully loaded");
    } catch (IOException e) {
        logger.error("Error while loading the model:", e);
        e.printStackTrace();
    }
    // InputStream tokenizerModelInput = loadModelInput("models/en-token.bin");
    // TokenizerModel tokenizerModel = loadTokenizerModel(tokenizerModelInput);
    // tokenizerME = new TokenizerME(tokenizerModel);
    stemmer = new englishStemmer();
}
Example #11
Source File: From tutorials with MIT License | 6 votes |
/**
 * Tokenizes and POS-tags a sentence, lemmatizes it with the dictionary at
 * {@code /models/en-lemmatizer.dict}, and checks the produced lemmas.
 *
 * <p>Both resource streams are opened in try-with-resources blocks so they are
 * closed even when model loading or an assertion fails.
 */
@Test
public void givenEnglishDictionary_whenLemmatize_thenLemmasAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

    String[] tags;
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        tags = posTagger.tag(tokens);
    }

    String[] lemmas;
    try (InputStream dictLemmatizer = getClass().getResourceAsStream("/models/en-lemmatizer.dict")) {
        DictionaryLemmatizer lemmatizer = new DictionaryLemmatizer(dictLemmatizer);
        lemmas = lemmatizer.lemmatize(tokens, tags);
    }

    assertThat(lemmas).contains("O", "have", "a", "sister", "name", "O", "O");
}
Example #12
Source File: From tutorials with MIT License | 6 votes |
/**
 * Runs the person name finder over a three-sentence text and checks both the
 * raw spans and the names rebuilt from them.
 *
 * <p>The model stream is opened in a try-with-resources block so it is closed
 * once the model has been constructed. Each name is built by concatenating the
 * tokens covered by its span, without a separator.
 */
@Test
public void givenEnglishPersonModel_whenNER_thenPersonsAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John is 26 years old. His best friend's name is Leonard. He has a sister named Penny.");

    TokenNameFinderModel model;
    try (InputStream inputStreamNameFinder = getClass().getResourceAsStream("/models/en-ner-person.bin")) {
        model = new TokenNameFinderModel(inputStreamNameFinder);
    }
    NameFinderME nameFinderME = new NameFinderME(model);

    List<Span> spans = Arrays.asList(nameFinderME.find(tokens));
    assertThat(spans.toString()).isEqualTo("[[0..1) person, [13..14) person, [20..21) person]");

    List<String> names = new ArrayList<String>();
    for (Span s : spans) {
        // Span bounds are token indices: start inclusive, end exclusive.
        StringBuilder name = new StringBuilder();
        for (int index = s.getStart(); index < s.getEnd(); index++) {
            name.append(tokens[index]);
        }
        names.add(name.toString());
    }
    assertThat(names).contains("John","Leonard","Penny");
}
Example #13
Source File: From tutorials with MIT License | 6 votes |
/**
 * Tokenizes and POS-tags a sentence, runs the chunker over it, and checks the
 * chunk tags it produces.
 *
 * <p>The POS model resource stream and the chunker model file stream are both
 * opened in try-with-resources blocks so that the file handle is released even
 * when model loading fails.
 */
@Test
public void givenChunkerModel_whenChunk_thenChunksAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("He reckons the current account deficit will narrow to only 8 billion.");

    String[] tags;
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        tags = posTagger.tag(tokens);
    }

    String[] chunks;
    try (InputStream inputStreamChunker = new FileInputStream("src/main/resources/models/en-chunker.bin")) {
        ChunkerModel chunkerModel = new ChunkerModel(inputStreamChunker);
        ChunkerME chunker = new ChunkerME(chunkerModel);
        chunks = chunker.chunk(tokens, tags);
    }

    assertThat(chunks).contains("B-NP", "B-VP", "B-NP", "I-NP", "I-NP", "I-NP", "B-VP", "I-VP", "B-PP", "B-NP", "I-NP", "I-NP", "O");
}
Example #14
Source File: From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
/**
 * Tags a fixed sample sentence and writes each token to standard error as
 * {@code word/tag}, separated by spaces and followed by a line break.
 */
public void taggerTest(){
    String[] words = SimpleTokenizer.INSTANCE.tokenize(
            "The quick, red fox jumped over the lazy, brown dogs.");
    String[] result = tagger.tag(words);
    for (int i = 0; i < words.length; i++) {
        System.err.print(words[i] + "/" + result[i] + " ");
    }
    // The snippet printed the literal text "n" here (most likely a newline escape
    // that lost its backslash); terminate the output line instead.
    System.err.println();
}
Example #15
Source File: From tutorials with MIT License | 5 votes |
/**
 * Tokenizes a sentence, tags it with the POS model at
 * {@code /models/en-pos-maxent.bin}, and checks the produced tags.
 *
 * <p>The model stream is opened in a try-with-resources block so it is closed
 * even when model loading fails.
 */
@Test
public void givenPOSModel_whenPOSTagging_thenPOSAreDetected() throws Exception {
    SimpleTokenizer tokenizer = SimpleTokenizer.INSTANCE;
    String[] tokens = tokenizer.tokenize("John has a sister named Penny.");

    String[] tags;
    try (InputStream inputStreamPOSTagger = getClass().getResourceAsStream("/models/en-pos-maxent.bin")) {
        POSModel posModel = new POSModel(inputStreamPOSTagger);
        POSTaggerME posTagger = new POSTaggerME(posModel);
        tags = posTagger.tag(tokens);
    }

    assertThat(tags).contains("NNP", "VBZ", "DT", "NN", "VBN", "NNP", ".");
}
Example #16
Source File: From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
private static int[] POSScoreSentece(String sampleQACombined) { // TODO Auto-generated method stub int[] scorerModel = { 0, 0, 0, 0, 0, 0 }; String[] words = SimpleTokenizer.INSTANCE.tokenize(sampleQACombined); String[] result = tagger.tag(words); for (int i = 0; i < result.length; i++) { System.out.println(result[i]); } for (int i=0 ; i < words.length; i++) { if(result[i].equals("CD")){ scorerModel[0]++; }else if(result[i].equals("EX")){ scorerModel[1]++; }else if(result[i].equals("JJ") || result[i].equals("JJR") || result[i].equals("JJS")){ scorerModel[2]++; }else if(result[i].equals("NN") || result[i].equals("NNS") || result[i].equals("NNP") || result[i].equals("NNPS")){ scorerModel[3]++; }else if(result[i].equals("RB") || result[i].equals("RBR") || result[i].equals("RBS")){ scorerModel[4]++; }else if(result[i].equals("VB") || result[i].equals("VBD") || result[i].equals("VBG") || result[i].equals("VBN") || result[i].equals("VBP") || result[i].equals("VBZ")){ scorerModel[5]++; } } return scorerModel; }
Example #17
Source File: From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/**
 * Prints a banner line, then tokenizes {@code paragraph} with
 * {@link SimpleTokenizer} and prints every token on its own line.
 */
private static void usingTheSimpleTokenizerClass() {
    System.out.println("--- SimpleTokenizer");
    String[] pieces = SimpleTokenizer.INSTANCE.tokenize(paragraph);
    for (int idx = 0; idx < pieces.length; idx++) {
        System.out.println(pieces[idx]);
    }
}
Example #18
Source File: From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
/**
 * Tags a fixed sample sentence and writes each token to standard error as
 * {@code word/tag}, separated by spaces and followed by a line break.
 */
public void taggerTest(){
    String[] words = SimpleTokenizer.INSTANCE.tokenize(
            "The quick, red fox jumped over the lazy, brown dogs.");
    String[] result = tagger.tag(words);
    for (int i = 0; i < words.length; i++) {
        System.err.print(words[i] + "/" + result[i] + " ");
    }
    // The snippet printed the literal text "n" here (most likely a newline escape
    // that lost its backslash); terminate the output line instead.
    System.err.println();
}
Example #19
Source File: From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
public Parse[] parsePassageText(String p) throws InvalidFormatException{ if (!modelsAreInitialized)init(); //initialize SentenceDetectorME sentenceDetector = new SentenceDetectorME(this.sentenceModel); NameFinderME nameFinder = new NameFinderME(this.nerModel); Parser parser = ParserFactory.create( this.parserModel, 20, // beam size 0.95); // advance percentage //find sentences, tokenize each, parse each, return top parse for each String[] sentences = sentenceDetector.sentDetect(p); Parse[] results = new Parse[sentences.length]; for (int i=0;i<sentences.length;i++){ //String[] tks = SimpleTokenizer.INSTANCE.tokenize(sentences[i]); //StringTokenizer st = new StringTokenizer(tks[i]); //There are several tokenizers available. SimpleTokenizer works best Tokenizer tokenizer = SimpleTokenizer.INSTANCE; for (int si = 0; si < sentences.length; si++) { Span[] tokenSpans = tokenizer.tokenizePos(sentences[si]); String[] tokens = Span.spansToStrings(tokenSpans, sentences[si]); Span[] names = nameFinder.find(tokens); for (int ni = 0; ni < names.length; ni++) { Span startSpan = tokenSpans[names[ni].getStart()]; int nameStart = startSpan.getStart(); Span endSpan = tokenSpans[names[ni].getEnd() - 1]; int nameEnd = endSpan.getEnd(); String name = sentences[si].substring(nameStart, nameEnd); System.out.println(name); } } String sent= StringUtils.join(tokenizer," "); System.out.println("Found sentence " + sent); Parse[] sentResults = ParserTool.parseLine(sent,parser, 1); results[i]=sentResults[0]; } return results; }
Example #20
Source File: From uncc2014watsonsim with GNU General Public License v2.0 | 5 votes |
/** Tokenize a paragraph into sentences, then into words. */ public List<List<String>> tokenizeParagraph(String paragraph) { List<List<String>> results = new ArrayList<>(); // Find sentences, tokenize each, parse each, return top parse for each for (String unsplit_sentence : sentenceDetector.sentDetect(paragraph)) { results.add(Arrays.asList( SimpleTokenizer.INSTANCE.tokenize(unsplit_sentence) )); } return results; }
Example #21
Source File: From Mutters with Apache License 2.0 | 5 votes |
/**
 * Matches the utterance "testUtterance" against an intent holding a single
 * {@code TestSlot} and checks that the slot's match value is "Default value".
 */
@Test
public void when_default_slot_does_not_match_return_default_value() {
    OpenNLPTokenizer tokenizer = new OpenNLPTokenizer(SimpleTokenizer.INSTANCE);
    OpenNLPSlotMatcher matcher = new OpenNLPSlotMatcher(tokenizer);
    matcher.addSlotModel("testSlot", "models/en-ner-persons.bin");

    Intent intent = new Intent("testIntent");
    TestSlot slot = new TestSlot("testSlot");
    intent.addSlot(slot);

    Map<Slot<?>, SlotMatch<?>> matches = matcher.match(new Context(), intent, "testUtterance");

    SlotMatch<?> matchForSlot = matches.get(slot);
    assertThat(matchForSlot.getValue(), is("Default value"));
}
Example #22
Source File: From wiseowl with MIT License | 5 votes |
public NameFilter(TokenStream in,String[] modelNames, NameFinderME[] finders) { super(in); this.tokenizer = SimpleTokenizer.INSTANCE; this.finders = finders; this.tokenTypeNames = new String[modelNames.length]; for (int i=0; i < modelNames.length; i++) { tokenTypeNames[i] = NE_PREFIX + modelNames[i]; } }
Example #23
Source File: From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/**
 * Reads the file {@code PresidentList} and builds one {@code President} per line.
 *
 * <p>Each line is tokenized; the tokens before the first {@code "("} form the
 * name (each followed by a space), the token right after {@code "("} is the
 * start year and the token two positions further is the end year. An end value
 * of "present" (any case) is replaced by the start year. An {@link IOException}
 * is reported via its stack trace, and the entries read so far are returned.
 *
 * @return the presidents parsed from the file, in file order
 */
private static List<President> createPresidentList() {
    ArrayList<President> presidents = new ArrayList<>();
    try (FileReader source = new FileReader("PresidentList");
            BufferedReader lines = new BufferedReader(source)) {
        for (String row = lines.readLine(); row != null; row = lines.readLine()) {
            String[] parts = SimpleTokenizer.INSTANCE.tokenize(row);

            // Everything up to the opening parenthesis belongs to the name.
            StringBuilder fullName = new StringBuilder();
            int pos = 0;
            for (; !"(".equals(parts[pos]); pos++) {
                fullName.append(parts[pos]).append(" ");
            }

            String firstYear = parts[pos + 1];
            String lastYear = parts[pos + 3];
            if (lastYear.equalsIgnoreCase("present")) {
                lastYear = firstYear;
            }

            presidents.add(new President(fullName.toString(),
                    Integer.parseInt(firstYear), Integer.parseInt(lastYear)));
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
    return presidents;
}
Example #24
Source File: From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/**
 * Demonstrates the {@code StopWords} helper: overwrites {@code paragraph} with
 * a sample sentence, tokenizes it, removes the stop words, prints the
 * remaining words one per line, and finally displays the stop-word list.
 */
private static void usingStopWordsClassExample() {
    StopWords stopWords = new StopWords("stopwords.txt");

    paragraph = "A simple approach is to create a class "
            + "to hold and remove stopwords.";
    String[] words = SimpleTokenizer.INSTANCE.tokenize(paragraph);

    String[] kept = stopWords.removeStopWords(words);
    for (int idx = 0; idx < kept.length; idx++) {
        System.out.println(kept[idx]);
    }

    stopWords.displayStopWords();
}
Example #25
Source File: From elasticsearch-ingest-opennlp with Apache License 2.0 | 4 votes |
static String createAnnotatedText(String content, List<ExtractedEntities> extractedEntities) { // these spans contain the real offset of each word in start/end variables! // the spans of the method argument contain the offset of each token, as mentioned in tokens! Span[] spansWithRealOffsets = SimpleTokenizer.INSTANCE.tokenizePos(content); List<Span> spansList = new ArrayList<>(); .map(ExtractedEntities::getSpans) .forEach(s -> spansList.addAll(Arrays.asList(s))); Span[] spans = NameFinderME.dropOverlappingSpans(spansList.toArray(new Span[0])); String[] tokens = extractedEntities.get(0).getTokens(); // shortcut if there is no enrichment to be done if (spans.length == 0) { return content; } StringBuilder builder = new StringBuilder(); for (int i = 0; i < tokens.length; i++) { final int idx = i; String token = tokens[i]; final Optional<Span> optionalSpan = -> s.getStart() == idx).findFirst(); if (optionalSpan.isPresent()) { Span span = optionalSpan.get(); int start = span.getStart(); int end = span.getEnd(); String type = span.getType(); String[] spanTokens = new String[end - start]; int spanPosition = 0; for (int tokenPosition = start ; tokenPosition < end; tokenPosition++) { spanTokens[spanPosition++] = tokens[tokenPosition]; } String entityString = Strings.arrayToDelimitedString(spanTokens, " "); builder.append("["); builder.append(entityString); builder.append("]("); builder.append(Strings.capitalize(type)); builder.append("_"); builder.append(entityString); builder.append(")"); i = end - 1; } else { builder.append(token); } // only append a whitespace, if the offsets actually differ if (i < tokens.length - 1) { if (spansWithRealOffsets[i].getEnd() != spansWithRealOffsets[i+1].getStart()) { builder.append(" "); } } } return builder.toString(); }
Example #26
Source File: From tutorials with MIT License | 4 votes |
/**
 * Checks that {@link SimpleTokenizer} yields the expected word and punctuation
 * tokens for a short sentence.
 */
@Test
public void givenSimpleTokenizer_whenTokenize_thenTokensAreDetected() throws Exception {
    String sentence = "Baeldung is a Spring Resource.";
    String[] words = SimpleTokenizer.INSTANCE.tokenize(sentence);

    assertThat(words).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}