opennlp.tools.tokenize.TokenizerME Java Examples
The following examples show how to use opennlp.tools.tokenize.TokenizerME.
Each example is drawn from an open-source project; the source file, project, and license are noted above the code.
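Nearly all of the examples below follow the same basic pattern: load a pre-trained TokenizerModel (typically en-token.bin), wrap it in a TokenizerME, and call tokenize() on the input text. As a quick orientation, here is a minimal self-contained sketch of that pattern; the file path, class name, and sample sentence are placeholders rather than code taken from any of the projects listed below.

import java.io.FileInputStream;
import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;

public class TokenizerMEQuickStart {
    public static void main(String[] args) throws IOException {
        // Path to a pre-trained OpenNLP tokenizer model (placeholder; adjust to your setup).
        try (InputStream modelIn = new FileInputStream("en-token.bin")) {
            TokenizerModel model = new TokenizerModel(modelIn);
            TokenizerME tokenizer = new TokenizerME(model);
            String[] tokens = tokenizer.tokenize("OpenNLP splits this sentence into tokens.");
            for (String token : tokens) {
                System.out.println(token);
            }
        }
    }
}

Note that a TokenizerME instance is not thread-safe, which is why the UIMA-based examples below construct a separate tokenizer per annotator instance; the underlying TokenizerModel, by contrast, can be shared.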
Example #1
Source File: TokenSegmenter.java From dexter with Apache License 2.0
public TokenSegmenter() {
    InputStream modelIn = null;
    try {
        // Loading tokenizer model
        modelIn = getClass().getResourceAsStream("/nlp/en-token.bin");
        final TokenizerModel tokenModel = new TokenizerModel(modelIn);
        modelIn.close();
        tokenizer = new TokenizerME(tokenModel);
    } catch (final IOException ioe) {
        ioe.printStackTrace();
    } finally {
        if (modelIn != null) {
            try {
                modelIn.close();
            } catch (final IOException e) {
                // oh well!
            }
        }
    }
}
Example #2
Source File: ConcurrentTokenizer.java From deeplearning4j with Apache License 2.0
/**
 * Initializes the current instance with the given context.
 *
 * Note: Do all initialization in this method, do not use the constructor.
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    TokenizerModel model;
    try {
        TokenizerModelResource modelResource =
                (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        model = modelResource.getModel();
    } catch (ResourceAccessException e) {
        throw new ResourceInitializationException(e);
    }

    tokenizer = new TokenizerME(model);
}
Example #3
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void trainingATokenizer() {
    createOpenNLPModel();
    try {
        paragraph = "A demonstration of how to train a tokenizer.";
        InputStream modelInputStream = new FileInputStream(new File(".", "mymodel.bin"));
        TokenizerModel model = new TokenizerModel(modelInputStream);
        Tokenizer tokenizer = new TokenizerME(model);
        String tokens[] = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
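The createOpenNLPModel() method called above, which produces mymodel.bin, is not shown. For context, training a custom tokenizer model with OpenNLP generally follows the pattern sketched below. This is an illustrative sketch only, not the book's implementation: the class and file names are placeholders, and the TokenizerME.train() signature has varied across OpenNLP releases (the form used here matches the 1.8+/2.x API).

import java.io.BufferedOutputStream;
import java.io.File;
import java.io.FileOutputStream;
import java.io.IOException;
import java.io.OutputStream;
import java.nio.charset.StandardCharsets;

import opennlp.tools.tokenize.TokenSample;
import opennlp.tools.tokenize.TokenSampleStream;
import opennlp.tools.tokenize.TokenizerFactory;
import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.MarkableFileInputStreamFactory;
import opennlp.tools.util.ObjectStream;
import opennlp.tools.util.PlainTextByLineStream;
import opennlp.tools.util.TrainingParameters;

public class TrainTokenizerSketch {
    public static void main(String[] args) throws IOException {
        // training.txt (placeholder) holds one sentence per line, with <SPLIT> marking token
        // boundaries that are not already indicated by whitespace.
        ObjectStream<String> lines = new PlainTextByLineStream(
                new MarkableFileInputStreamFactory(new File("training.txt")), StandardCharsets.UTF_8);
        ObjectStream<TokenSample> samples = new TokenSampleStream(lines);

        // Maxent tokenizer for English with alphanumeric optimization, no abbreviation dictionary.
        TokenizerFactory factory = new TokenizerFactory("en", null, true, null);
        TokenizerModel model = TokenizerME.train(samples, factory, TrainingParameters.defaultParams());

        // Persist the model so it can be reloaded later, e.g. as "mymodel.bin" in the example above.
        try (OutputStream out = new BufferedOutputStream(new FileOutputStream("mymodel.bin"))) {
            model.serialize(out);
        }
    }
}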
Example #4
Source File: OpenNLPAnnotator.java From Stargraph with MIT License
@Override
public List<Word> doRun(Language language, String sentence) {
    Tokenizer tokenizer = new TokenizerME(getTokenizerModel(language));
    POSTaggerME tagger = new POSTaggerME(getPOSModel(language));

    String[] tokens = tokenizer.tokenize(sentence);
    String[] tags = tagger.tag(tokens);

    PartOfSpeechSet posSet = PartOfSpeechSet.getPOSSet(language);
    List<Word> words = new ArrayList<>();
    for (int i = 0; i < tokens.length; i++) {
        words.add(new Word(posSet.valueOf(tags[i]), tokens[i]));
    }

    return words;
}
Example #5
Source File: ConcurrentTokenizer.java From DataVec with Apache License 2.0
/**
 * Initializes the current instance with the given context.
 *
 * Note: Do all initialization in this method, do not use the constructor.
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    TokenizerModel model;
    try {
        TokenizerModelResource modelResource =
                (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        model = modelResource.getModel();
    } catch (ResourceAccessException e) {
        throw new ResourceInitializationException(e);
    }

    tokenizer = new TokenizerME(model);
}
Example #6
Source File: ConcurrentTokenizer.java From Canova with Apache License 2.0
/**
 * Initializes the current instance with the given context.
 *
 * Note: Do all initialization in this method, do not use the constructor.
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    TokenizerModel model;
    try {
        TokenizerModelResource modelResource =
                (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        model = modelResource.getModel();
    } catch (ResourceAccessException e) {
        throw new ResourceInitializationException(e);
    }

    tokenizer = new TokenizerME(model);
}
Example #7
Source File: OpenNLP.java From baleen with Apache License 2.0
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    try {
        tokensModel.loadModel(TokenizerModel.class, getClass().getResourceAsStream("en_token.bin"));
        sentencesModel.loadModel(SentenceModel.class, getClass().getResourceAsStream("en_sent.bin"));
        posModel.loadModel(POSModel.class, getClass().getResourceAsStream("en_pos_maxent.bin"));
        chunkModel.loadModel(ChunkerModel.class, getClass().getResourceAsStream("en_chunker.bin"));
    } catch (BaleenException be) {
        getMonitor().error("Unable to load OpenNLP Language Models", be);
        throw new ResourceInitializationException(be);
    }

    try {
        sentenceDetector = new SentenceDetectorME((SentenceModel) sentencesModel.getModel());
        wordTokenizer = new TokenizerME((TokenizerModel) tokensModel.getModel());
        posTagger = new POSTaggerME((POSModel) posModel.getModel());
        phraseChunker = new ChunkerME((ChunkerModel) chunkModel.getModel());
    } catch (Exception e) {
        getMonitor().error("Unable to create OpenNLP taggers", e);
        throw new ResourceInitializationException(e);
    }
}
Example #8
Source File: TokenizeME.java From datafu with Apache License 2.0
public DataBag exec(Tuple input) throws IOException {
    if (input.size() != 1) {
        throw new IOException();
    }

    String inputString = input.get(0).toString();
    if (inputString == null || inputString.isEmpty()) {
        return null;
    }

    DataBag outBag = bf.newDefaultBag();

    // Lazily load the tokenizer model on first use.
    if (this.tokenizer == null) {
        String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);
        InputStream file = new FileInputStream(loadFile);
        InputStream buffer = new BufferedInputStream(file);
        TokenizerModel model = new TokenizerModel(buffer);
        this.tokenizer = new TokenizerME(model);
    }

    String tokens[] = this.tokenizer.tokenize(inputString);
    for (String token : tokens) {
        Tuple outTuple = tf.newTuple(token);
        outBag.add(outTuple);
    }
    return outBag;
}
Example #9
Source File: JM_Scorer.java From uncc2014watsonsim with GNU General Public License v2.0
public double scoreStructure(String ca, String q, String passage, boolean verbose)
        throws InvalidFormatException, IOException {
    POSTaggerME parserModel = new POSTaggerME(new POSModel(new FileInputStream(new File("en-pos-model.bin"))));
    Tokenizer tokenizer = new TokenizerME(new TokenizerModel(new FileInputStream(new File("en-token.bin"))));
    Parser parser = ParserFactory.create(new ParserModel(new FileInputStream(new File("en-parser.bin"))));

    double score = 0;
    Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
    Parse[] passageParse = ParserTool.parseLine(passage, parser, 1);
    if (passage.contains(ca)) {
        for (int i = 0; i < questionParse.length; i++) {
            score += matchChildren(questionParse[i], passageParse[i]);
        }
    }
    return score;
}
Example #10
Source File: OpenNLPTokenizerFactory.java From jate with GNU Lesser General Public License v3.0
@Override
public void inform(ResourceLoader loader) throws IOException {
    if (sentenceModelFile != null) {
        sentenceOp = new SentenceDetectorME(new SentenceModel(
                loader.openResource(sentenceModelFile)));
    }

    if (tokenizerModelFile == null) {
        throw new IOException("Parameter 'tokenizerModel' is required, but is invalid: " + tokenizerModelFile);
    }
    tokenizerOp = new TokenizerME(new TokenizerModel(
            loader.openResource(tokenizerModelFile)));

    if (parChunkingClass != null) {
        try {
            Class c = Class.forName(parChunkingClass);
            Object o = c.newInstance();
            paragraphChunker = (ParagraphChunker) o;
        } catch (Exception e) {
            throw new IOException(e);
        }
    }
}
Example #11
Source File: BasicActions.java From knowledge-extraction with Apache License 2.0
public String[] testTokenizer() {
    String[] tokens = {};
    try (InputStream modelIn = BasicActions.class.getClassLoader()
            .getResourceAsStream(Consts.EN_TOKEN_MODEL)) {
        TokenizerModel tokenModel = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        tokens = tokenizer.tokenize(TEST_PHRASE);
        System.out.println(Arrays.toString(tokens));
    } catch (IOException e) {
        e.printStackTrace();
    }
    return tokens;
}
Example #12
Source File: TokenizerUnitTest.java From tutorials with MIT License
@Test
public void givenEnglishModel_whenTokenize_thenTokensAreDetected() throws Exception {
    InputStream inputStream = getClass().getResourceAsStream("/models/en-token.bin");
    TokenizerModel model = new TokenizerModel(inputStream);
    TokenizerME tokenizer = new TokenizerME(model);
    String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
    assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
}
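The test above only checks the token strings. TokenizerME also offers tokenizePos(), which returns opennlp.tools.util.Span objects carrying character offsets into the original text, and getTokenProbabilities(), which reports the model's confidence for the tokens from the most recent tokenization call. None of the projects listed here use these methods; the sketch below is illustrative only (the class name is a placeholder, and it assumes the same /models/en-token.bin classpath resource as the test above).

import java.io.IOException;
import java.io.InputStream;

import opennlp.tools.tokenize.TokenizerME;
import opennlp.tools.tokenize.TokenizerModel;
import opennlp.tools.util.Span;

public class TokenSpanSketch {
    public static void main(String[] args) throws IOException {
        // Load the tokenizer model from the classpath (path is an assumption, see above).
        try (InputStream modelIn = TokenSpanSketch.class.getResourceAsStream("/models/en-token.bin")) {
            TokenizerME tokenizer = new TokenizerME(new TokenizerModel(modelIn));
            String text = "Baeldung is a Spring Resource.";
            // Spans carry begin/end character offsets instead of plain strings.
            Span[] spans = tokenizer.tokenizePos(text);
            // Probabilities correspond to the tokens of the most recent tokenize/tokenizePos call.
            double[] probs = tokenizer.getTokenProbabilities();
            for (int i = 0; i < spans.length; i++) {
                System.out.println(spans[i].getCoveredText(text) + " " + spans[i] + " p=" + probs[i]);
            }
        }
    }
}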
Example #13
Source File: ValueParser.java From TableDisentangler with GNU General Public License v3.0
public ValueParser() {
    try {
        InputStream is = new FileInputStream("en-token.bin");
        TokenizerModel model = new TokenizerModel(is);
        tokenizer = new TokenizerME(model);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
Example #14
Source File: OpenNlpModule.java From SciGraph with Apache License 2.0
@CheckedProvides(TokenizerProvider.class)
Tokenizer getTokenizer() throws IOException {
    try (InputStream is = getClass().getResourceAsStream("/opennlp/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        return new TokenizerME(model);
    }
}
Example #15
Source File: Chapter4.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingMultipleNERModels() {
    // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin
    // en-ner-organization.bin en-ner-time.bin
    try {
        InputStream tokenStream = new FileInputStream(new File(getModelDir(), "en-token.bin"));
        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin", "en-ner-organization.bin"};
        ArrayList<String> list = new ArrayList();
        for (String name : modelNames) {
            TokenNameFinderModel entityModel = new TokenNameFinderModel(
                    new FileInputStream(new File(getModelDir(), name)));
            NameFinderME nameFinder = new NameFinderME(entityModel);
            for (int index = 0; index < sentences.length; index++) {
                String tokens[] = tokenizer.tokenize(sentences[index]);
                Span nameSpans[] = nameFinder.find(tokens);
                for (Span span : nameSpans) {
                    list.add("Sentence: " + index
                            + " Span: " + span.toString()
                            + " Entity: " + tokens[span.getStart()]);
                }
            }
        }
        System.out.println("Multiple Entities");
        for (String element : list) {
            System.out.println(element);
        }
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
Example #16
Source File: OpenNLPToolsTokenizerWrapper.java From mateplus with GNU General Public License v2.0
public static OpenNLPToolsTokenizerWrapper loadOpenNLPTokenizer(File modelFile) throws IOException {
    BufferedInputStream modelIn = new BufferedInputStream(new FileInputStream(modelFile.toString()));
    opennlp.tools.tokenize.Tokenizer tokenizer = new TokenizerME(new TokenizerModel(modelIn));
    return new OpenNLPToolsTokenizerWrapper(tokenizer);
}
Example #17
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
private static void usingTheTokenizerMEClass() {
    try {
        InputStream modelIn = new FileInputStream(new File(getModelDir(), "en-token.bin"));
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);
        String tokens[] = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #18
Source File: OpenNLPTokenAnnotator.java From modernmt with Apache License 2.0
private OpenNLPTokenAnnotator(TokenizerME tokenizer) {
    this.tokenizer = tokenizer;
}
Example #19
Source File: NLPTokenizerOp.java From lucene-solr with Apache License 2.0
public NLPTokenizerOp(TokenizerModel model) {
    tokenizer = new TokenizerME(model);
}
Example #20
Source File: NERDemo.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License
public static void main(String args[]) {
    String sentences[] = {
            "Joe was the last person to see Fred. ",
            "He saw him in Boston at McKenzie's pub at 3:00 where he "
                    + " paid $2.45 for an ale. ",
            "Joe wanted to go to Vermont for the day to visit a cousin who "
                    + "works at IBM, but Sally and he had to look for Fred"};
    String sentence = "He was the last person to see Fred.";
    try {
        InputStream tokenStream = new FileInputStream(new File(getResourcePath() + "en-token.bin"));
        InputStream modelStream = new FileInputStream(new File(getResourcePath() + "en-ner-person.bin"));

        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);

        TokenNameFinderModel entityModel = new TokenNameFinderModel(modelStream);
        NameFinderME nameFinder = new NameFinderME(entityModel);

        String tokens1[] = tokenizer.tokenize(sentence);
        Span nameSpans1[] = nameFinder.find(tokens1);
        for (int i = 0; i < nameSpans1.length; i++) {
            System.out.println("Span: " + nameSpans1[i].toString());
            System.out.println("Entity: " + tokens1[nameSpans1[i].getStart()]);
        }

        System.out.println("---------- Multiple Sentences -----------");
        for (String sentence1 : sentences) {
            String tokens[] = tokenizer.tokenize(sentence1);
            Span nameSpans[] = nameFinder.find(tokens);
            for (int i = 0; i < nameSpans.length; i++) {
                System.out.println("Span: " + nameSpans[i].toString());
                System.out.println("Entity: " + tokens[nameSpans[i].getStart()]);
            }
            System.out.println();
        }
    } catch (Exception e) {
        System.out.println(e);
    }
}