opennlp.tools.tokenize.TokenizerModel Java Examples
The following examples show how to use
opennlp.tools.tokenize.TokenizerModel.
You can vote up the examples you like or vote down the ones you don't like,
and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example #1
Source File: TokenSegmenter.java From dexter with Apache License 2.0 | 6 votes |
public TokenSegmenter() { InputStream modelIn = null; try { // Loading tokenizer model modelIn = getClass().getResourceAsStream("/nlp/en-token.bin"); final TokenizerModel tokenModel = new TokenizerModel(modelIn); modelIn.close(); tokenizer = new TokenizerME(tokenModel); } catch (final IOException ioe) { ioe.printStackTrace(); } finally { if (modelIn != null) { try { modelIn.close(); } catch (final IOException e) { } // oh well! } } }
Example #2
Source File: ConcurrentTokenizer.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Initializes this analysis component from the given UIMA context.
 * All setup happens here rather than in the constructor, per UIMA convention.
 *
 * @param context the UIMA context supplying the shared tokenizer model resource
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    final TokenizerModel sharedModel;
    try {
        TokenizerModelResource resource =
                (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        sharedModel = resource.getModel();
    } catch (ResourceAccessException cause) {
        throw new ResourceInitializationException(cause);
    }

    tokenizer = new TokenizerME(sharedModel);
}
Example #3
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
/**
 * Trains a tokenizer model via {@code createOpenNLPModel()}, then loads the
 * resulting "mymodel.bin" from the working directory and tokenizes a sample
 * paragraph, printing one token per line.
 */
private static void trainingATokenizer() {
    createOpenNLPModel();
    paragraph = "A demonstration of how to train a tokenizer.";
    // try-with-resources: the original never closed the model stream.
    try (InputStream modelInputStream =
            new FileInputStream(new File(".", "mymodel.bin"))) {
        TokenizerModel model = new TokenizerModel(modelInputStream);
        Tokenizer tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #4
Source File: ConcurrentTokenizer.java From deeplearning4j with Apache License 2.0 | 6 votes |
/**
 * Initializes this analysis component from the given UIMA context.
 * All setup happens here rather than in the constructor, per UIMA convention.
 *
 * @param context the UIMA context supplying the shared tokenizer model resource
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    final TokenizerModel sharedModel;
    try {
        TokenizerModelResource resource =
                (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        sharedModel = resource.getModel();
    } catch (ResourceAccessException cause) {
        throw new ResourceInitializationException(cause);
    }

    tokenizer = new TokenizerME(sharedModel);
}
Example #5
Source File: ConcurrentTokenizer.java From Canova with Apache License 2.0 | 6 votes |
/**
 * Initializes this analysis component from the given UIMA context.
 * All setup happens here rather than in the constructor, per UIMA convention.
 *
 * @param context the UIMA context supplying the shared tokenizer model resource
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    final TokenizerModel sharedModel;
    try {
        TokenizerModelResource resource =
                (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        sharedModel = resource.getModel();
    } catch (ResourceAccessException cause) {
        throw new ResourceInitializationException(cause);
    }

    tokenizer = new TokenizerME(sharedModel);
}
Example #6
Source File: ConcurrentTokenizer.java From DataVec with Apache License 2.0 | 6 votes |
/**
 * Initializes this analysis component from the given UIMA context.
 * All setup happens here rather than in the constructor, per UIMA convention.
 *
 * @param context the UIMA context supplying the shared tokenizer model resource
 * @throws ResourceInitializationException if the model resource cannot be accessed
 */
public void initialize(UimaContext context) throws ResourceInitializationException {
    super.initialize(context);

    final TokenizerModel sharedModel;
    try {
        TokenizerModelResource resource =
                (TokenizerModelResource) context.getResourceObject(UimaUtil.MODEL_PARAMETER);
        sharedModel = resource.getModel();
    } catch (ResourceAccessException cause) {
        throw new ResourceInitializationException(cause);
    }

    tokenizer = new TokenizerME(sharedModel);
}
Example #7
Source File: TokenizeME.java From datafu with Apache License 2.0 | 6 votes |
/**
 * Tokenizes the single string field of the input tuple with a lazily-loaded
 * OpenNLP maxent tokenizer and returns the tokens as a bag of single-field
 * tuples.
 *
 * @param input a tuple with exactly one field containing the text to tokenize
 * @return a bag of (token) tuples, or null for null/empty input
 * @throws IOException if the arity is wrong or the model cannot be read
 */
public DataBag exec(Tuple input) throws IOException {
    if (input.size() != 1) {
        // Original threw a message-less IOException; keep the type, add context.
        throw new IOException("Expected exactly one input field, got " + input.size());
    }
    Object rawField = input.get(0);
    String inputString = rawField == null ? null : rawField.toString();
    // BUG FIX: the original compared with 'inputString == ""' (reference
    // equality, always false for a fresh string); use isEmpty() instead.
    if (inputString == null || inputString.isEmpty()) {
        return null;
    }
    DataBag outBag = bf.newDefaultBag();
    if (this.tokenizer == null) {
        String loadFile = CachedFile.getFileName(MODEL_FILE, this.modelPath);
        // try-with-resources: the original leaked both streams.
        try (InputStream buffer =
                new BufferedInputStream(new FileInputStream(loadFile))) {
            TokenizerModel model = new TokenizerModel(buffer);
            this.tokenizer = new TokenizerME(model);
        }
    }
    String[] tokens = this.tokenizer.tokenize(inputString);
    for (String token : tokens) {
        outBag.add(tf.newTuple(token));
    }
    return outBag;
}
Example #8
Source File: OpenNLP.java From baleen with Apache License 2.0 | 6 votes |
/**
 * Loads the four OpenNLP models (tokenizer, sentence detector, POS tagger,
 * chunker) from the classpath and then builds the corresponding maxent
 * tools. Loading and construction are wrapped in separate try blocks so the
 * monitor error message identifies which stage failed.
 *
 * @throws ResourceInitializationException if a model cannot be loaded or a
 *         tool cannot be constructed from it
 */
@Override
public void doInitialize(UimaContext aContext) throws ResourceInitializationException {
    try {
        // Shared model resources, each loaded from a bundled binary model file.
        tokensModel.loadModel(TokenizerModel.class, getClass().getResourceAsStream("en_token.bin"));
        sentencesModel.loadModel(SentenceModel.class, getClass().getResourceAsStream("en_sent.bin"));
        posModel.loadModel(POSModel.class, getClass().getResourceAsStream("en_pos_maxent.bin"));
        chunkModel.loadModel(ChunkerModel.class, getClass().getResourceAsStream("en_chunker.bin"));
    } catch (BaleenException be) {
        getMonitor().error("Unable to load OpenNLP Language Models", be);
        throw new ResourceInitializationException(be);
    }
    try {
        // Build the runtime NLP tools from the loaded shared models.
        sentenceDetector = new SentenceDetectorME((SentenceModel) sentencesModel.getModel());
        wordTokenizer = new TokenizerME((TokenizerModel) tokensModel.getModel());
        posTagger = new POSTaggerME((POSModel) posModel.getModel());
        phraseChunker = new ChunkerME((ChunkerModel) chunkModel.getModel());
    } catch (Exception e) {
        getMonitor().error("Unable to create OpenNLP taggers", e);
        throw new ResourceInitializationException(e);
    }
}
Example #9
Source File: SharedOpenNLPModelTest.java From baleen with Apache License 2.0 | 6 votes |
@Test public void testLoad() throws Exception { SharedOpenNLPModel m = new SharedOpenNLPModel(); m.loadModel(TokenizerModel.class, OpenNLP.class.getResourceAsStream("en_token.bin")); BaseModel bm = m.getModel(); assertNotNull(bm); assertTrue(bm instanceof TokenizerModel); assertEquals("en", bm.getLanguage()); // Trying to load a different model shouldn't change the resource m.loadModel(SentenceModel.class, OpenNLP.class.getResourceAsStream("en_sent.bin")); assertEquals(bm, m.getModel()); m.doDestroy(); }
Example #10
Source File: OpenNLPTokenizerFactory.java From jate with GNU Lesser General Public License v3.0 | 6 votes |
/**
 * Builds the OpenNLP components from the configured model files. The
 * sentence detector and paragraph chunker are optional; the tokenizer model
 * is mandatory.
 *
 * @param loader resource loader used to open the model files
 * @throws IOException if the tokenizer model parameter is missing or a
 *         model/chunker cannot be loaded
 */
@Override
public void inform(ResourceLoader loader) throws IOException {
    if (sentenceModelFile != null) {
        sentenceOp = new SentenceDetectorME(new SentenceModel(
                loader.openResource(sentenceModelFile)));
    }
    if (tokenizerModelFile == null) {
        // Fixed typo ("tokenizerModle") and missing space in the original message.
        throw new IOException(
                "Parameter 'tokenizerModel' is required, but is invalid: " + tokenizerModelFile);
    }
    tokenizerOp = new TokenizerME(new TokenizerModel(
            loader.openResource(tokenizerModelFile)));
    if (parChunkingClass != null) {
        try {
            Class<?> chunkerClass = Class.forName(parChunkingClass);
            // getDeclaredConstructor().newInstance() replaces the deprecated
            // Class.newInstance(), which bypasses checked-exception wrapping.
            paragraphChunker = (ParagraphChunker) chunkerClass.getDeclaredConstructor().newInstance();
        } catch (Exception e) {
            throw new IOException(e);
        }
    }
}
Example #11
Source File: JM_Scorer.java From uncc2014watsonsim with GNU General Public License v2.0 | 6 votes |
/**
 * Scores how well the passage's parse structure matches the question's,
 * provided the passage contains the candidate answer.
 *
 * @param ca candidate answer string
 * @param q question text
 * @param passage passage text to score against the question
 * @param verbose unused verbosity flag (kept for interface compatibility)
 * @return accumulated child-match score; 0 if the passage lacks the answer
 * @throws InvalidFormatException if a model file is malformed
 * @throws IOException if a model file cannot be read
 */
public double scoreStructure(String ca, String q, String passage, boolean verbose)
        throws InvalidFormatException, IOException {
    // NOTE(review): the model is reloaded on every call; consider caching it
    // in a field if this method is hot. The original also built a POS tagger
    // and tokenizer here that were never used — removed.
    Parser parser;
    // try-with-resources: the original never closed the model stream.
    try (InputStream parserIn = new FileInputStream(new File("en-parser.bin"))) {
        parser = ParserFactory.create(new ParserModel(parserIn));
    }
    double score = 0;
    Parse[] questionParse = ParserTool.parseLine(q, parser, 1);
    // BUG FIX: the original parsed the question a second time here, so the
    // passage's structure was never examined; parse the passage instead.
    Parse[] passageParse = ParserTool.parseLine(passage, parser, 1);
    if (passage.contains(ca)) {
        for (int i = 0; i < questionParse.length; i++) {
            score += matchChildren(questionParse[i], passageParse[i]);
        }
    }
    return score;
}
Example #12
Source File: OpenNLPToolsTokenizerWrapper.java From mateplus with GNU General Public License v2.0 | 5 votes |
/**
 * Loads an OpenNLP maxent tokenizer from the given model file.
 *
 * @param modelFile binary tokenizer model on disk
 * @return a wrapper around the loaded tokenizer
 * @throws IOException if the model cannot be read
 */
public static OpenNLPToolsTokenizerWrapper loadOpenNLPTokenizer(
        File modelFile) throws IOException {
    // try-with-resources: the original never closed the model stream.
    // FileInputStream(File) replaces the roundabout FileInputStream(modelFile.toString()).
    try (InputStream modelIn = new BufferedInputStream(new FileInputStream(modelFile))) {
        opennlp.tools.tokenize.Tokenizer tokenizer = new TokenizerME(
                new TokenizerModel(modelIn));
        return new OpenNLPToolsTokenizerWrapper(tokenizer);
    }
}
Example #13
Source File: TokenizerUnitTest.java From tutorials with MIT License | 5 votes |
/**
 * Tokenizes a sample sentence with the bundled English model and checks
 * that the expected tokens are present.
 */
@Test
public void givenEnglishModel_whenTokenize_thenTokensAreDetected() throws Exception {
    // try-with-resources: the original never closed the model stream.
    try (InputStream inputStream = getClass().getResourceAsStream("/models/en-token.bin")) {
        TokenizerModel model = new TokenizerModel(inputStream);
        TokenizerME tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize("Baeldung is a Spring Resource.");
        assertThat(tokens).contains("Baeldung", "is", "a", "Spring", "Resource", ".");
    }
}
Example #14
Source File: ValueParser.java From TableDisentangler with GNU General Public License v3.0 | 5 votes |
/**
 * Creates a ValueParser backed by an OpenNLP maxent tokenizer loaded from
 * "en-token.bin" in the working directory. Load failures are logged and
 * swallowed (best-effort), leaving {@code tokenizer} unset — same as the
 * original.
 */
public ValueParser() {
    // try-with-resources: the original never closed the model stream.
    try (InputStream is = new FileInputStream("en-token.bin")) {
        TokenizerModel model = new TokenizerModel(is);
        tokenizer = new TokenizerME(model);
    } catch (Exception ex) {
        ex.printStackTrace();
    }
}
Example #15
Source File: OpenNlpModule.java From SciGraph with Apache License 2.0 | 5 votes |
/**
 * Provides a maxent tokenizer built from the bundled English token model.
 *
 * @return a TokenizerME over the classpath model
 * @throws IOException if the model stream cannot be read
 */
@CheckedProvides(TokenizerProvider.class)
Tokenizer getTokenizer() throws IOException {
    try (InputStream modelStream = getClass().getResourceAsStream("/opennlp/en-token.bin")) {
        return new TokenizerME(new TokenizerModel(modelStream));
    }
}
Example #16
Source File: BasicActions.java From knowledge-extraction with Apache License 2.0 | 5 votes |
/**
 * Tokenizes the test phrase with the bundled English model and prints the
 * tokens; returns an empty array if the model cannot be read.
 *
 * @return the tokens of TEST_PHRASE, or an empty array on I/O failure
 */
public String[] testTokenizer() {
    String[] result = {};
    try (InputStream modelIn =
            BasicActions.class.getClassLoader().getResourceAsStream(Consts.EN_TOKEN_MODEL)) {
        Tokenizer tokenizer = new TokenizerME(new TokenizerModel(modelIn));
        result = tokenizer.tokenize(TEST_PHRASE);
        System.out.println(Arrays.toString(result));
    } catch (IOException e) {
        e.printStackTrace();
    }
    return result;
}
Example #17
Source File: Chapter4.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
private static void usingMultipleNERModels() { // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin // en-ner-organization.bin en-ner-time.bin try { InputStream tokenStream = new FileInputStream( new File(getModelDir(), "en-token.bin")); TokenizerModel tokenModel = new TokenizerModel(tokenStream); Tokenizer tokenizer = new TokenizerME(tokenModel); String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin", "en-ner-organization.bin"}; ArrayList<String> list = new ArrayList(); for (String name : modelNames) { TokenNameFinderModel entityModel = new TokenNameFinderModel( new FileInputStream( new File(getModelDir(), name))); NameFinderME nameFinder = new NameFinderME(entityModel); for (int index = 0; index < sentences.length; index++) { String tokens[] = tokenizer.tokenize(sentences[index]); Span nameSpans[] = nameFinder.find(tokens); for (Span span : nameSpans) { list.add("Sentence: " + index + " Span: " + span.toString() + " Entity: " + tokens[span.getStart()]); } } } System.out.println("Multiple Entities"); for (String element : list) { System.out.println(element); } } catch (Exception ex) { ex.printStackTrace(); } }
Example #18
Source File: OpenNLPOpsFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Returns a tokenizer op for the given model name; a null name yields the
 * default (model-less) op.
 *
 * @param modelName key into the preloaded tokenizer model cache, or null
 * @return the tokenizer op
 * @throws IOException declared for interface compatibility
 */
public static NLPTokenizerOp getTokenizer(String modelName) throws IOException {
    if (modelName == null) {
        return new NLPTokenizerOp();
    }
    // NOTE(review): assumes the model was registered earlier; a missing key
    // yields a null model — confirm tokenizerModels is pre-populated.
    return new NLPTokenizerOp(tokenizerModels.get(modelName));
}
Example #19
Source File: OpenNLPAnnotator.java From Stargraph with MIT License | 5 votes |
/**
 * Reads the tokenizer model for the given language from the models
 * directory (file name pattern: "&lt;lang&gt;-token.bin").
 *
 * @param language language whose model to load
 * @return the loaded tokenizer model
 * @throws StarGraphException if the model file cannot be read
 */
private TokenizerModel readTokenizerModel(Language language) {
    logger.debug(marker, "Reading tokenizer model for {}", language);
    String fileName = String.format("%s-token.bin", language.name().toLowerCase());
    File modelFile = new File(modelsDir, fileName);
    try (InputStream in = new FileInputStream(modelFile)) {
        return new TokenizerModel(in);
    } catch (IOException e) {
        throw new StarGraphException("Can't read '" + modelFile + "'", e);
    }
}
Example #20
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
/**
 * Loads the English tokenizer model from the model directory, tokenizes the
 * shared 'paragraph', and prints one token per line.
 */
private static void usingTheTokenizerMEClass() {
    // try-with-resources: the original never closed the model stream.
    try (InputStream modelIn =
            new FileInputStream(new File(getModelDir(), "en-token.bin"))) {
        TokenizerModel model = new TokenizerModel(modelIn);
        Tokenizer tokenizer = new TokenizerME(model);
        String[] tokens = tokenizer.tokenize(paragraph);
        for (String token : tokens) {
            System.out.println(token);
        }
    } catch (IOException ex) {
        ex.printStackTrace();
    }
}
Example #21
Source File: Chapter4.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
private static void usingMultipleNERModels() { // Models - en-ner-person.bin en-ner-location.bin en-ner-money.bin // en-ner-organization.bin en-ner-time.bin try { InputStream tokenStream = new FileInputStream( new File(getModelDir(), "en-token.bin")); TokenizerModel tokenModel = new TokenizerModel(tokenStream); Tokenizer tokenizer = new TokenizerME(tokenModel); String modelNames[] = {"en-ner-person.bin", "en-ner-location.bin", "en-ner-organization.bin"}; ArrayList<String> list = new ArrayList(); for (String name : modelNames) { TokenNameFinderModel entityModel = new TokenNameFinderModel( new FileInputStream( new File(getModelDir(), name))); NameFinderME nameFinder = new NameFinderME(entityModel); for (int index = 0; index < sentences.length; index++) { String tokens[] = tokenizer.tokenize(sentences[index]); Span nameSpans[] = nameFinder.find(tokens); for (Span span : nameSpans) { list.add("Sentence: " + index + " Span: " + span.toString() + " Entity: " + tokens[span.getStart()]); } } } System.out.println("Multiple Entities"); for (String element : list) { System.out.println(element); } } catch (Exception ex) { ex.printStackTrace(); } }
Example #22
Source File: NLPTokenizerOp.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Creates a tokenizer op backed by a maxent TokenizerME built from the
 * given model.
 *
 * @param model the OpenNLP tokenizer model to wrap
 */
public NLPTokenizerOp(TokenizerModel model) {
    tokenizer = new TokenizerME(model);
}
Example #23
Source File: OpenNLPAnnotator.java From Stargraph with MIT License | 4 votes |
/**
 * Returns the cached tokenizer model for the language, loading and caching
 * it on first use via readTokenizerModel.
 *
 * @param language the language whose model is requested
 * @return the (possibly newly loaded) tokenizer model
 */
private TokenizerModel getTokenizerModel(Language language) {
    return tokenizerModels.computeIfAbsent(language, this::readTokenizerModel);
}
Example #24
Source File: NERDemo.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 4 votes |
/**
 * Demo: runs OpenNLP person NER over a single sentence and then over an
 * array of sentences, printing each detected span and the entity token.
 */
public static void main(String args[]) {
    String sentences[] = {
        "Joe was the last person to see Fred. ",
        "He saw him in Boston at McKenzie's pub at 3:00 where he "
            + " paid $2.45 for an ale. ",
        "Joe wanted to go to Vermont for the day to visit a cousin who "
            + "works at IBM, but Sally and he had to look for Fred"};
    String sentence = "He was the last person to see Fred.";
    // try-with-resources: the original never closed either model stream.
    try (InputStream tokenStream =
            new FileInputStream(new File(getResourcePath() + "en-token.bin"));
         InputStream modelStream =
            new FileInputStream(new File(getResourcePath() + "en-ner-person.bin"))) {
        TokenizerModel tokenModel = new TokenizerModel(tokenStream);
        Tokenizer tokenizer = new TokenizerME(tokenModel);
        TokenNameFinderModel entityModel = new TokenNameFinderModel(modelStream);
        NameFinderME nameFinder = new NameFinderME(entityModel);

        String tokens1[] = tokenizer.tokenize(sentence);
        Span nameSpans1[] = nameFinder.find(tokens1);
        for (int i = 0; i < nameSpans1.length; i++) {
            System.out.println("Span: " + nameSpans1[i].toString());
            System.out.println("Entity: " + tokens1[nameSpans1[i].getStart()]);
        }
        System.out.println("---------- Multiple Sentences -----------");
        for (String sentence1 : sentences) {
            String tokens[] = tokenizer.tokenize(sentence1);
            Span nameSpans[] = nameFinder.find(tokens);
            for (int i = 0; i < nameSpans.length; i++) {
                System.out.println("Span: " + nameSpans[i].toString());
                System.out.println("Entity: " + tokens[nameSpans[i].getStart()]);
            }
            System.out.println();
        }
    } catch (Exception e) {
        System.out.println(e);
    }
}