com.aliasi.tokenizer.TokenizerFactory Java Examples

The following examples show how to use com.aliasi.tokenizer.TokenizerFactory. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TrainEntities.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 7 votes vote down vote up
/**
 * Builds a character-language-model HMM chunker estimator and compiles it
 * to a model file.
 *
 * <p>NOTE: the corpus-parsing calls are commented out in this example, so as
 * written the estimator is compiled without the corpus file ever being read.
 * {@code MAX_N_GRAM}, {@code NUM_CHARS} and {@code LM_INTERPOLATION} are not
 * declared in this method; presumably they are constants of the enclosing
 * class.
 *
 * @param args command-line arguments; not read by this method
 * @throws IOException declared by the signature; presumably raised when the
 *         compiled model cannot be written
 */
public static void main(String[] args) throws IOException {
        final File trainingCorpus = new File("inputfile.txt"); // annotated training input
        final File compiledModel = new File("outputmodelfile.model");

        System.out.println("Setting up Chunker Estimator");
        final TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
        final HmmCharLmEstimator estimator =
                new HmmCharLmEstimator(MAX_N_GRAM, NUM_CHARS, LM_INTERPOLATION);
        final CharLmHmmChunker trainableChunker =
                new CharLmHmmChunker(tokenizerFactory, estimator);

        System.out.println("Setting up Data Parser");
        // Parser wiring is disabled in this example:
        //   Muc6ChunkParser parser = new Muc6ChunkParser();
        //   parser.setHandler(trainableChunker);

        System.out.println("Training with Data from File=" + trainingCorpus);
        // Disabled together with the parser above:
        //   parser.parse(trainingCorpus);

        System.out.println("Compiling and Writing Model to File=" + compiledModel);
        AbstractExternalizable.compileTo(trainableChunker, compiledModel);
    }
 
Example #2
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 6 votes vote down vote up
/**
 * Prints each sample word followed by the token(s) produced when the word is
 * run through a Porter-stemming tokenizer factory.
 *
 * <p>Output format per word: {@code "Word: <word>"} without a line break,
 * then one {@code "  Stem: <stem>"} line per token returned.
 */
private static void usingTheLingPipeStemmer() {
    String[] words = {"bank", "banking", "banks", "banker",
        "banked", "bankart"};
    // Wrap the base tokenizer so every token it emits is passed through the stemmer.
    TokenizerFactory porterFactory
            = new PorterStemmerTokenizerFactory(IndoEuropeanTokenizerFactory.INSTANCE);
    for (String word : words) {
        // The stems array is scoped to the iteration; no up-front allocation is
        // needed because tokens() returns its own array.
        String[] stems
                = new com.aliasi.tokenizer.Tokenization(word, porterFactory).tokens();
        System.out.print("Word: " + word);
        for (String stem : stems) {
            System.out.println("  Stem: " + stem);
        }
    }

}
 
Example #3
Source File: LuceneInMemorySentenceRetrievalExecutor.java    From bioasq with Apache License 2.0 6 votes vote down vote up
/**
 * Configures this component from the UIMA context: builds the sentence
 * chunker, reads the hit limit, and creates the query analyzer, query parser
 * and query-string constructor.
 *
 * @param context UIMA context supplying the configuration parameters
 * @throws ResourceInitializationException declared by the signature;
 *         presumably propagated from the superclass initialization
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);

  // Sentence chunker: tokenizer factory + sentence model, each created from
  // its config parameter pair with the class literal passed as the fallback.
  TokenizerFactory tokFactory = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "tokenizer-factory",
          "tokenizer-factory-params",
          IndoEuropeanTokenizerFactory.class,
          TokenizerFactory.class);
  SentenceModel sentModel = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "sentence-model",
          "sentence-model-params",
          IndoEuropeanSentenceModel.class,
          SentenceModel.class);
  chunker = new SentenceChunker(tokFactory, sentModel);

  // Hit limit; 200 is the value passed alongside the parameter name.
  hits = UimaContextHelper.getConfigParameterIntValue(context, "hits", 200);

  // Query analyzer and the parser over the "text" field that uses it.
  analyzer = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "query-analyzer",
          "query-analyzer-params",
          StandardAnalyzer.class,
          Analyzer.class);
  parser = new QueryParser("text", analyzer);

  // Strategy object that builds the query string.
  queryStringConstructor = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "query-string-constructor",
          "query-string-constructor-params",
          BooleanBagOfPhraseQueryStringConstructor.class,
          QueryStringConstructor.class);
}
 
Example #4
Source File: ImprovedLuceneInMemorySentenceRetrievalExecutor.java    From bioasq with Apache License 2.0 6 votes vote down vote up
/**
 * Configures this component from the UIMA context: builds the sentence
 * chunker, reads the hit limit, creates the query analyzer, query parser and
 * query-string constructor, looks up the parser provider, and instantiates
 * the lemmatizer.
 *
 * @param context UIMA context supplying the configuration parameters
 * @throws ResourceInitializationException declared by the signature;
 *         presumably propagated from the superclass initialization
 */
@Override
public void initialize(UimaContext context) throws ResourceInitializationException {
  super.initialize(context);

  // Sentence chunker: tokenizer factory + sentence model, each created from
  // its config parameter pair with the class literal passed as the fallback.
  TokenizerFactory tokFactory = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "tokenizer-factory",
          "tokenizer-factory-params",
          IndoEuropeanTokenizerFactory.class,
          TokenizerFactory.class);
  SentenceModel sentModel = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "sentence-model",
          "sentence-model-params",
          IndoEuropeanSentenceModel.class,
          SentenceModel.class);
  chunker = new SentenceChunker(tokFactory, sentModel);

  // Hit limit; 200 is the value passed alongside the parameter name.
  hits = UimaContextHelper.getConfigParameterIntValue(context, "hits", 200);

  // Query analyzer and the parser over the "text" field that uses it.
  analyzer = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "query-analyzer",
          "query-analyzer-params",
          StandardAnalyzer.class,
          Analyzer.class);
  parser = new QueryParser("text", analyzer);

  // Strategy object that builds the query string.
  queryStringConstructor = UimaContextHelper.createObjectFromConfigParameter(
          context,
          "query-string-constructor",
          "query-string-constructor-params",
          BagOfPhraseQueryStringConstructor.class,
          QueryStringConstructor.class);

  // Parser provider is resolved by the name configured under "parser-provider".
  String providerName =
          UimaContextHelper.getConfigParameterStringValue(context, "parser-provider");
  parserProvider = ProviderCache.getProvider(providerName, ParserProvider.class);

  lemma = new StanfordLemmatizer();
}
 
Example #5
Source File: Chapter2.java    From Natural-Language-Processing-with-Java-Second-Edition with MIT License 5 votes vote down vote up
/**
 * Tokenizes {@code paragraph} with the Indo-European tokenizer factory and
 * prints each token on its own line.
 *
 * <p>NOTE(review): {@code paragraph} is not declared in this method (the
 * local declaration is commented out); presumably it is a field of the
 * enclosing class - confirm.
 */
private static void usingLingPipeTokenizers() {
        // Local sample disabled in the original:
        //   String paragraph = "sample text string";
        final char[] chars = paragraph.toCharArray();
        final TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE;
        // Tokenize the full character range [0, chars.length).
        final com.aliasi.tokenizer.Tokenizer tokens =
                factory.tokenizer(chars, 0, chars.length);
        for (String current : tokens) {
            System.out.println(current);
        }
    }
 
Example #6
Source File: TweetHandler.java    From Java-for-Data-Science with MIT License 5 votes vote down vote up
/**
 * Re-tokenizes this handler's text through an English stop-word filtering
 * tokenizer factory and replaces the text with the resulting tokens, each
 * followed by a single space (including the last one).
 *
 * @return this handler, to allow call chaining
 */
public TweetHandler removeStopWords() {
    // Base tokenizer wrapped by the English stop-list filter.
    final TokenizerFactory stopFilteringFactory
            = new EnglishStopTokenizerFactory(IndoEuropeanTokenizerFactory.INSTANCE);
    final char[] chars = this.text.toCharArray();
    final Tokenizer remaining = stopFilteringFactory.tokenizer(chars, 0, chars.length);

    final StringBuilder rebuilt = new StringBuilder();
    for (String token : remaining) {
        rebuilt.append(token).append(" ");
    }
    this.text = rebuilt.toString();
    return this;
}
 
Example #7
Source File: SimpleStringCleaning.java    From Java-for-Data-Science with MIT License 5 votes vote down vote up
/**
 * Prints the given text, then prints the tokens that remain after the text
 * is lower-cased, trimmed and passed through an English stop-word filtering
 * tokenizer factory. Each remaining token is followed by a single space.
 *
 * <p>Lower-casing uses {@code Locale.ROOT} so the result does not depend on
 * the JVM default locale (for example, under a Turkish locale "I" would
 * otherwise become a dotless "ı" and no longer equal the English "i").
 *
 * @param text the text to filter; must not be {@code null}
 */
public static void removeStopWithLing(String text){
	//******************EXAMPLE WITH ling pipe *******************************************************************************************
	// Case matters here: the text is lower-cased before filtering.
	out.println(text);
	// Locale-independent lower-casing; fully qualified so no new import is required.
	text = text.toLowerCase(java.util.Locale.ROOT).trim();
	TokenizerFactory fact = IndoEuropeanTokenizerFactory.INSTANCE;
	// Wrap the base tokenizer with the English stop-list filter.
	fact = new EnglishStopTokenizerFactory(fact);
	Tokenizer tok = fact.tokenizer(text.toCharArray(), 0, text.length());
	for(String word : tok){
		out.print(word + " ");
	}
}