com.aliasi.tokenizer.TokenizerFactory Java Examples
The following examples show how to use
com.aliasi.tokenizer.TokenizerFactory.
You can vote up the examples you like or vote down the ones you don't,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: TrainEntities.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 7 votes |
public static void main(String[] args) throws IOException { File corpusFile = new File("inputfile.txt");// my annotated file File modelFile = new File("outputmodelfile.model"); System.out.println("Setting up Chunker Estimator"); TokenizerFactory factory = IndoEuropeanTokenizerFactory.INSTANCE; HmmCharLmEstimator hmmEstimator = new HmmCharLmEstimator(MAX_N_GRAM,NUM_CHARS,LM_INTERPOLATION); CharLmHmmChunker chunkerEstimator = new CharLmHmmChunker(factory,hmmEstimator); System.out.println("Setting up Data Parser"); // Muc6ChunkParser parser = new Muc6ChunkParser(); // parser.setHandler( chunkerEstimator); System.out.println("Training with Data from File=" + corpusFile); // parser.parse(corpusFile); System.out.println("Compiling and Writing Model to File=" + modelFile); AbstractExternalizable.compileTo(chunkerEstimator,modelFile); }
Example #2
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 6 votes |
/**
 * Demonstrates stemming with LingPipe's Porter stemmer: each input word is
 * run through a {@code PorterStemmerTokenizerFactory} wrapped around the
 * standard Indo-European tokenizer, and the resulting stem(s) are printed.
 *
 * Fix: the original pre-allocated {@code new String[words.length]} and then
 * overwrote that reference with {@code tokenizer.tokens()} on every loop
 * iteration; the dead allocation is removed and the token array is scoped
 * to each word instead.
 */
private static void usingTheLingPipeStemmer() {
    String[] words = {"bank", "banking", "banks", "banker", "banked", "bankart"};
    TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
    // Layer the Porter stemmer on top of the base tokenizer.
    TokenizerFactory porterFactory =
            new PorterStemmerTokenizerFactory(tokenizerFactory);
    for (String word : words) {
        com.aliasi.tokenizer.Tokenization tokenization =
                new com.aliasi.tokenizer.Tokenization(word, porterFactory);
        System.out.print("Word: " + word);
        for (String stem : tokenization.tokens()) {
            System.out.println(" Stem: " + stem);
        }
    }
}
Example #3
Source File: LuceneInMemorySentenceRetrievalExecutor.java From bioasq with Apache License 2.0 | 6 votes |
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); // initialize sentence chunker TokenizerFactory tokenizerFactory = UimaContextHelper.createObjectFromConfigParameter(context, "tokenizer-factory", "tokenizer-factory-params", IndoEuropeanTokenizerFactory.class, TokenizerFactory.class); SentenceModel sentenceModel = UimaContextHelper.createObjectFromConfigParameter(context, "sentence-model", "sentence-model-params", IndoEuropeanSentenceModel.class, SentenceModel.class); chunker = new SentenceChunker(tokenizerFactory, sentenceModel); // initialize hits hits = UimaContextHelper.getConfigParameterIntValue(context, "hits", 200); // initialize query analyzer, index writer config, and query parser analyzer = UimaContextHelper.createObjectFromConfigParameter(context, "query-analyzer", "query-analyzer-params", StandardAnalyzer.class, Analyzer.class); parser = new QueryParser("text", analyzer); // initialize query string constructor queryStringConstructor = UimaContextHelper.createObjectFromConfigParameter(context, "query-string-constructor", "query-string-constructor-params", BooleanBagOfPhraseQueryStringConstructor.class, QueryStringConstructor.class); }
Example #4
Source File: ImprovedLuceneInMemorySentenceRetrievalExecutor.java From bioasq with Apache License 2.0 | 6 votes |
@Override public void initialize(UimaContext context) throws ResourceInitializationException { super.initialize(context); TokenizerFactory tokenizerFactory = UimaContextHelper.createObjectFromConfigParameter(context, "tokenizer-factory", "tokenizer-factory-params", IndoEuropeanTokenizerFactory.class, TokenizerFactory.class); SentenceModel sentenceModel = UimaContextHelper.createObjectFromConfigParameter(context, "sentence-model", "sentence-model-params", IndoEuropeanSentenceModel.class, SentenceModel.class); chunker = new SentenceChunker(tokenizerFactory, sentenceModel); // initialize hits hits = UimaContextHelper.getConfigParameterIntValue(context, "hits", 200); // initialize query analyzer, index writer config, and query parser analyzer = UimaContextHelper.createObjectFromConfigParameter(context, "query-analyzer", "query-analyzer-params", StandardAnalyzer.class, Analyzer.class); parser = new QueryParser("text", analyzer); // initialize query string constructor queryStringConstructor = UimaContextHelper.createObjectFromConfigParameter(context, "query-string-constructor", "query-string-constructor-params", BagOfPhraseQueryStringConstructor.class, QueryStringConstructor.class); String parserProviderName = UimaContextHelper .getConfigParameterStringValue(context, "parser-provider"); parserProvider = ProviderCache.getProvider(parserProviderName, ParserProvider.class); lemma = new StanfordLemmatizer(); }
Example #5
Source File: Chapter2.java From Natural-Language-Processing-with-Java-Second-Edition with MIT License | 5 votes |
private static void usingLingPipeTokenizers() { // String paragraph = "sample text string"; char text[] = paragraph.toCharArray(); TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE; com.aliasi.tokenizer.Tokenizer tokenizer = tokenizerFactory.tokenizer( text, 0, text.length); for (String token : tokenizer) { System.out.println(token); } }
Example #6
Source File: TweetHandler.java From Java-for-Data-Science with MIT License | 5 votes |
/**
 * Removes English stop words from this handler's text using LingPipe's
 * {@code EnglishStopTokenizerFactory} layered over the Indo-European
 * tokenizer, rejoins the surviving tokens with single spaces, and stores
 * the result back into {@code this.text}.
 *
 * Fixes: tokens are now joined without the trailing space the original
 * appended after the last word, and the string concatenation inside
 * {@code StringBuilder.append} is avoided.
 *
 * @return this handler, to allow method chaining
 */
public TweetHandler removeStopWords() {
    TokenizerFactory tokenizerFactory = IndoEuropeanTokenizerFactory.INSTANCE;
    tokenizerFactory = new EnglishStopTokenizerFactory(tokenizerFactory);
    Tokenizer tokens = tokenizerFactory.tokenizer(
            this.text.toCharArray(), 0, this.text.length());
    StringBuilder buffer = new StringBuilder();
    for (String word : tokens) {
        if (buffer.length() > 0) {
            buffer.append(' ');   // separator only between words, not after the last
        }
        buffer.append(word);
    }
    this.text = buffer.toString();
    return this;
}
Example #7
Source File: SimpleStringCleaning.java From Java-for-Data-Science with MIT License | 5 votes |
public static void removeStopWithLing(String text){ //******************EXAMPLE WITH ling pipe ******************************************************************************************* //mention lower vs upper case out.println(text); text = text.toLowerCase().trim(); TokenizerFactory fact = IndoEuropeanTokenizerFactory.INSTANCE; fact = new EnglishStopTokenizerFactory(fact); Tokenizer tok = fact.tokenizer(text.toCharArray(), 0, text.length()); for(String word : tok){ out.print(word + " "); } }