org.apache.lucene.analysis.core.WhitespaceTokenizer Java Examples
The following examples show how to use org.apache.lucene.analysis.core.WhitespaceTokenizer. They are drawn from open-source projects; the source file, project, and license are noted above each example.
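Most of the examples below follow the same TokenStream lifecycle: construct the tokenizer, call setReader(), then reset(), loop on incrementToken() while reading attributes, and finish with end() and close(). Here is a minimal self-contained sketch of that pattern (the class name and sample text are ours, not taken from any of the projects below):

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class WhitespaceTokenizerDemo {
  public static void main(String[] args) throws IOException {
    // TokenStream contract: setReader -> reset -> incrementToken loop -> end -> close.
    try (WhitespaceTokenizer tokenizer = new WhitespaceTokenizer()) {
      tokenizer.setReader(new StringReader("split on whitespace only"));
      CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
      tokenizer.reset();
      while (tokenizer.incrementToken()) {
        System.out.println(term.toString());  // split / on / whitespace / only
      }
      tokenizer.end();
    }
  }
}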
Example #1
Source File: XmlInterpolationTest.java From lucene-solr with Apache License 2.0
private String[] analyzeReturnTokens(String docText) {
  List<String> result = new ArrayList<>();
  Reader filter = new HTMLStripCharFilter(new StringReader(docText),
      Collections.singleton("unescaped"));
  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      result.add(termAttribute.toString());
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result.toArray(new String[result.size()]);
}
Example #2
Source File: ConcatenateFilterTest.java From SolrTextTagger with Apache License 2.0
public void testTypical() throws IOException {
  String NYC = "new york city";
  WhitespaceTokenizer stream = new WhitespaceTokenizer();
  stream.setReader(new StringReader(NYC));
  ConcatenateFilter filter = new ConcatenateFilter(stream);
  try {
    assertTokenStreamContents(filter, new String[]{NYC},
        new int[]{0}, new int[]{NYC.length()}, new String[]{"shingle"},
        new int[]{1}, null, NYC.length(), true);
  } catch (AssertionError e) {
    // assertTokenStreamContents tries to test if tokenStream.end() was implemented
    // correctly. Its manner of checking this is imperfect and incompatible with
    // ConcatenateFilter. Specifically it modifies a special attribute *after*
    // incrementToken(), which is weird. To the best of my ability, end() appears
    // to be implemented correctly.
    if (!e.getMessage().equals("super.end()/clearAttributes() was not called correctly in end()"))
      throw e;
  }
}
Example #3
Source File: DatasetAnalyzer.java From gerbil with GNU Affero General Public License v3.0
private int countTokensInText(String text) {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader(text));
  int tokens = 0;
  try {
    tokenizer.reset();
    while (tokenizer.incrementToken()) {
      ++tokens;
    }
  } catch (Exception e) {
    LOGGER.error("Error while tokenizing text. Returning.", e);
  } finally {
    IOUtils.closeQuietly(tokenizer);
  }
  return tokens;
}
Example #4
Source File: XmlInterpolationTest.java From SolrTextTagger with Apache License 2.0
private String[] analyzeReturnTokens(String docText) {
  List<String> result = new ArrayList<>();
  Reader filter = new HTMLStripCharFilter(new StringReader(docText),
      Collections.singleton("unescaped"));
  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      result.add(termAttribute.toString());
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result.toArray(new String[result.size()]);
}
Example #5
Source File: CASAnalyzer.java From oodt with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  // The legacy StandardFilter/StopFilter chain was commented out upstream;
  // the analyzer now tokenizes on whitespace only.
  return new TokenStreamComponents(new WhitespaceTokenizer());
}
Example #6
Source File: AutoPhrasingTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testIncompletePhrase() throws Exception {
  final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
      "big apple", "new york city", "property tax", "three word phrase"), false);
  final String input = "some new york";
  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer();
  in.setReader(reader);
  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
  aptf.setReplaceWhitespaceWith('_');
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();
  assertTrue(aptf.incrementToken());
  assertEquals("some", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("new", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("york", term.toString());
}
Example #7
Source File: AutoPhrasingTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOverlappingAtEnd() throws Exception {
  final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
      "new york", "new york city", "city of new york"), false);
  final String input = "the great city of new york";
  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer();
  in.setReader(reader);
  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
  aptf.setReplaceWhitespaceWith('_');
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();
  assertTrue(aptf.incrementToken());
  assertEquals("the", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("great", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("city_of_new_york", term.toString());
}
Example #8
Source File: ShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  Tokenizer wsTokenizer = new WhitespaceTokenizer();
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  TokenStream filter = new ShingleFilter(wsTokenizer, 2);
  assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19},
      new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
  );
  wsTokenizer.setReader(new StringReader("please divide this sentence"));
  assertTokenStreamContents(filter,
      new String[]{"please","please divide","divide","divide this","this","this sentence","sentence"},
      new int[]{0,0,7,7,14,14,19},
      new int[]{6,13,13,18,18,27,27},
      new String[]{TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE,"shingle",TypeAttribute.DEFAULT_TYPE},
      new int[]{1,0,1,0,1,0,1}
  );
}
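Note the reuse pattern in this test: assertTokenStreamContents fully consumes and closes the stream, after which setReader() may legally be called with fresh input and the chain replayed from reset(). A small sketch of that reuse contract on a bare tokenizer (our own illustration, with made-up inputs):

// Reusing one Tokenizer instance for several inputs. end()/close() finish
// the current pass; setReader() supplies the next input, and reset() must
// be called again before incrementToken().
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
for (String input : new String[]{"first input", "second input"}) {
  tokenizer.setReader(new StringReader(input));
  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    System.out.println(term);
  }
  tokenizer.end();
  tokenizer.close();  // required before setReader() may be called again
}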
Example #9
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
public void testQueryReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer();
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  CommonGramsQueryFilter nsf = new CommonGramsQueryFilter(cgf);

  CharTermAttribute term = wt.addAttribute(CharTermAttribute.class);
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(nsf.incrementToken());
  assertEquals("the_s", term.toString());
  nsf.close();

  wt.setReader(new StringReader(input));
  nsf.reset();
  assertTrue(nsf.incrementToken());
  assertEquals("How_the", term.toString());
}
Example #10
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  final String input = "How the s a brown s cow d like A B thing?";
  WhitespaceTokenizer wt = new WhitespaceTokenizer();
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);

  CharTermAttribute term = cgf.addAttribute(CharTermAttribute.class);
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("How_the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the", term.toString());
  assertTrue(cgf.incrementToken());
  assertEquals("the_s", term.toString());
  cgf.close();

  wt.setReader(new StringReader(input));
  cgf.reset();
  assertTrue(cgf.incrementToken());
  assertEquals("How", term.toString());
}
Example #11
Source File: AutoPhrasingTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOverlappingAtBeginning() throws Exception {
  final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
      "new york", "new york city", "city of new york"), false);
  final String input = "new york city is great";
  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer();
  in.setReader(reader);
  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
  aptf.setReplaceWhitespaceWith('_');
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();
  assertTrue(aptf.incrementToken());
  assertEquals("new_york_city", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("is", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("great", term.toString());
}
Example #12
Source File: CommonAnalysisPlugin.java From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
  List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
  tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
      () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE,
          EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
  tokenizers.add(PreConfiguredTokenizer.singleton("pattern",
      () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
  tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
  tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new,
      () -> new TokenFilterFactory() {
        @Override
        public String name() {
          return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
          return new LowerCaseFilter(tokenStream);
        }
      }));

  // Temporary shim for aliases. TODO deprecate after they are moved
  tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

  return tokenizers;
}
Example #13
Source File: AutoPhrasingTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOverlappingAtEndEmitSingle() throws Exception {
  final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
      "new york", "new york city", "city of new york"), false);
  final String input = "the great city of new york";
  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer();
  in.setReader(reader);
  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, true);
  aptf.setReplaceWhitespaceWith('_');
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();
  assertTrue(aptf.incrementToken());
  assertEquals("the", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("great", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("city", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("of", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("new", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("york", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("city_of_new_york", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("new_york", term.toString());
}
Example #14
Source File: EntityAnalyzer.java From SciGraph with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new WhitespaceTokenizer();
  TokenStream result = new PatternReplaceFilter(tokenizer,
      Pattern.compile("^([\\.!\\?,:;\"'\\(\\)]*)(.*?)([\\.!\\?,:;\"'\\(\\)]*)$"),
      "$2", true);
  result = new PatternReplaceFilter(result, Pattern.compile("'s"), "s", true);
  return new TokenStreamComponents(tokenizer, result);
}
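Callers typically do not drive such an Analyzer's components by hand; Analyzer.tokenStream() wires and reuses them. A hedged usage sketch for the analyzer above (the field name and sample text are our assumptions):

// Usage sketch (ours): running an Analyzer end-to-end.
Analyzer analyzer = new EntityAnalyzer();
try (TokenStream ts = analyzer.tokenStream("entity", "(Parkinson's disease)")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    System.out.println(term);  // surrounding punctuation stripped, "'s" -> "s"
  }
  ts.end();
}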
Example #15
Source File: AnnotationAnalyzer.java From elasticsearch-analysis-annotation with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer source = new WhitespaceTokenizer(version, reader);
  TokenStream filter = new LowerCaseFilter(version, source);
  filter = new InlineAnnotationFilter(filter);
  return new TokenStreamComponents(source, filter);
}
Example #16
Source File: XmlInterpolationTest.java From SolrTextTagger with Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] result = {-1, -1};
  Reader filter = new HTMLStripCharFilter(new StringReader(docText));
  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      final String termString = termAttribute.toString();
      if (termString.equals(start))
        result[0] = offsetAttribute.startOffset();
      if (termString.equals(end)) {
        result[1] = offsetAttribute.endOffset();
        return result;
      }
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result;
}
Example #17
Source File: AutoPhrasingTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOverlappingAtBeginningEmitSingle() throws Exception {
  final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
      "new york", "new york city", "city of new york"), false);
  final String input = "new york city is great";
  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer();
  in.setReader(reader);
  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, true);
  aptf.setReplaceWhitespaceWith('_');
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();
  assertTrue(aptf.incrementToken());
  assertEquals("new", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("york", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("new_york", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("new_york_city", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("city", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("is", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("great", term.toString());
}
Example #18
Source File: ContentAnalyzer.java From modernmt with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
  Tokenizer tokenizer = new WhitespaceTokenizer(reader);
  TokenStream filter;
  filter = new PunctuationFilter(tokenizer);
  if (shingleSize > 0) {
    ShingleFilter shingleFilter = new ShingleFilter(filter, shingleSize, shingleSize);
    shingleFilter.setOutputUnigrams(outputUnigrams);
    filter = shingleFilter;
  }
  return new TokenStreamComponents(tokenizer, filter);
}
Example #19
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
  TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilter");
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("ABADIAS"));
  String[] expected = new String[] { "abYdias", "abYdios", "abadia", "abadiaS",
      "abadias", "abadio", "abadioS", "abadios", "abodia", "abodiaS", "abodias",
      "abodio", "abodioS", "abodios", "avadias", "avadios", "avodias", "avodios",
      "obadia", "obadiaS", "obadias", "obadio", "obadioS", "obadios", "obodia",
      "obodiaS", "obodias", "obodioS" };
  BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
Example #20
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
  TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("Rimbault"));
  String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt",
      "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt", "rmba",
      "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
  BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
Example #21
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
  TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("chauptman"));
  String[] expected = new String[] { "473660", "573660" };
  assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
  BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
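If you only need Lucene and not the Elasticsearch factory layer, the same encoding can be produced by wrapping the tokenizer in Lucene's DaitchMokotoffSoundexFilter directly. A sketch under that assumption (inject=false drops the original token and emits only the encodings):

// Sketch (ours): Daitch-Mokotoff soundex directly on a WhitespaceTokenizer.
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
tokenizer.setReader(new StringReader("chauptman"));
TokenStream ts = new DaitchMokotoffSoundexFilter(tokenizer, false);
CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
ts.reset();
while (ts.incrementToken()) {
  System.out.println(term);  // e.g. 473660 and 573660, per the test above
}
ts.end();
ts.close();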
Example #22
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer concatenatingAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      TokenFilter concatenatingFilter = new ConcatenatingFilter(defaultTokenFilter(source), ' ');
      return new TokenStreamComponents(source, concatenatingFilter);
    }
  };
}
Example #23
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer wordDelimiterAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      return new TokenStreamComponents(source, defaultTokenFilter(source));
    }
  };
}
Example #24
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull SpellChecker spellChecker) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      SpellCheckerTokenFilter spellCheckFilter = new SpellCheckerTokenFilter(defaultTokenFilter(source), spellChecker);
      TokenFilter concatenatingFilter = new ConcatenatingFilter(spellCheckFilter, ' ');
      return new TokenStreamComponents(source, concatenatingFilter);
    }
  };
}
Example #25
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer createShingleAnalyzer(int maxShingles) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      ShingleFilter shingleFilter = new ShingleFilter(defaultTokenFilter(source), maxShingles);
      shingleFilter.setOutputUnigrams(true);
      return new TokenStreamComponents(source, shingleFilter);
    }
  };
}
Example #26
Source File: XmlInterpolationTest.java From lucene-solr with Apache License 2.0
private int[] analyzeTagOne(String docText, String start, String end) {
  int[] result = {-1, -1};
  Reader filter = new HTMLStripCharFilter(new StringReader(docText));
  WhitespaceTokenizer ts = new WhitespaceTokenizer();
  final CharTermAttribute termAttribute = ts.addAttribute(CharTermAttribute.class);
  final OffsetAttribute offsetAttribute = ts.addAttribute(OffsetAttribute.class);
  try {
    ts.setReader(filter);
    ts.reset();
    while (ts.incrementToken()) {
      final String termString = termAttribute.toString();
      if (termString.equals(start))
        result[0] = offsetAttribute.startOffset();
      if (termString.equals(end)) {
        result[1] = offsetAttribute.endOffset();
        return result;
      }
    }
    ts.end();
  } catch (IOException e) {
    throw new RuntimeException(e);
  } finally {
    IOUtils.closeQuietly(ts);
  }
  return result;
}
Example #27
Source File: NGramTokenFilterTest.java From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  NGramTokenFilter filter = new NGramTokenFilter(tokenizer, 1, 1, false);
  assertTokenStreamContents(filter,
      new String[]{"a","b","c","d","e"},
      new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5},
      new int[]{1,0,0,0,0});
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter,
      new String[]{"a","b","c","d","e"},
      new int[]{0,0,0,0,0}, new int[]{5,5,5,5,5},
      new int[]{1,0,0,0,0});
}
Example #28
Source File: EdgeNGramTokenFilterTest.java From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("abcde"));
  EdgeNGramTokenFilter filter = new EdgeNGramTokenFilter(tokenizer, 1, 3, false);
  assertTokenStreamContents(filter,
      new String[]{"a","ab","abc"},
      new int[]{0,0,0}, new int[]{5,5,5});
  tokenizer.setReader(new StringReader("abcde"));
  assertTokenStreamContents(filter,
      new String[]{"a","ab","abc"},
      new int[]{0,0,0}, new int[]{5,5,5});
}
Example #29
Source File: WhitespaceTokenizerFactory.java From crate with Apache License 2.0
@Override
public Tokenizer create() {
  return new WhitespaceTokenizer(TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, maxTokenLength);
}
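This is the only example using the two-argument constructor, which takes an AttributeFactory plus a maximum token length; runs of non-whitespace characters longer than the limit are split into chunks of at most that length. A short sketch with an illustrative limit of 4 (the limit value and inputs are our choices):

// Sketch (ours): WhitespaceTokenizer with a custom maximum token length.
WhitespaceTokenizer tokenizer = new WhitespaceTokenizer(
    TokenStream.DEFAULT_TOKEN_ATTRIBUTE_FACTORY, 4);
tokenizer.setReader(new StringReader("abcdefgh ij"));
CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
tokenizer.reset();
while (tokenizer.incrementToken()) {
  System.out.println(term);  // abcd / efgh / ij
}
tokenizer.end();
tokenizer.close();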
Example #30
Source File: Zemberek2StemFilterFactory.java From lucene-solr-analysis-turkish with Apache License 2.0
public static void main(String[] args) throws IOException {
  StringReader reader = new StringReader("elması utansın ortaklar çekişme ile");
  Map<String, String> map = new HashMap<>();
  map.put("strategy", "frequency");
  Zemberek2StemFilterFactory factory = new Zemberek2StemFilterFactory(map);

  WhitespaceTokenizer whitespaceTokenizer = new WhitespaceTokenizer();
  whitespaceTokenizer.setReader(reader);

  TokenStream stream = factory.create(whitespaceTokenizer);
  CharTermAttribute termAttribute = stream.getAttribute(CharTermAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    String term = termAttribute.toString();
    System.out.println(term);
  }
  stream.end();
  stream.close();  // release the stream and its underlying tokenizer
  reader.close();
}