org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter Java Examples
The following examples show how to use
org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter.
You can vote up the examples you find useful or vote down the ones you don't,
and you can open the original project or source file by following the link above each example. Related API usage is shown on the sidebar.
Example #1
Source File: TestMorfologikAnalyzer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Verifies that a token flagged by {@link SetKeywordMarkerFilter} is passed through
 * unstemmed by the Morfologik filter, while other tokens are still lemmatized.
 */
public final void testKeywordAttrTokens() throws IOException {
  Analyzer analyzer = new MorfologikAnalyzer() {
    @Override
    protected TokenStreamComponents createComponents(String field) {
      // Mark "liście" as a keyword so the stemmer leaves it untouched.
      final CharArraySet keywords = new CharArraySet(1, false);
      keywords.add("liście");
      final Tokenizer src = new StandardTokenizer();
      TokenStream chain = new SetKeywordMarkerFilter(src, keywords);
      chain = new MorfologikFilter(chain);
      return new TokenStreamComponents(src, chain);
    }
  };
  assertAnalyzesTo(
      analyzer,
      "liście danych",
      new String[] { "liście", "dany", "dana", "dane", "dać" },
      new int[] { 0, 7, 7, 7, 7 },
      new int[] { 6, 13, 13, 13, 13 },
      new int[] { 1, 1, 0, 0, 0 });
  analyzer.close();
}
Example #2
Source File: TestRussianLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** A word in the exclusion set must come through the Russian light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("энергии"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new RussianLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "энергии", "энергии");
  analyzer.close();
}
Example #3
Source File: KeywordMarkerTokenFilterFactory.java From crate with Apache License 2.0 | 5 votes |
/**
 * Wraps the incoming stream with a keyword marker: pattern-based when a keyword
 * pattern was configured, otherwise set-based using the keyword lookup.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  return keywordPattern != null
      ? new PatternKeywordMarkerFilter(tokenStream, keywordPattern)
      : new SetKeywordMarkerFilter(tokenStream, keywordLookup);
}
Example #4
Source File: TestCzechStemmer.java From lucene-solr with Apache License 2.0 | 5 votes |
/** "hole" is keyword-marked and kept as-is; "desek" is stemmed to "desk". */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("hole");
  final MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("hole desek"));
  CzechStemFilter stemmer =
      new CzechStemFilter(new SetKeywordMarkerFilter(tokenizer, keywords));
  assertTokenStreamContents(stemmer, new String[] { "hole", "desk" });
}
Example #5
Source File: TestPortugueseStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Portuguese stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("quilométricas"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new PortugueseStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "quilométricas", "quilométricas");
  analyzer.close();
}
Example #6
Source File: TestPortugueseMinimalStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Portuguese minimal stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("quilométricas"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new PortugueseMinimalStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "quilométricas", "quilométricas");
  analyzer.close();
}
Example #7
Source File: TestPortugueseLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Portuguese light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("quilométricas"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new PortugueseLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "quilométricas", "quilométricas");
  analyzer.close();
}
Example #8
Source File: TestPorterStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** "yourselves" is keyword-marked and kept; "yours" is Porter-stemmed to "your". */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("yourselves");
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("yourselves yours"));
  TokenStream stemmer =
      new PorterStemFilter(new SetKeywordMarkerFilter(tokenizer, keywords));
  assertTokenStreamContents(stemmer, new String[] {"yourselves", "your"});
}
Example #9
Source File: TestHunspellStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Simple test for KeywordAttribute */ public void testKeywordAttribute() throws IOException { MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome"); tokenizer.setEnableChecks(true); HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary); assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1}); // assert with keyword marker tokenizer = whitespaceMockTokenizer("lucene is awesome"); CharArraySet set = new CharArraySet( Arrays.asList("Lucene"), true); filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, set), dictionary); assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1}); }
Example #10
Source File: TestFrenchMinimalStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the French minimal stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("chevaux"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new FrenchMinimalStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "chevaux", "chevaux");
  analyzer.close();
}
Example #11
Source File: TestFrenchLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the French light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("chevaux"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new FrenchLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "chevaux", "chevaux");
  analyzer.close();
}
Example #12
Source File: TestBrazilianAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Lowercased "brasília" is keyword-marked and kept; "Brasilia" is stemmed to "brasil". */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("Brasília");
  Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Brasília Brasilia"));
  // Lowercase first so the (case-insensitive) keyword set matches the token.
  BrazilianStemFilter stemmer = new BrazilianStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), keywords));
  assertTokenStreamContents(stemmer, new String[] { "brasília", "brasil" });
}
Example #13
Source File: TestGalicianMinimalStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Galician minimal stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("elefantes"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new GalicianMinimalStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "elefantes", "elefantes");
  analyzer.close();
}
Example #14
Source File: TestBulgarianStemmer.java From lucene-solr with Apache License 2.0 | 5 votes |
/** "строеве" is keyword-marked and kept; "строевете" is stemmed to "строй". */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("строеве");
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("строевете строеве"));
  BulgarianStemFilter stemmer =
      new BulgarianStemFilter(new SetKeywordMarkerFilter(tokenizer, keywords));
  assertTokenStreamContents(stemmer, new String[] { "строй", "строеве" });
}
Example #15
Source File: TestHungarianLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Hungarian light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("babakocsi"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new HungarianLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "babakocsi", "babakocsi");
  analyzer.close();
}
Example #16
Source File: TestSwedishLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Swedish light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("jaktkarlens"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new SwedishLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "jaktkarlens", "jaktkarlens");
  analyzer.close();
}
Example #17
Source File: TestArabicStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** A keyword-marked Arabic term must pass through the Arabic stemmer unchanged. */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("ساهدهات");
  MockTokenizer tokenizer = whitespaceMockTokenizer("ساهدهات");
  ArabicStemFilter stemmer =
      new ArabicStemFilter(new SetKeywordMarkerFilter(tokenizer, keywords));
  assertTokenStreamContents(stemmer, new String[]{"ساهدهات"});
}
Example #18
Source File: TestJapaneseNumberFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Test public void testName() throws IOException { // Test name that normalises to number assertAnalyzesTo(analyzer, "田中京一", new String[]{"田中", "10000000000000001"}, // 京一 is normalized to a number new int[]{0, 2}, new int[]{2, 4}, new int[]{1, 1} ); // An analyzer that marks 京一 as a keyword Analyzer keywordMarkingAnalyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { CharArraySet set = new CharArraySet(1, false); set.add("京一"); Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH); return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set))); } }; assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一", new String[]{"田中", "京一"}, // 京一 is not normalized new int[]{0, 2}, new int[]{2, 4}, new int[]{1, 1} ); keywordMarkingAnalyzer.close(); }
Example #19
Source File: TestJapaneseBaseFormFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** "あり" is excluded, so the base-form filter must not normalize it to its lemma. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("あり"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new JapaneseTokenizer(
          newAttributeFactory(), null, true, JapaneseTokenizer.DEFAULT_MODE);
      // Flag excluded words as keywords before base-form normalization.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new JapaneseBaseFormFilter(marked));
    }
  };
  assertAnalyzesTo(analyzer, "それはまだ実験段階にあります",
      new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます" }
  );
  analyzer.close();
}
Example #20
Source File: TestJapaneseKatakanaStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded katakana word must survive the katakana stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("コーヒー"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new JapaneseKatakanaStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "コーヒー", "コーヒー");
  analyzer.close();
}
Example #21
Source File: TestKoreanNumberFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Test public void testName() throws IOException { // Test name that normalises to number assertAnalyzesTo(analyzer, "전중경일", new String[]{"전중", "10000000000000001"}, // 경일 is normalized to a number new int[]{0, 2}, new int[]{2, 4}, new int[]{1, 1} ); // An analyzer that marks 경일 as a keyword Analyzer keywordMarkingAnalyzer = new Analyzer() { @Override protected TokenStreamComponents createComponents(String fieldName) { CharArraySet set = new CharArraySet(1, false); set.add("경일"); UserDictionary userDictionary = readDict(); Set<POS.Tag> stopTags = new HashSet<>(); stopTags.add(POS.Tag.SP); Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary, KoreanTokenizer.DEFAULT_DECOMPOUND, false, false); TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags); return new TokenStreamComponents(tokenizer, new KoreanNumberFilter(new SetKeywordMarkerFilter(stream, set))); } }; assertAnalyzesTo(keywordMarkingAnalyzer, "전중경일", new String[]{"전중", "경일"}, // 경일 is not normalized new int[]{0, 2}, new int[]{2, 4}, new int[]{1, 1} ); keywordMarkingAnalyzer.close(); }
Example #22
Source File: TestGermanStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the German stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new GermanStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "sängerinnen", "sängerinnen");
  analyzer.close();
}
Example #23
Source File: TestGermanMinimalStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the German minimal stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new GermanMinimalStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "sängerinnen", "sängerinnen");
  analyzer.close();
}
Example #24
Source File: TestGermanAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Lowercased "fischen" is keyword-marked and kept; "Trinken" is stemmed to "trink". */
public void testWithKeywordAttribute() throws IOException {
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("fischen");
  final Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Fischen Trinken"));
  // Lowercase first so the keyword set matches the token form seen by the stemmer.
  GermanStemFilter stemmer = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), keywords));
  assertTokenStreamContents(stemmer, new String[] { "fischen", "trink" });
}
Example #25
Source File: TestGermanLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the German light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new GermanLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "sängerinnen", "sängerinnen");
  analyzer.close();
}
Example #26
Source File: TestNorwegianMinimalStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Norwegian minimal stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("sekretæren"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new NorwegianMinimalStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "sekretæren", "sekretæren");
  analyzer.close();
}
Example #27
Source File: TestNorwegianLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Norwegian light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("sekretæren"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new NorwegianLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "sekretæren", "sekretæren");
  analyzer.close();
}
Example #28
Source File: TestFinnishLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** An excluded word must survive the Finnish light stemmer unchanged. */
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("edeltäjistään"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      // Flag excluded words as keywords before stemming.
      TokenStream marked = new SetKeywordMarkerFilter(tokenizer, exclusionSet);
      return new TokenStreamComponents(tokenizer, new FinnishLightStemFilter(marked));
    }
  };
  checkOneTerm(analyzer, "edeltäjistään", "edeltäjistään");
  analyzer.close();
}
Example #29
Source File: BrazilianStemTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 4 votes |
/**
 * Chains the Brazilian stemmer behind a keyword marker built from the
 * configured exclusion set, so excluded terms are not stemmed.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  TokenStream marked = new SetKeywordMarkerFilter(tokenStream, exclusions);
  return new BrazilianStemFilter(marked);
}
Example #30
Source File: KeywordMarkerTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 4 votes |
/**
 * Marks tokens contained in the keyword lookup so that downstream
 * stemming filters leave them unmodified.
 */
@Override
public TokenStream create(TokenStream tokenStream) {
  return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
}