org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.miscellaneous.SetKeywordMarkerFilter. You can vote up the examples you like or vote down the ones you don't like, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example #1
Source File: TestMorfologikAnalyzer.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
/** Checks that tokens in the keyword set bypass Morfologik lemmatization. */
public final void testKeywordAttrTokens() throws IOException {
  Analyzer analyzer = new MorfologikAnalyzer() {
    @Override
    protected TokenStreamComponents createComponents(String field) {
      // Case-sensitive, single-entry keyword set protecting "liście".
      final CharArraySet keywords = new CharArraySet(1, false);
      keywords.add("liście");

      final Tokenizer tokenizer = new StandardTokenizer();
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new MorfologikFilter(stream);

      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  // "liście" survives verbatim; "danych" expands into its lemma candidates.
  assertAnalyzesTo(
      analyzer,
      "liście danych",
      new String[] { "liście", "dany", "dana", "dane", "dać" },
      new int[] { 0, 7, 7, 7, 7 },
      new int[] { 6, 13, 13, 13, 13 },
      new int[] { 1, 1, 0, 0, 0 });
  analyzer.close();
}
 
Example #2
Source File: TestRussianLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "энергии" is registered as a keyword, so the stemmer must leave it intact.
  final CharArraySet keywords = new CharArraySet(asSet("энергии"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new RussianLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "энергии", "энергии");
  analyzer.close();
}
 
Example #3
Source File: KeywordMarkerTokenFilterFactory.java    From crate with Apache License 2.0 5 votes vote down vote up
@Override
public TokenStream create(TokenStream tokenStream) {
    // A configured regex pattern takes precedence over the static keyword set.
    return keywordPattern != null
        ? new PatternKeywordMarkerFilter(tokenStream, keywordPattern)
        : new SetKeywordMarkerFilter(tokenStream, keywordLookup);
}
 
Example #4
Source File: TestCzechStemmer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  // "hole" is marked as a keyword; "desek" is still stemmed to "desk".
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("hole");
  final MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("hole desek"));
  TokenStream marked = new SetKeywordMarkerFilter(tokenizer, keywords);
  CzechStemFilter filter = new CzechStemFilter(marked);
  assertTokenStreamContents(filter, new String[] { "hole", "desk" });
}
 
Example #5
Source File: TestPortugueseStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "quilométricas" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("quilométricas"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new PortugueseStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "quilométricas", "quilométricas");
  analyzer.close();
}
 
Example #6
Source File: TestPortugueseMinimalStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "quilométricas" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("quilométricas"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new PortugueseMinimalStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "quilométricas", "quilométricas");
  analyzer.close();
}
 
Example #7
Source File: TestPortugueseLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "quilométricas" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("quilométricas"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new PortugueseLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "quilométricas", "quilométricas");
  analyzer.close();
}
 
Example #8
Source File: TestPorterStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  // "yourselves" is protected from stemming; "yours" still stems to "your".
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("yourselves");
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("yourselves yours"));
  TokenStream marked = new SetKeywordMarkerFilter(tokenizer, keywords);
  TokenStream filter = new PorterStemFilter(marked);
  assertTokenStreamContents(filter, new String[] {"yourselves", "your"});
}
 
Example #9
Source File: TestHunspellStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Verifies that KeywordAttribute suppresses Hunspell stemming. */
public void testKeywordAttribute() throws IOException {
  // Without a keyword marker, "lucene" also yields the extra stem "lucen".
  MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
  tokenizer.setEnableChecks(true);
  HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary);
  assertTokenStreamContents(filter, new String[]{"lucene", "lucen", "is", "awesome"}, new int[] {1, 0, 1, 1});

  // With "Lucene" in a case-insensitive keyword set, no extra stem is emitted.
  tokenizer = whitespaceMockTokenizer("lucene is awesome");
  CharArraySet keywords = new CharArraySet(Arrays.asList("Lucene"), true);
  filter = new HunspellStemFilter(new SetKeywordMarkerFilter(tokenizer, keywords), dictionary);
  assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
 
Example #10
Source File: TestFrenchMinimalStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "chevaux" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("chevaux"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new FrenchMinimalStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "chevaux", "chevaux");
  analyzer.close();
}
 
Example #11
Source File: TestFrenchLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "chevaux" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("chevaux"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new FrenchLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "chevaux", "chevaux");
  analyzer.close();
}
 
Example #12
Source File: TestBrazilianAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  // Case-insensitive keyword set shields "Brasília"; "Brasilia" still stems.
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("Brasília");
  Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Brasília Brasilia"));
  TokenStream lowercased = new LowerCaseFilter(tokenizer);
  BrazilianStemFilter filter = new BrazilianStemFilter(new SetKeywordMarkerFilter(lowercased, keywords));

  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
 
Example #13
Source File: TestGalicianMinimalStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "elefantes" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("elefantes"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new GalicianMinimalStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "elefantes", "elefantes");
  analyzer.close();
}
 
Example #14
Source File: TestBulgarianStemmer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  // "строеве" is protected; "строевете" still stems to "строй".
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("строеве");
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("строевете строеве"));

  TokenStream marked = new SetKeywordMarkerFilter(tokenizer, keywords);
  BulgarianStemFilter filter = new BulgarianStemFilter(marked);
  assertTokenStreamContents(filter, new String[] { "строй", "строеве" });
}
 
Example #15
Source File: TestHungarianLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "babakocsi" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("babakocsi"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new HungarianLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "babakocsi", "babakocsi");
  analyzer.close();
}
 
Example #16
Source File: TestSwedishLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "jaktkarlens" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("jaktkarlens"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new SwedishLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "jaktkarlens", "jaktkarlens");
  analyzer.close();
}
 
Example #17
Source File: TestArabicStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  // The single input token is in the keyword set and must not be stemmed.
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("ساهدهات");
  MockTokenizer tokenizer = whitespaceMockTokenizer("ساهدهات");

  TokenStream marked = new SetKeywordMarkerFilter(tokenizer, keywords);
  ArabicStemFilter filter = new ArabicStemFilter(marked);
  assertTokenStreamContents(filter, new String[]{"ساهدهات"});
}
 
Example #18
Source File: TestJapaneseNumberFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Test
public void testName() throws IOException {
  // Without keyword marking, 京一 is read as a numeral and normalized.
  assertAnalyzesTo(analyzer, "田中京一",
      new String[]{"田中", "10000000000000001"},
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // The same input analyzed with 京一 marked as a keyword.
  Analyzer keywordMarkingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      // Case-sensitive, single-entry keyword set.
      CharArraySet keywords = new CharArraySet(1, false);
      keywords.add("京一");

      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      return new TokenStreamComponents(tokenizer, new JapaneseNumberFilter(stream));
    }
  };

  // Now 京一 passes through without number normalization.
  assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一",
      new String[]{"田中", "京一"},
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
 
Example #19
Source File: TestJapaneseBaseFormFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // あり is marked as a keyword, so the base-form filter leaves it unchanged.
  final CharArraySet keywords = new CharArraySet(asSet("あり"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, true, JapaneseTokenizer.DEFAULT_MODE);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new JapaneseBaseFormFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  assertAnalyzesTo(analyzer, "それはまだ実験段階にあります",
      new String[] { "それ", "は", "まだ", "実験", "段階", "に", "あり", "ます"  }
  );
  analyzer.close();
}
 
Example #20
Source File: TestJapaneseKatakanaStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // コーヒー is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("コーヒー"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new JapaneseKatakanaStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "コーヒー", "コーヒー");
  analyzer.close();
}
 
Example #21
Source File: TestKoreanNumberFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Test
public void testName() throws IOException {
  // Without keyword marking, 경일 is read as a numeral and normalized.
  assertAnalyzesTo(analyzer, "전중경일",
      new String[]{"전중", "10000000000000001"},
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // The same input analyzed with 경일 marked as a keyword.
  Analyzer keywordMarkingAnalyzer = new Analyzer() {

    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      // Case-sensitive, single-entry keyword set.
      CharArraySet keywords = new CharArraySet(1, false);
      keywords.add("경일");
      UserDictionary userDictionary = readDict();
      Set<POS.Tag> stopTags = new HashSet<>();
      stopTags.add(POS.Tag.SP);
      Tokenizer tokenizer = new KoreanTokenizer(newAttributeFactory(), userDictionary,
          KoreanTokenizer.DEFAULT_DECOMPOUND, false, false);
      TokenStream stream = new KoreanPartOfSpeechStopFilter(tokenizer, stopTags);
      stream = new SetKeywordMarkerFilter(stream, keywords);
      return new TokenStreamComponents(tokenizer, new KoreanNumberFilter(stream));
    }
  };

  // Now 경일 passes through without number normalization.
  assertAnalyzesTo(keywordMarkingAnalyzer, "전중경일",
      new String[]{"전중", "경일"},
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
 
Example #22
Source File: TestGermanStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "sängerinnen" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new GermanStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "sängerinnen", "sängerinnen");
  analyzer.close();
}
 
Example #23
Source File: TestGermanMinimalStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "sängerinnen" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new GermanMinimalStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "sängerinnen", "sängerinnen");
  analyzer.close();
}
 
Example #24
Source File: TestGermanAnalyzer.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testWithKeywordAttribute() throws IOException {
  // Case-insensitive set protects "fischen"; "Trinken" still stems to "trink".
  CharArraySet keywords = new CharArraySet(1, true);
  keywords.add("fischen");
  final Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Fischen Trinken"));
  TokenStream lowercased = new LowerCaseFilter(tokenizer);
  GermanStemFilter filter = new GermanStemFilter(new SetKeywordMarkerFilter(lowercased, keywords));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
 
Example #25
Source File: TestGermanLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "sängerinnen" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("sängerinnen"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new GermanLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "sängerinnen", "sängerinnen");
  analyzer.close();
}
 
Example #26
Source File: TestNorwegianMinimalStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "sekretæren" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("sekretæren"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new NorwegianMinimalStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "sekretæren", "sekretæren");
  analyzer.close();
}
 
Example #27
Source File: TestNorwegianLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "sekretæren" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("sekretæren"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new NorwegianLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "sekretæren", "sekretæren");
  analyzer.close();
}
 
Example #28
Source File: TestFinnishLightStemFilter.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public void testKeyword() throws IOException {
  // "edeltäjistään" is in the keyword set and must come out unstemmed.
  final CharArraySet keywords = new CharArraySet(asSet("edeltäjistään"), false);
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new SetKeywordMarkerFilter(tokenizer, keywords);
      stream = new FinnishLightStemFilter(stream);
      return new TokenStreamComponents(tokenizer, stream);
    }
  };
  checkOneTerm(analyzer, "edeltäjistään", "edeltäjistään");
  analyzer.close();
}
 
Example #29
Source File: BrazilianStemTokenFilterFactory.java    From Elasticsearch with Apache License 2.0 4 votes vote down vote up
@Override
public TokenStream create(TokenStream tokenStream) {
    // Tokens in the exclusion set are flagged as keywords so the Brazilian
    // stemmer passes them through unchanged.
    TokenStream marked = new SetKeywordMarkerFilter(tokenStream, exclusions);
    return new BrazilianStemFilter(marked);
}
 
Example #30
Source File: KeywordMarkerTokenFilterFactory.java    From Elasticsearch with Apache License 2.0 4 votes vote down vote up
@Override
public TokenStream create(TokenStream tokenStream) {
    // Wraps the incoming stream so tokens present in the configured keyword
    // set are marked with KeywordAttribute and skipped by downstream stemmers.
    return new SetKeywordMarkerFilter(tokenStream, keywordLookup);
}