org.apache.lucene.analysis.CharArraySet Java Examples
The following examples show how to use org.apache.lucene.analysis.CharArraySet.
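Before the project examples, here is a minimal standalone sketch of the core CharArraySet operations the examples below rely on: building a set from a Collection, case-insensitive lookups, char[]-slice lookups that avoid allocating a String, and freezing a set for sharing. The class name CharArraySetBasics and the sample words are invented for illustration; the constructor and methods used (CharArraySet(Collection, boolean), contains, unmodifiableSet, size) are taken from the public Lucene API as documented in recent releases.

import java.util.Arrays;

import org.apache.lucene.analysis.CharArraySet;

public class CharArraySetBasics {
  public static void main(String[] args) {
    // The second constructor argument is ignoreCase: true makes lookups case-insensitive.
    CharArraySet stopWords = new CharArraySet(Arrays.asList("the", "a", "an"), true);

    System.out.println(stopWords.contains("THE"));   // true, because ignoreCase = true

    // Lookups can also work directly on a char[] slice, without building a String.
    char[] buffer = "an apple".toCharArray();
    System.out.println(stopWords.contains(buffer, 0, 2)); // true ("an")

    // A read-only view is useful when the same stop set is shared across analyzers.
    CharArraySet frozen = CharArraySet.unmodifiableSet(stopWords);
    System.out.println(frozen.size());               // 3
  }
}

Many of the examples below use exactly these pieces: a dictionary or stop set is built once, passed to a filter or analyzer, and queried with contains() during token processing.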
Example #1
Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testHyphenationCompoundWordsDELongestMatch() throws Exception {
  CharArraySet dict = makeDictionary("basketball", "basket", "ball", "kurv");

  InputSource is = new InputSource(getClass().getResource("da_UTF8.xml").toExternalForm());
  HyphenationTree hyphenator = HyphenationCompoundWordTokenFilter
      .getHyphenationTree(is);

  // the word basket will not be added due to the longest match option
  HyphenationCompoundWordTokenFilter tf = new HyphenationCompoundWordTokenFilter(
      whitespaceMockTokenizer("basketballkurv"), hyphenator, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE, 40, true);

  assertTokenStreamContents(tf,
      new String[] { "basketballkurv", "basketball", "ball", "kurv" },
      new int[] { 1, 0, 0, 0 });
}
Example #2
Source File: Stemmer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Find the unique stem(s) of the provided word
 *
 * @param word Word to find the stems for
 * @return List of stems for the word
 */
public List<CharsRef> uniqueStems(char word[], int length) {
  List<CharsRef> stems = stem(word, length);
  if (stems.size() < 2) {
    return stems;
  }
  CharArraySet terms = new CharArraySet(8, dictionary.ignoreCase);
  List<CharsRef> deduped = new ArrayList<>();
  for (CharsRef s : stems) {
    if (!terms.contains(s)) {
      deduped.add(s);
      terms.add(s);
    }
  }
  return deduped;
}
Example #3
Source File: TestStopAnalyzer.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testStopList() throws IOException {
  CharArraySet stopWordsSet = new CharArraySet(asSet("good", "test", "analyzer"), false);
  StopAnalyzer newStop = new StopAnalyzer(stopWordsSet);
  try (TokenStream stream = newStop.tokenStream("test", "This is a good test of the english stop analyzer")) {
    assertNotNull(stream);
    CharTermAttribute termAtt = stream.getAttribute(CharTermAttribute.class);

    stream.reset();
    while (stream.incrementToken()) {
      String text = termAtt.toString();
      assertFalse(stopWordsSet.contains(text));
    }
    stream.end();
  }
  newStop.close();
}
Example #4
Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testEndIsStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
      new String[] { "go" },
      new int[] { 0 },
      new int[] { 2 },
      null,
      new int[] { 1 },
      null,
      6,
      new boolean[] { false },
      true);
}
Example #5
Source File: IEX2LevAMAZON.java From Clusion with GNU General Public License v3.0 | 6 votes |
public void map(Text key, Text value, Context context) throws IOException, InterruptedException {
  String line = value.toString();
  CharArraySet noise = EnglishAnalyzer.getDefaultStopSet();

  // We are using a standard tokenizer that eliminates the stop words.
  // We could also use a stemming tokenizer such as Porter.
  // A set of English noise keywords is used to eliminate words such as "the, a, etc."
  Analyzer analyzer = new StandardAnalyzer(noise);
  List<String> token = Tokenizer.tokenizeString(analyzer, line);
  Iterator<String> it = token.iterator();
  while (it.hasNext()) {
    word.set(it.next());
    fileName.set(key);
    if (!mapTable.containsKey(fileName.toString() + word.toString())) {
      context.write(fileName, word);
      mapTable.put(fileName.toString() + word.toString(), new IntWritable(1));
    }
  }
}
Example #6
Source File: TestSuggestStopFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testMidStopWord() throws Exception {
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to school"));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
      new String[] { "go", "school" },
      new int[] { 0, 6 },
      new int[] { 2, 12 },
      null,
      new int[] { 1, 2 },
      null,
      12,
      new boolean[] { false, false },
      true);
}
Example #7
Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
Example #8
Source File: CompoundWordTokenFilterBase.java From lucene-solr with Apache License 2.0 | 6 votes |
protected CompoundWordTokenFilterBase(TokenStream input, CharArraySet dictionary,
    int minWordSize, int minSubwordSize, int maxSubwordSize, boolean onlyLongestMatch) {
  super(input);
  this.tokens = new LinkedList<>();
  if (minWordSize < 0) {
    throw new IllegalArgumentException("minWordSize cannot be negative");
  }
  this.minWordSize = minWordSize;
  if (minSubwordSize < 0) {
    throw new IllegalArgumentException("minSubwordSize cannot be negative");
  }
  this.minSubwordSize = minSubwordSize;
  if (maxSubwordSize < 0) {
    throw new IllegalArgumentException("maxSubwordSize cannot be negative");
  }
  this.maxSubwordSize = maxSubwordSize;
  this.onlyLongestMatch = onlyLongestMatch;
  this.dictionary = dictionary;
}
Example #9
Source File: AutoPhrasingTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
public void testIncompletePhrase() throws Exception {
  final CharArraySet phraseSets = new CharArraySet(Arrays.asList(
      "big apple", "new york city", "property tax", "three word phrase"), false);
  final String input = "some new york";

  StringReader reader = new StringReader(input);
  final WhitespaceTokenizer in = new WhitespaceTokenizer();
  in.setReader(reader);

  AutoPhrasingTokenFilter aptf = new AutoPhrasingTokenFilter(in, phraseSets, false);
  aptf.setReplaceWhitespaceWith('_');
  CharTermAttribute term = aptf.addAttribute(CharTermAttribute.class);
  aptf.reset();

  assertTrue(aptf.incrementToken());
  assertEquals("some", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("new", term.toString());
  assertTrue(aptf.incrementToken());
  assertEquals("york", term.toString());
}
Example #10
Source File: TestKeepWordFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
/** blast some random strings through the analyzer */
public void testRandomStrings() throws Exception {
  final Set<String> words = new HashSet<>();
  words.add("a");
  words.add("b");

  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream stream = new KeepWordFilter(tokenizer, new CharArraySet(words, true));
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  checkRandomData(random(), a, 200 * RANDOM_MULTIPLIER);
  a.close();
}
Example #11
Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testWordComponentWithLessThanMinimumLength() throws Exception {
  CharArraySet dict = makeDictionary("abc", "d", "efg");

  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader("abcdefg"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      tokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  // since "d" is shorter than the minimum subword size, it should not be added to the token stream
  assertTokenStreamContents(tf,
      new String[] { "abcdefg", "abc", "efg" },
      new int[] { 0, 0, 0 },
      new int[] { 7, 7, 7 },
      new int[] { 1, 0, 0 });
}
Example #12
Source File: ArabicAnalyzerProvider.java From crate with Apache License 2.0 | 5 votes |
ArabicAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
  super(indexSettings, name, settings);
  arabicAnalyzer = new ArabicAnalyzer(
      Analysis.parseStopWords(env, settings, ArabicAnalyzer.getDefaultStopSet()),
      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
  );
  arabicAnalyzer.setVersion(version);
}
Example #13
Source File: TestSuggestStopFilterFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testInform() throws Exception {
  ResourceLoader loader = new ClasspathResourceLoader(getClass());
  assertTrue("loader is null and it shouldn't be", loader != null);
  SuggestStopFilterFactory factory = createFactory("words", "stop-1.txt", "ignoreCase", "true");
  CharArraySet words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 2, words.size() == 2);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = createFactory("words", "stop-1.txt, stop-2.txt", "ignoreCase", "true");
  words = factory.getStopWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue("words Size: " + words.size() + " is not: " + 4, words.size() == 4);
  assertTrue(factory.isIgnoreCase() + " does not equal: " + true, factory.isIgnoreCase() == true);

  factory = createFactory("words", "stop-snowball.txt", "format", "snowball", "ignoreCase", "true");
  words = factory.getStopWords();
  assertEquals(8, words.size());
  assertTrue(words.contains("he"));
  assertTrue(words.contains("him"));
  assertTrue(words.contains("his"));
  assertTrue(words.contains("himself"));
  assertTrue(words.contains("she"));
  assertTrue(words.contains("her"));
  assertTrue(words.contains("hers"));
  assertTrue(words.contains("herself"));

  // defaults
  factory = createFactory();
  assertEquals(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET, factory.getStopWords());
  assertEquals(false, factory.isIgnoreCase());
}
Example #14
Source File: TestJapaneseNumberFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Test
public void testName() throws IOException {
  // Test name that normalises to number
  assertAnalyzesTo(analyzer, "田中京一",
      new String[]{"田中", "10000000000000001"}, // 京一 is normalized to a number
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );

  // An analyzer that marks 京一 as a keyword
  Analyzer keywordMarkingAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      CharArraySet set = new CharArraySet(1, false);
      set.add("京一");
      Tokenizer tokenizer = new JapaneseTokenizer(newAttributeFactory(), null, false, JapaneseTokenizer.Mode.SEARCH);
      return new TokenStreamComponents(tokenizer,
          new JapaneseNumberFilter(new SetKeywordMarkerFilter(tokenizer, set)));
    }
  };

  assertAnalyzesTo(keywordMarkingAnalyzer, "田中京一",
      new String[]{"田中", "京一"}, // 京一 is not normalized
      new int[]{0, 2},
      new int[]{2, 4},
      new int[]{1, 1}
  );
  keywordMarkingAnalyzer.close();
}
Example #15
Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testDumbCompoundWordsSE() throws Exception {
  CharArraySet dict = makeDictionary("Bil", "Dörr", "Motor", "Tak", "Borr", "Slag",
      "Hammar", "Pelar", "Glas", "Ögon", "Fodral", "Bas", "Fiol", "Makare", "Gesäll",
      "Sko", "Vind", "Rute", "Torkare", "Blad");

  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      whitespaceMockTokenizer(
          "Bildörr Bilmotor Biltak Slagborr Hammarborr Pelarborr Glasögonfodral Basfiolsfodral Basfiolsfodralmakaregesäll Skomakare Vindrutetorkare Vindrutetorkarblad abba"),
      dict);

  assertTokenStreamContents(tf,
      new String[] { "Bildörr", "Bil", "dörr", "Bilmotor", "Bil", "motor", "Biltak", "Bil", "tak",
          "Slagborr", "Slag", "borr", "Hammarborr", "Hammar", "borr", "Pelarborr", "Pelar", "borr",
          "Glasögonfodral", "Glas", "ögon", "fodral", "Basfiolsfodral", "Bas", "fiol", "fodral",
          "Basfiolsfodralmakaregesäll", "Bas", "fiol", "fodral", "makare", "gesäll",
          "Skomakare", "Sko", "makare", "Vindrutetorkare", "Vind", "rute", "torkare",
          "Vindrutetorkarblad", "Vind", "rute", "blad", "abba" },
      new int[] { 0, 0, 0, 8, 8, 8, 17, 17, 17, 24, 24, 24, 33, 33, 33, 44, 44, 44,
          54, 54, 54, 54, 69, 69, 69, 69, 84, 84, 84, 84, 84, 84, 111, 111, 111,
          121, 121, 121, 121, 137, 137, 137, 137, 156 },
      new int[] { 7, 7, 7, 16, 16, 16, 23, 23, 23, 32, 32, 32, 43, 43, 43, 53, 53, 53,
          68, 68, 68, 68, 83, 83, 83, 83, 110, 110, 110, 110, 110, 110, 120, 120, 120,
          136, 136, 136, 136, 155, 155, 155, 155, 160 },
      new int[] { 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0, 1, 0, 0,
          1, 0, 0, 0, 1, 0, 0, 0, 1, 0, 0, 0, 0, 0, 1, 0, 0,
          1, 0, 0, 0, 1, 0, 0, 0, 1 });
}
Example #16
Source File: BengaliAnalyzerProvider.java From crate with Apache License 2.0 | 5 votes |
BengaliAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
  super(indexSettings, name, settings);
  analyzer = new BengaliAnalyzer(
      Analysis.parseStopWords(env, settings, BengaliAnalyzer.getDefaultStopSet()),
      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
  );
  analyzer.setVersion(version);
}
Example #17
Source File: TestArabicAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testWithStemExclusionSet() throws IOException {
  CharArraySet set = new CharArraySet(asSet("ساهدهات"), false);
  ArabicAnalyzer a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير", "the", "quick", "ساهدهات" });
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير", "the", "quick", "ساهدهات" });
  a.close();

  a = new ArabicAnalyzer(CharArraySet.EMPTY_SET, CharArraySet.EMPTY_SET);
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير", "the", "quick", "ساهد" });
  assertAnalyzesTo(a, "كبيرة the quick ساهدهات", new String[] { "كبير", "the", "quick", "ساهد" });
  a.close();
}
Example #18
Source File: TestCommonGramsQueryFilterFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testCompleteGraph() throws Exception {
  CommonGramsQueryFilterFactory factory = (CommonGramsQueryFilterFactory) tokenFilterFactory("CommonGramsQuery");
  CharArraySet words = factory.getCommonWords();
  assertTrue("words is null and it shouldn't be", words != null);
  assertTrue(words.contains("the"));
  Tokenizer tokenizer = whitespaceMockTokenizer("testing the factory works");
  TokenStream stream = factory.create(tokenizer);
  assertGraphStrings(stream, "testing_the the_factory factory works");
}
Example #19
Source File: QueryAutoStopWordAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  Set<String> stopWords = stopWordsPerField.get(fieldName);
  if (stopWords == null) {
    return components;
  }
  StopFilter stopFilter = new StopFilter(components.getTokenStream(),
      new CharArraySet(stopWords, false));
  return new TokenStreamComponents(components.getSource(), stopFilter);
}
Example #20
Source File: FrenchAnalyzerProvider.java From crate with Apache License 2.0 | 5 votes |
FrenchAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
  super(indexSettings, name, settings);
  analyzer = new FrenchAnalyzer(
      Analysis.parseStopWords(env, settings, FrenchAnalyzer.getDefaultStopSet()),
      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
  );
  analyzer.setVersion(version);
}
Example #21
Source File: TestFrenchAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testExclusionTableViaCtor() throws Exception {
  CharArraySet set = new CharArraySet(1, true);
  set.add("habitable");
  FrenchAnalyzer fa = new FrenchAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable", "chist" });
  fa.close();

  fa = new FrenchAnalyzer(CharArraySet.EMPTY_SET, set);
  assertAnalyzesTo(fa, "habitable chiste", new String[] { "habitable", "chist" });
  fa.close();
}
Example #22
Source File: TestPortugueseLightStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testKeyword() throws IOException {
  final CharArraySet exclusionSet = new CharArraySet(asSet("quilométricas"), false);
  Analyzer a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenStream sink = new SetKeywordMarkerFilter(source, exclusionSet);
      return new TokenStreamComponents(source, new PortugueseLightStemFilter(sink));
    }
  };
  checkOneTerm(a, "quilométricas", "quilométricas");
  a.close();
}
Example #23
Source File: TestGalicianAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/** test use of exclusion set */
public void testExclude() throws IOException {
  CharArraySet exclusionSet = new CharArraySet(asSet("correspondente"), false);
  Analyzer a = new GalicianAnalyzer(GalicianAnalyzer.getDefaultStopSet(), exclusionSet);
  checkOneTerm(a, "correspondente", "correspondente");
  checkOneTerm(a, "corresponderá", "correspond");
  a.close();
}
Example #24
Source File: TestCapitalizationFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
static void assertCapitalizesToKeyword(String input, String expected,
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  final MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
  tokenizer.setReader(new StringReader(input));
  assertCapitalizesTo(tokenizer, new String[] { expected },
      onlyFirstWord, keep, forceFirstLetter, okPrefix,
      minWordLength, maxWordCount, maxTokenLength);
}
Example #25
Source File: TestThaiAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testReusableTokenStream() throws Exception {
  ThaiAnalyzer analyzer = new ThaiAnalyzer(CharArraySet.EMPTY_SET);
  assertAnalyzesTo(analyzer, "", new String[] {});

  assertAnalyzesTo(
      analyzer,
      "การที่ได้ต้องแสดงว่างานดี",
      new String[] { "การ", "ที่", "ได้", "ต้อง", "แสดง", "ว่า", "งาน", "ดี" });

  assertAnalyzesTo(
      analyzer,
      "บริษัทชื่อ XY&Z - คุยกับ xyz@demo.com",
      new String[] { "บริษัท", "ชื่อ", "xy", "z", "คุย", "กับ", "xyz", "demo.com" });
  analyzer.close();
}
Example #26
Source File: LithuanianAnalyzerProvider.java From crate with Apache License 2.0 | 5 votes |
LithuanianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
  super(indexSettings, name, settings);
  analyzer = new LithuanianAnalyzer(
      Analysis.parseStopWords(env, settings, LithuanianAnalyzer.getDefaultStopSet()),
      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
  );
  analyzer.setVersion(version);
}
Example #27
Source File: TestCapitalizationFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
static void assertCapitalizesTo(Tokenizer tokenizer, String expected[],
    boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter,
    Collection<char[]> okPrefix, int minWordLength, int maxWordCount,
    int maxTokenLength) throws IOException {
  CapitalizationFilter filter = new CapitalizationFilter(tokenizer,
      onlyFirstWord, keep, forceFirstLetter, okPrefix,
      minWordLength, maxWordCount, maxTokenLength);
  assertTokenStreamContents(filter, expected);
}
Example #28
Source File: GalicianAnalyzerProvider.java From crate with Apache License 2.0 | 5 votes |
GalicianAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
  super(indexSettings, name, settings);
  analyzer = new GalicianAnalyzer(
      Analysis.parseStopWords(env, settings, GalicianAnalyzer.getDefaultStopSet()),
      Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)
  );
  analyzer.setVersion(version);
}
Example #29
Source File: TestBrazilianAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet(1, true);
  set.add("Brasília");
  Tokenizer tokenizer = new LetterTokenizer();
  tokenizer.setReader(new StringReader("Brasília Brasilia"));
  BrazilianStemFilter filter = new BrazilianStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(tokenizer), set));
  assertTokenStreamContents(filter, new String[] { "brasília", "brasil" });
}
Example #30
Source File: TestWordDelimiterGraphFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testProtectedWords() throws Exception {
  TokenStream tokens = new CannedTokenStream(new Token("foo17-bar", 0, 9),
      new Token("foo-bar", 0, 7));

  CharArraySet protectedWords = new CharArraySet(new HashSet<>(Arrays.asList("foo17-BAR")), true);
  WordDelimiterGraphFilter wdf = new WordDelimiterGraphFilter(tokens,
      GENERATE_WORD_PARTS | PRESERVE_ORIGINAL | CATENATE_ALL, protectedWords);
  assertGraphStrings(wdf,
      "foo17-bar foo bar",
      "foo17-bar foo-bar",
      "foo17-bar foobar");
}