org.apache.lucene.analysis.LowerCaseFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.LowerCaseFilter. They are drawn from a range of open source projects; the original project and source file are noted above each example.
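
Before the project-specific examples, here is a minimal, self-contained sketch of the usual pattern: LowerCaseFilter is a TokenFilter that rewrites each token's term text to lower case, and it is typically wrapped around a Tokenizer inside an Analyzer's chain. The sketch assumes a recent Lucene release (7.x or later, where LowerCaseFilter lives in lucene-core and takes only the upstream TokenStream); the class name, field name, and sample text are illustrative only.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LowerCaseFilterDemo {
    public static void main(String[] args) throws IOException {
        // Analysis chain: WhitespaceTokenizer -> LowerCaseFilter
        Analyzer analyzer = new Analyzer() {
            @Override
            protected TokenStreamComponents createComponents(String fieldName) {
                Tokenizer tokenizer = new WhitespaceTokenizer();
                return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
            }
        };
        try (TokenStream stream = analyzer.tokenStream("field", "Hello Lucene WORLD")) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();                   // required before the first incrementToken()
            while (stream.incrementToken()) {
                System.out.println(term);     // prints: hello, lucene, world
            }
            stream.end();                     // required before close()
        }
    }
}
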
Example #1
Source File: CommonAnalysisPlugin.java    From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern", () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return tokenizers;
}
 
Example #2
Source File: AnalysisModule.java    From crate with Apache License 2.0
static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
    NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");

    // Add filters available in lucene-core
    preConfiguredTokenFilters.register("lowercase", PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
    preConfiguredTokenFilters.register(
        "standard",
        PreConfiguredTokenFilter.singletonWithVersion("standard", false, (reader, version) -> {
            DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_deprecation",
                "The [standard] token filter is deprecated and will be removed in a future version.");
            return reader;
        }));
    /* Note that "stop" is available in lucene-core, but its pre-built
     * version uses a set of English stop words that live in
     * lucene-analyzers-common, so "stop" is defined in the analysis-common
     * module. */

    for (AnalysisPlugin plugin: plugins) {
        for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
            preConfiguredTokenFilters.register(filter.getName(), filter);
        }
    }
    return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
}
 
Example #3
Source File: PhraseCountQueryBuilder.java    From pyramid with Apache License 2.0
protected Query doToQuery(QueryShardContext context) throws IOException {
    // Analyzer analyzer = context.getMapperService().searchAnalyzer();
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream source = analyzer.tokenStream(fieldName, value.toString())) {
        CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source));
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        if (termAtt == null) {
            return null;
        }
        List<CustomSpanTermQuery> clauses = new ArrayList<>();
        stream.reset();
        while (stream.incrementToken()) {
            Term term = new Term(fieldName, termAtt.getBytesRef());
            clauses.add(new CustomSpanTermQuery(term));
        }
        return new PhraseCountQuery(clauses.toArray(new CustomSpanTermQuery[clauses.size()]), slop, inOrder, weightedCount);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
 
Example #4
Source File: TestTeeSinkTokenFilter.java    From lucene-solr with Apache License 2.0
public void testMultipleSources() throws Exception {
  final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
  final TokenStream source1 = new CachingTokenFilter(tee1);

  tee1.addAttribute(CheckClearAttributesAttribute.class);

  MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(buffer2.toString()));
  final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
  final TokenStream source2 = tee2;

  assertTokenStreamContents(source1, tokens1);
  assertTokenStreamContents(source2, tokens2);

  TokenStream lowerCasing = new LowerCaseFilter(source1);
  String[] lowerCaseTokens = new String[tokens1.length];
  for (int i = 0; i < tokens1.length; i++)
    lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
  assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
 
Example #5
Source File: TestCharTokenizers.java    From lucene-solr with Apache License 2.0
public void testReadSupplementaryChars() throws IOException {
  StringBuilder builder = new StringBuilder();
  // create random input
  int num = 1024 + random().nextInt(1024);
  num *= RANDOM_MULTIPLIER;
  for (int i = 1; i < num; i++) {
    builder.append("\ud801\udc1cabc");
    if((i % 10) == 0)
      builder.append(" ");
  }
  // internal buffer size is 1024 make sure we have a surrogate pair right at the border
  builder.insert(1023, "\ud801\udc1c");
  Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString()));
  assertTokenStreamContents(new LowerCaseFilter(tokenizer), builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
 
Example #6
Source File: SearchService.java    From subsonic with GNU General Public License v3.0
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }
    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);

    return streams.filteredTokenStream;
}
 
Example #7
Source File: TestAnalyzers.java    From lucene-solr with Apache License 2.0
/**
 * Test that LowerCaseFilter handles the lowercasing correctly if the term
 * buffer has a trailing surrogate character leftover and the current term in
 * the buffer ends with a corresponding leading surrogate.
 */
public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
  // test if the limit of the termbuffer is correctly used with supplementary
  // chars
  WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("BogustermBogusterm\udc16"));
  LowerCaseFilter filter = new LowerCaseFilter(tokenizer);
  assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
  filter.reset();
  String highSurEndingUpper = "BogustermBoguster\ud801";
  String highSurEndingLower = "bogustermboguster\ud801";
  tokenizer.setReader(new StringReader(highSurEndingUpper));
  assertTokenStreamContents(filter, new String[] {highSurEndingLower});
  assertTrue(filter.hasAttribute(CharTermAttribute.class));
  char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
  int length = highSurEndingLower.length();
  assertEquals('\ud801', termBuffer[length - 1]);
}
 
Example #8
Source File: NGramAnalyzer.java    From uyuni with GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))), min_ngram, max_ngram);
}
 
Example #9
Source File: TestGermanStemFilter.java    From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer t = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(t,
          new GermanStemFilter(new LowerCaseFilter(t)));
    }
  };
}
 
Example #10
Source File: TestGermanAnalyzer.java    From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
  CharArraySet set = new CharArraySet( 1, true);
  set.add("fischen");
  final Tokenizer in = new LetterTokenizer();
  in.setReader(new StringReader("Fischen Trinken"));
  GermanStemFilter filter = new GermanStemFilter(
      new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
  assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
 
Example #11
Source File: TestCharTokenizers.java    From lucene-solr with Apache License 2.0
public void testExtendCharBuffer() throws IOException {
  for (int i = 0; i < 40; i++) {
    StringBuilder builder = new StringBuilder();
    for (int j = 0; j < 1+i; j++) {
      builder.append("a");
    }
    builder.append("\ud801\udc1cabc");
    Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT)});
  }
}
 
Example #12
Source File: TestCharTokenizers.java    From lucene-solr with Apache License 2.0
public void testMaxWordLength() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 255; i++) {
    builder.append("A");
  }
  Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
 
Example #13
Source File: TestCharTokenizers.java    From lucene-solr with Apache License 2.0
public void testMaxWordLengthWithSupplementary() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 254; i++) {
    builder.append("A");
  }
  builder.append("\ud801\udc1c");
  Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
 
Example #14
Source File: AsIsAnalyzer.java    From yes-cart with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    if (toLowerCase) {
        final UnicodeWhitespaceTokenizer tokenizer = new UnicodeWhitespaceTokenizer();
        return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
    }
    return new TokenStreamComponents(new UnicodeWhitespaceTokenizer());
}
 
Example #15
Source File: NGramAnalyzer.java    From spacewalk with GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))), min_ngram, max_ngram);
}
 
Example #16
Source File: LowerCaseTokenFilterFactory.java    From crate with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    if (lang == null) {
        return new LowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("greek")) {
        return new GreekLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("irish")) {
        return new IrishLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("turkish")) {
        return new TurkishLowerCaseFilter(tokenStream);
    } else {
        throw new IllegalArgumentException("language [" + lang + "] not supported for lower case");
    }
}
 
Example #17
Source File: TestAnalyzers.java    From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = random().nextBoolean() ? new WhitespaceTokenizer() : new UnicodeWhitespaceTokenizer();
  return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
 
Example #18
Source File: QueryParserTestBase.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new MockCollationFilter(new LowerCaseFilter(in));
}
 
Example #19
Source File: StandardAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
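
A note on the normalize(...) overrides in this and several of the following examples: Analyzer#normalize(String, String) runs only the normalization part of an analyzer's chain, without tokenizing, and is what query parsers call to normalize individual query terms (for example in wildcard or range queries). A brief usage sketch, assuming a recent Lucene release; the field name and input text are illustrative:

Analyzer analyzer = new StandardAnalyzer();
// normalize() applies the LowerCaseFilter above to the raw text, skipping tokenization
BytesRef normalized = analyzer.normalize("body", "FooBAR");
// normalized.utf8ToString() -> "foobar"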
 
Example #20
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static TokenFilter defaultTokenFilter(@NotNull Tokenizer source) {
    TokenFilter filteredSource = new LowerCaseFilter(source);
    return new WordDelimiterGraphFilter(filteredSource, SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, null);
}
 
Example #21
Source File: CustomWhitespaceAnalyzer.java    From gravitee-management-rest-api with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
 
Example #22
Source File: CzechAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example #23
Source File: PortugueseAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example #24
Source File: StopAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example #25
Source File: SimpleAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example #26
Source File: SimpleAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
  Tokenizer tokenizer = new LetterTokenizer();
  return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
 
Example #27
Source File: SpanishAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example #28
Source File: HungarianAnalyzer.java    From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  return new LowerCaseFilter(in);
}
 
Example #29
Source File: CharAnalyzer.java    From tephra with MIT License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    Tokenizer tokenizer = new CharTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
 
Example #30
Source File: CharAnalyzer.java    From tephra with MIT License
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}