org.apache.lucene.analysis.LowerCaseFilter Java Examples
The following examples show how to use
org.apache.lucene.analysis.LowerCaseFilter.
Each example links back to its original project and source file.
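Most of the examples below share one pattern: construct a Tokenizer, wrap it in a LowerCaseFilter, and consume the resulting TokenStream. The following minimal sketch illustrates that pattern against a recent Lucene core API (7.x or later, where LowerCaseFilter lives in lucene-core); the input text and class name are illustrative only, not taken from any project below.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class LowerCaseFilterDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("Hello Lucene WORLD"));
        // LowerCaseFilter lowercases each token the tokenizer produces.
        try (TokenStream stream = new LowerCaseFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // hello, lucene, world
            }
            stream.end();
        }
    }
}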
Example #1
Source File: CommonAnalysisPlugin.java From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern",
        () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return tokenizers;
}
Example #2
Source File: AnalysisModule.java From crate with Apache License 2.0
static Map<String, PreConfiguredTokenFilter> setupPreConfiguredTokenFilters(List<AnalysisPlugin> plugins) {
    NamedRegistry<PreConfiguredTokenFilter> preConfiguredTokenFilters = new NamedRegistry<>("pre-configured token_filter");

    // Add filters available in lucene-core
    preConfiguredTokenFilters.register("lowercase",
        PreConfiguredTokenFilter.singleton("lowercase", true, LowerCaseFilter::new));
    preConfiguredTokenFilters.register(
        "standard",
        PreConfiguredTokenFilter.singletonWithVersion("standard", false, (reader, version) -> {
            DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_deprecation",
                "The [standard] token filter is deprecated and will be removed in a future version.");
            return reader;
        }));
    /* Note that "stop" is available in lucene-core but its pre-built
     * version uses a set of English stop words that are in
     * lucene-analyzers-common so "stop" is defined in the analysis-common
     * module. */

    for (AnalysisPlugin plugin : plugins) {
        for (PreConfiguredTokenFilter filter : plugin.getPreConfiguredTokenFilters()) {
            preConfiguredTokenFilters.register(filter.getName(), filter);
        }
    }
    return unmodifiableMap(preConfiguredTokenFilters.getRegistry());
}
Example #3
Source File: PhraseCountQueryBuilder.java From pyramid with Apache License 2.0
protected Query doToQuery(QueryShardContext context) throws IOException {
    // Analyzer analyzer = context.getMapperService().searchAnalyzer();
    Analyzer analyzer = new WhitespaceAnalyzer();
    try (TokenStream source = analyzer.tokenStream(fieldName, value.toString())) {
        CachingTokenFilter stream = new CachingTokenFilter(new LowerCaseFilter(source));
        TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
        if (termAtt == null) {
            return null;
        }
        List<CustomSpanTermQuery> clauses = new ArrayList<>();
        stream.reset();
        while (stream.incrementToken()) {
            Term term = new Term(fieldName, termAtt.getBytesRef());
            clauses.add(new CustomSpanTermQuery(term));
        }
        return new PhraseCountQuery(clauses.toArray(new CustomSpanTermQuery[clauses.size()]), slop, inOrder, weightedCount);
    } catch (IOException e) {
        throw new RuntimeException("Error analyzing query text", e);
    }
}
Example #4
Source File: TestTeeSinkTokenFilter.java From lucene-solr with Apache License 2.0
public void testMultipleSources() throws Exception {
    final TeeSinkTokenFilter tee1 = new TeeSinkTokenFilter(whitespaceMockTokenizer(buffer1.toString()));
    final TokenStream source1 = new CachingTokenFilter(tee1);

    tee1.addAttribute(CheckClearAttributesAttribute.class);

    MockTokenizer tokenizer = new MockTokenizer(tee1.getAttributeFactory(), MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(new StringReader(buffer2.toString()));
    final TeeSinkTokenFilter tee2 = new TeeSinkTokenFilter(tokenizer);
    final TokenStream source2 = tee2;

    assertTokenStreamContents(source1, tokens1);
    assertTokenStreamContents(source2, tokens2);

    TokenStream lowerCasing = new LowerCaseFilter(source1);
    String[] lowerCaseTokens = new String[tokens1.length];
    for (int i = 0; i < tokens1.length; i++) {
        lowerCaseTokens[i] = tokens1[i].toLowerCase(Locale.ROOT);
    }
    assertTokenStreamContents(lowerCasing, lowerCaseTokens);
}
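TeeSinkTokenFilter exists so that a single tokenization pass can feed several consumers. The test above drives it through Lucene's assert helpers; the sketch below shows the same idea in plainer form. It assumes the Lucene 6+ sinks API (TeeSinkTokenFilter.newSinkTokenStream()); the input string and helper method are invented for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.sinks.TeeSinkTokenFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class TeeSinkDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("Mixed CASE Tokens"));
        TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenizer);
        TokenStream sink = tee.newSinkTokenStream(); // replays tokens cached by the tee

        // Branch 1: lowercase on top of the tee; consuming it fills the sink's cache.
        printAll(new LowerCaseFilter(tee)); // mixed, case, tokens
        // Branch 2: the sink replays the original, un-lowercased tokens.
        printAll(sink);                     // Mixed, CASE, Tokens
    }

    private static void printAll(TokenStream stream) throws IOException {
        CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
        stream.reset();
        while (stream.incrementToken()) {
            System.out.println(term);
        }
        stream.end();
        stream.close();
    }
}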
Example #5
Source File: TestCharTokenizers.java From lucene-solr with Apache License 2.0
public void testReadSupplementaryChars() throws IOException {
    StringBuilder builder = new StringBuilder();
    // create random input
    int num = 1024 + random().nextInt(1024);
    num *= RANDOM_MULTIPLIER;
    for (int i = 1; i < num; i++) {
        builder.append("\ud801\udc1cabc");
        if ((i % 10) == 0) {
            builder.append(" ");
        }
    }
    // internal buffer size is 1024; make sure we have a surrogate pair right at the border
    builder.insert(1023, "\ud801\udc1c");
    Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString()));
    assertTokenStreamContents(new LowerCaseFilter(tokenizer),
        builder.toString().toLowerCase(Locale.ROOT).split(" "));
}
Example #6
Source File: SearchService.java From subsonic with GNU General Public License v3.0
@Override
public TokenStream reusableTokenStream(String fieldName, Reader reader) throws IOException {
    class SavedStreams {
        StandardTokenizer tokenStream;
        TokenStream filteredTokenStream;
    }

    SavedStreams streams = (SavedStreams) getPreviousTokenStream();
    if (streams == null) {
        streams = new SavedStreams();
        setPreviousTokenStream(streams);
        streams.tokenStream = new StandardTokenizer(LUCENE_VERSION, reader);
        streams.filteredTokenStream = new StandardFilter(streams.tokenStream);
        streams.filteredTokenStream = new LowerCaseFilter(streams.filteredTokenStream);
        streams.filteredTokenStream = new StopFilter(true, streams.filteredTokenStream, STOP_WORDS_SET);
        streams.filteredTokenStream = new ASCIIFoldingFilter(streams.filteredTokenStream);
    } else {
        streams.tokenStream.reset(reader);
    }

    streams.tokenStream.setMaxTokenLength(DEFAULT_MAX_TOKEN_LENGTH);
    return streams.filteredTokenStream;
}
Example #7
Source File: TestAnalyzers.java From lucene-solr with Apache License 2.0
/**
 * Test that LowerCaseFilter handles the lowercasing correctly if the term
 * buffer has a trailing surrogate character leftover and the current term in
 * the buffer ends with a corresponding leading surrogate.
 */
public void testLowerCaseFilterLowSurrogateLeftover() throws IOException {
    // test if the limit of the term buffer is correctly used with supplementary chars
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("BogustermBogusterm\udc16"));
    LowerCaseFilter filter = new LowerCaseFilter(tokenizer);
    assertTokenStreamContents(filter, new String[] {"bogustermbogusterm\udc16"});
    filter.reset();
    String highSurEndingUpper = "BogustermBoguster\ud801";
    String highSurEndingLower = "bogustermboguster\ud801";
    tokenizer.setReader(new StringReader(highSurEndingUpper));
    assertTokenStreamContents(filter, new String[] {highSurEndingLower});
    assertTrue(filter.hasAttribute(CharTermAttribute.class));
    char[] termBuffer = filter.getAttribute(CharTermAttribute.class).buffer();
    int length = highSurEndingLower.length();
    assertEquals('\ud801', termBuffer[length - 1]);
}
Example #8
Source File: NGramAnalyzer.java From uyuni with GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))),
            min_ngram, max_ngram);
}
Example #9
Source File: TestGermanStemFilter.java From lucene-solr with Apache License 2.0
@Override
public void setUp() throws Exception {
    super.setUp();
    analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer t = new MockTokenizer(MockTokenizer.KEYWORD, false);
            return new TokenStreamComponents(t, new GermanStemFilter(new LowerCaseFilter(t)));
        }
    };
}
Example #10
Source File: TestGermanAnalyzer.java From lucene-solr with Apache License 2.0
public void testWithKeywordAttribute() throws IOException {
    CharArraySet set = new CharArraySet(1, true);
    set.add("fischen");
    final Tokenizer in = new LetterTokenizer();
    in.setReader(new StringReader("Fischen Trinken"));
    GermanStemFilter filter = new GermanStemFilter(
        new SetKeywordMarkerFilter(new LowerCaseFilter(in), set));
    assertTokenStreamContents(filter, new String[] { "fischen", "trink" });
}
Example #11
Source File: TestCharTokenizers.java From lucene-solr with Apache License 2.0
public void testExtendCharBuffer() throws IOException {
    for (int i = 0; i < 40; i++) {
        StringBuilder builder = new StringBuilder();
        for (int j = 0; j < 1 + i; j++) {
            builder.append("a");
        }
        builder.append("\ud801\udc1cabc");
        Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
        tokenizer.setReader(new StringReader(builder.toString()));
        assertTokenStreamContents(new LowerCaseFilter(tokenizer),
            new String[] {builder.toString().toLowerCase(Locale.ROOT)});
    }
}
Example #12
Source File: TestCharTokenizers.java From lucene-solr with Apache License 2.0
public void testMaxWordLength() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 255; i++) {
        builder.append("A");
    }
    Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(new LowerCaseFilter(tokenizer),
        new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Example #13
Source File: TestCharTokenizers.java From lucene-solr with Apache License 2.0
public void testMaxWordLengthWithSupplementary() throws IOException {
    StringBuilder builder = new StringBuilder();
    for (int i = 0; i < 254; i++) {
        builder.append("A");
    }
    builder.append("\ud801\udc1c");
    Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
    tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
    assertTokenStreamContents(new LowerCaseFilter(tokenizer),
        new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
Example #14
Source File: AsIsAnalyzer.java From yes-cart with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    if (toLowerCase) {
        final UnicodeWhitespaceTokenizer tokenizer = new UnicodeWhitespaceTokenizer();
        return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
    }
    return new TokenStreamComponents(new UnicodeWhitespaceTokenizer());
}
Example #15
Source File: NGramAnalyzer.java From spacewalk with GNU General Public License v2.0
/**
 * @param fieldName ignored param
 * @param reader contains data to parse
 * @return TokenStream of ngrams
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    return new NGramTokenFilter(
            new LowerCaseFilter(
                new StandardFilter(
                    new StandardTokenizer(reader))),
            min_ngram, max_ngram);
}
Example #16
Source File: LowerCaseTokenFilterFactory.java From crate with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    if (lang == null) {
        return new LowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("greek")) {
        return new GreekLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("irish")) {
        return new IrishLowerCaseFilter(tokenStream);
    } else if (lang.equalsIgnoreCase("turkish")) {
        return new TurkishLowerCaseFilter(tokenStream);
    } else {
        throw new IllegalArgumentException("language [" + lang + "] not support for lower case");
    }
}
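The language-specific branches exist because plain LowerCaseFilter applies Unicode default casing rules, which are wrong for languages such as Turkish, where an uppercase I must lowercase to a dotless ı. A quick sketch of the difference, assuming lucene-analyzers-common is on the classpath; the input word and class name are invented for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tr.TurkishLowerCaseFilter;

public class TurkishLowerCaseDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("ISPARTA"));
        // Turkish-aware lowercasing maps I to dotless ı, unlike the default rules.
        try (TokenStream stream = new TurkishLowerCaseFilter(tokenizer)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // "ısparta", not "isparta"
            }
            stream.end();
        }
    }
}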
Example #17
Source File: TestAnalyzers.java From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
    Tokenizer tokenizer = random().nextBoolean()
        ? new WhitespaceTokenizer()
        : new UnicodeWhitespaceTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
Example #18
Source File: QueryParserTestBase.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new MockCollationFilter(new LowerCaseFilter(in));
}
Example #19
Source File: StandardAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
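This normalize(String, TokenStream) override, which recurs in several analyzers below, is the hook Lucene invokes when a single query term must be normalized without full tokenization, for example for wildcard, prefix, and regexp terms. A short sketch of how it is exercised from the outside, assuming Lucene 7+ where Analyzer.normalize(String, String) returns a BytesRef; the field name and input are invented.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.util.BytesRef;

public class NormalizeDemo {
    public static void main(String[] args) {
        try (StandardAnalyzer analyzer = new StandardAnalyzer()) {
            // normalize() runs the normalize(fieldName, TokenStream) chain above,
            // i.e. just the LowerCaseFilter, without tokenizing the input.
            BytesRef normalized = analyzer.normalize("title", "FooBar");
            System.out.println(normalized.utf8ToString()); // foobar
        }
    }
}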
Example #20
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0
@NotNull
private static TokenFilter defaultTokenFilter(@NotNull Tokenizer source) {
    TokenFilter filteredSource = new LowerCaseFilter(source);
    return new WordDelimiterGraphFilter(filteredSource,
        SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS, null);
}
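Here LowerCaseFilter runs before a WordDelimiterGraphFilter whose flag mask controls how compound tokens are split. The sketch below shows what those three flags do to a drug-name-like token; the input string and class name are invented for illustration.

import java.io.IOException;
import java.io.StringReader;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

import static org.apache.lucene.analysis.miscellaneous.WordDelimiterGraphFilter.*;

public class WordDelimiterDemo {
    public static void main(String[] args) throws IOException {
        Tokenizer tokenizer = new WhitespaceTokenizer();
        tokenizer.setReader(new StringReader("interferon-alpha2b"));
        int flags = SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS;
        // Splits on the hyphen and on letter/digit boundaries.
        try (TokenStream stream = new WordDelimiterGraphFilter(tokenizer, flags, null)) {
            CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
            stream.reset();
            while (stream.incrementToken()) {
                System.out.println(term); // interferon, alpha, 2, b
            }
            stream.end();
        }
    }
}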
Example #21
Source File: CustomWhitespaceAnalyzer.java From gravitee-management-rest-api with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    final Tokenizer tokenizer = new WhitespaceTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
Example #22
Source File: CzechAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
Example #23
Source File: PortugueseAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
Example #24
Source File: StopAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
Example #25
Source File: SimpleAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
Example #26
Source File: SimpleAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    Tokenizer tokenizer = new LetterTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
Example #27
Source File: SpanishAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
Example #28
Source File: HungarianAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}
Example #29
Source File: CharAnalyzer.java From tephra with MIT License
@Override
protected TokenStreamComponents createComponents(final String fieldName) {
    Tokenizer tokenizer = new CharTokenizer();
    return new TokenStreamComponents(tokenizer, new LowerCaseFilter(tokenizer));
}
Example #30
Source File: CharAnalyzer.java From tephra with MIT License
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
    return new LowerCaseFilter(in);
}