org.apache.lucene.analysis.ngram.NGramTokenFilter Java Examples
The following examples show how to use
org.apache.lucene.analysis.ngram.NGramTokenFilter.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: NGramAnalyzer.java From uyuni with GNU General Public License v2.0 | 5 votes |
/**
 * Assembles the analysis chain for the given input and returns its final stage.
 *
 * @param fieldName not consulted by this implementation
 * @param reader    supplies the characters to be tokenized
 * @return the {@code NGramTokenFilter} that wraps the chain, configured with
 *         {@code min_ngram} and {@code max_ngram}
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Innermost stage: tokenize the raw character stream.
    StandardTokenizer tokenizer = new StandardTokenizer(reader);
    // Wrap the tokenizer in the standard filter, then the lower-case filter.
    StandardFilter standardized = new StandardFilter(tokenizer);
    LowerCaseFilter lowerCased = new LowerCaseFilter(standardized);
    // Outermost stage: the n-gram filter using this analyzer's configured bounds.
    return new NGramTokenFilter(lowerCased, min_ngram, max_ngram);
}
Example #2
Source File: EdgeNGramTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 5 votes |
EdgeNGramTokenFilterFactory(Index index, Settings indexSettings, String name, Settings settings) { super(index, indexSettings, name, settings); this.minGram = settings.getAsInt("min_gram", NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE); this.maxGram = settings.getAsInt("max_gram", NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE); this.side = parseSide(settings.get("side", "front")); this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings); }
Example #3
Source File: NGramTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 5 votes |
@SuppressWarnings("deprecation") @Override public TokenStream create(TokenStream tokenStream) { final Version version = this.version == Version.LUCENE_4_3 ? Version.LUCENE_4_4 : this.version; // we supported it since 4.3 if (version.onOrAfter(Version.LUCENE_4_3)) { return new NGramTokenFilter(tokenStream, minGram, maxGram); } else { return new Lucene43NGramTokenFilter(tokenStream, minGram, maxGram); } }
Example #4
Source File: NGramAnalyzer.java From spacewalk with GNU General Public License v2.0 | 5 votes |
/**
 * Produces the token stream for this analyzer by stacking filters on a tokenizer.
 *
 * @param fieldName unused
 * @param reader    the text source handed to the tokenizer
 * @return an {@code NGramTokenFilter} built over the filtered tokens with the
 *         {@code min_ngram} / {@code max_ngram} settings
 */
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Build from the inside out: tokenizer -> standard filter -> lower-case filter.
    StandardTokenizer source = new StandardTokenizer(reader);
    StandardFilter afterStandard = new StandardFilter(source);
    LowerCaseFilter afterLowerCase = new LowerCaseFilter(afterStandard);

    // The n-gram filter is the last stage and is what callers receive.
    NGramTokenFilter ngrams = new NGramTokenFilter(afterLowerCase, min_ngram, max_ngram);
    return ngrams;
}
Example #5
Source File: CommonAnalysisPlugin.java From crate with Apache License 2.0 | 5 votes |
@Override public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() { List<PreConfiguredTokenFilter> filters = new ArrayList<>(); filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new)); filters.add(PreConfiguredTokenFilter.singleton( "common_grams", false, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET))); filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input -> new DelimitedPayloadTokenFilter(input, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER))); filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input -> new DelimitedPayloadTokenFilter(input, DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER, DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER))); filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer()))); filters.add(PreConfiguredTokenFilter.singleton( "edge_ngram", 
false, input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL))); filters.add(PreConfiguredTokenFilter.singleton( "elision", true, input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES)) ); filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer()))); filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("length", false, input -> new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless filters.add(PreConfiguredTokenFilter.singleton( "limit", false, input -> new LimitTokenCountFilter( input, LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT, LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS) ) ); filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false))); filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian"))); 
filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> { TokenStream ts = new ShingleFilter(input); /** * We disable the graph analysis on this token stream * because it produces shingles of different size. * Graph analysis on such token stream is useless and dangerous as it may create too many paths * since shingles of different size are not aligned in terms of positions. */ ts.addAttribute(DisableGraphAttribute.class); return ts; })); filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English"))); filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new)); // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET))); filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10))); filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new)); filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input -> new WordDelimiterFilter(input, WordDelimiterFilter.GENERATE_WORD_PARTS | WordDelimiterFilter.GENERATE_NUMBER_PARTS | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE | 
WordDelimiterFilter.SPLIT_ON_NUMERICS | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null))); filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input -> new WordDelimiterGraphFilter(input, WordDelimiterGraphFilter.GENERATE_WORD_PARTS | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null))); return filters; }
Example #6
Source File: NGramTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 4 votes |
/**
 * Creates the factory and reads the gram-size bounds from {@code settings}.
 *
 * <p>{@code min_gram} and {@code max_gram} fall back to
 * {@code NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE} and
 * {@code NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE} respectively.
 *
 * @param index                forwarded to the superclass constructor
 * @param indexSettingsService provides the index settings forwarded to the superclass
 * @param name                 forwarded to the superclass constructor
 * @param settings             filter-specific settings
 */
@Inject
public NGramTokenFilterFactory(
        Index index,
        IndexSettingsService indexSettingsService,
        @Assisted String name,
        @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    // Lower bound first, then upper bound - same lookup order as before.
    this.minGram = settings.getAsInt(
        "min_gram",
        NGramTokenFilter.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt(
        "max_gram",
        NGramTokenFilter.DEFAULT_MAX_NGRAM_SIZE);
}
Example #7
Source File: StrictImapSearchAnalyzer.java From james-project with Apache License 2.0 | 4 votes |
/**
 * Builds the token stream for this analyzer: a sentence tokenizer wrapped in an
 * upper-case filter, wrapped in an n-gram filter configured with
 * {@code minTokenLength} and {@code maxTokenLength}.
 *
 * @param fieldName not consulted by this implementation
 * @param reader    supplies the text to tokenize
 * @return the outermost {@code NGramTokenFilter}
 * @see org.apache.lucene.analysis.Analyzer#tokenStream(java.lang.String, java.io.Reader)
 */
@Override
public TokenStream tokenStream(String fieldName, Reader reader) {
    // Inside-out construction of the chain.
    SentenceTokenizer sentences = new SentenceTokenizer(reader);
    UpperCaseFilter upperCased = new UpperCaseFilter(sentences);
    return new NGramTokenFilter(upperCased, minTokenLength, maxTokenLength);
}
Example #8
Source File: NGramTokenFilterFactory.java From crate with Apache License 2.0 | 4 votes |
/**
 * Wraps the given stream in an {@code NGramTokenFilter}.
 *
 * @param tokenStream the stream to wrap
 * @return a new {@code NGramTokenFilter} built from {@code tokenStream},
 *         {@code minGram}, {@code maxGram} and a fixed {@code false} final argument
 */
@Override
public TokenStream create(TokenStream tokenStream) {
    // The trailing boolean is always false for filters created by this factory.
    final NGramTokenFilter ngrams = new NGramTokenFilter(
        tokenStream,
        minGram,
        maxGram,
        false);
    return ngrams;
}