org.apache.lucene.analysis.shingle.ShingleFilter Java Examples
The following examples show how to use
org.apache.lucene.analysis.shingle.ShingleFilter.
You can vote up the ones you like or vote down the ones you don't like,
and you can go to the original project or source file by following the links above each example. You can also check out the related API usage on the sidebar.
Example #1
Source File: TestConditionalTokenFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
public void testReadaheadWithNoFiltering() throws IOException {
  // A ConditionalTokenFilter that always filters: every token goes through the
  // wrapped ShingleFilter, so we expect unigrams interleaved with bigrams.
  Analyzer shingleAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new ClassicTokenizer();
      TokenStream conditional = new ConditionalTokenFilter(tokenizer, in -> new ShingleFilter(in, 2)) {
        @Override
        protected boolean shouldFilter() throws IOException {
          return true;
        }
      };
      return new TokenStreamComponents(tokenizer, conditional);
    }
  };
  String text = "one two three four";
  try (TokenStream stream = shingleAnalyzer.tokenStream("", text)) {
    assertTokenStreamContents(stream, new String[]{
        "one", "one two", "two", "two three", "three", "three four", "four"
    });
  }
}
Example #2
Source File: PhrasesIdentificationComponent.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Helper method, public for testing purposes only.
 * <p>
 * Given an analyzer, inspects it to determine if:
 * <ul>
 * <li>it is a {@link TokenizerChain}</li>
 * <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
 * </ul>
 * <p>
 * If these conditions are met, then this method returns the <code>maxShingleSize</code>
 * in effect for this analyzer, otherwise returns -1.
 * </p>
 *
 * @param analyzer An analyzer to inspect
 * @return <code>maxShingleSize</code> if available
 * @lucene.internal
 */
public static int getMaxShingleSize(Analyzer analyzer) {
  // Only a TokenizerChain exposes its filter factories for inspection.
  if (!TokenizerChain.class.isInstance(analyzer)) {
    return -1;
  }
  final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
  if (0 == factories.length) {
    return -1;
  }
  // result stays -1 until the first ShingleFilterFactory is found.
  int result = -1;
  for (TokenFilterFactory tff : factories) {
    if (ShingleFilterFactory.class.isInstance(tff)) {
      if (0 < result) {
        // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
        return -1;
      }
      // would be nice if there was an easy way to just ask a factory for the effective value
      // of an argument...
      final Map<String,String> args = tff.getOriginalArgs();
      // NOTE(review): Integer.parseInt throws NumberFormatException if the factory was
      // configured with a non-numeric "maxShingleSize" — presumably callers rely on
      // factories having been validated already; confirm.
      result = args.containsKey("maxShingleSize") ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
    }
  }
  return result;
}
Example #3
Source File: ShingleTokenFilterFactory.java From crate with Apache License 2.0 | 6 votes |
/**
 * Builds a shingle token filter factory from index settings.
 * <p>
 * Reads min/max shingle size, unigram output flags, token separator and filler
 * token from {@code settings}, falling back to the {@link ShingleFilter}
 * defaults, and logs a deprecation warning when the effective shingle-size
 * spread exceeds the index's configured maximum.
 *
 * @param indexSettings index-level settings (supplies the max allowed shingle diff)
 * @param env           node environment (unused here, required by the factory contract)
 * @param name          the filter name
 * @param settings      the filter's own settings block
 */
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
    Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
    Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);

    // Unigram output adds one extra shingle "size" to the spread.
    int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
    if (shingleDiff > maxAllowedShingleDiff) {
        // FIX: the original concatenation produced "...TokenFilter,expected..." with no
        // space between the two halves of the message.
        deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter,"
            + " expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
    }

    String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
}
Example #4
Source File: ShingleTokenFilterFactory.java From crate with Apache License 2.0 | 6 votes |
/** Wraps the given stream in a {@link ShingleFilter} configured from this factory's options. */
@Override
public TokenStream create(TokenStream tokenStream) {
    ShingleFilter shingle = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    shingle.setOutputUnigrams(outputUnigrams);
    shingle.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    shingle.setTokenSeparator(tokenSeparator);
    shingle.setFillerToken(fillerToken);
    boolean producesMixedSizes = outputUnigrams || (minShingleSize != maxShingleSize);
    if (producesMixedSizes) {
        /*
         * Disable graph analysis on this token stream because it produces
         * shingles of different sizes. Graph analysis on such a stream is
         * useless and dangerous: shingles of different sizes are not aligned
         * in terms of positions, so it may create too many paths.
         */
        shingle.addAttribute(DisableGraphAttribute.class);
    }
    return shingle;
}
Example #5
Source File: ShingleTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Builds a shingle token filter factory from index settings, reading each
 * option with the corresponding {@link ShingleFilter} default as fallback.
 */
@Inject
public ShingleTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    Integer minSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    Integer maxSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    Boolean unigrams = settings.getAsBoolean("output_unigrams", true);
    Boolean unigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
    String separator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    String filler = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    factory = new Factory("shingle", minSize, maxSize, unigrams, unigramsIfNoShingles, separator, filler);
}
Example #6
Source File: ShingleTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 5 votes |
/** Wraps the stream in a {@link ShingleFilter} configured from this factory's options. */
@Override
public TokenStream create(TokenStream tokenStream) {
    ShingleFilter shingles = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    shingles.setTokenSeparator(tokenSeparator);
    shingles.setFillerToken(fillerToken);
    shingles.setOutputUnigrams(outputUnigrams);
    shingles.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    return shingles;
}
Example #7
Source File: TestConditionalTokenFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testReadaheadWithFiltering() throws IOException {
  // "three" is protected: it bypasses the shingle filter and comes through as-is.
  CharArraySet protectedTerms = new CharArraySet(2, true);
  protectedTerms.add("three");
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new ClassicTokenizer();
      TokenStream sink = new ProtectedTermFilter(protectedTerms, source, in -> new ShingleFilter(in, 2));
      // ValidatingTokenFilter checks offset/position consistency as tokens flow through.
      sink = new ValidatingTokenFilter(sink, "1");
      return new TokenStreamComponents(source, sink);
    }
  };
  String input = "one two three four";
  try (TokenStream ts = analyzer.tokenStream("", input)) {
    assertTokenStreamContents(ts,
        // terms: the shingle "two three" is suppressed because "three" is protected
        new String[]{ "one", "one two", "two", "three", "four" },
        // start offsets
        new int[]{ 0, 0, 4, 8, 14 },
        // end offsets
        new int[]{ 3, 7, 7, 13, 18 },
        // position increments (the bigram shares its position with "one")
        new int[]{ 1, 0, 1, 1, 1 },
        // position lengths (the bigram spans two positions)
        new int[]{ 1, 2, 1, 1, 1 },
        // final offset
        18);
  }
}
Example #8
Source File: TestConditionalTokenFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testFilteringWithReadahead() throws IOException {
  // Both a unigram and a bigram are protected, so both survive the type filter below.
  CharArraySet protectedTerms = new CharArraySet(2, true);
  protectedTerms.add("two");
  protectedTerms.add("two three");
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new StandardTokenizer();
      // Shingle first (up to trigrams), then drop every token of type "ALL"
      // except the protected ones.
      TokenStream sink = new ShingleFilter(source, 3);
      sink = new ProtectedTermFilter(protectedTerms, sink, in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
      return new TokenStreamComponents(source, sink);
    }
  };
  String input = "one two three four";
  try (TokenStream ts = analyzer.tokenStream("", input)) {
    assertTokenStreamContents(ts,
        // terms: only the protected tokens remain
        new String[]{ "two", "two three" },
        // start offsets
        new int[]{ 4, 4 },
        // end offsets
        new int[]{ 7, 13 },
        // position increments
        new int[]{ 2, 0 },
        // position lengths
        new int[]{ 1, 2 },
        // final offset
        18);
  }
}
Example #9
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0 | 5 votes |
/**
 * Creates an analyzer that whitespace-tokenizes input and emits shingles of up
 * to {@code maxShingles} tokens, including the original unigrams.
 */
@NotNull
private static Analyzer createShingleAnalyzer(int maxShingles) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      // NOTE(review): setReader is called with the field NAME parameter, not field
      // content, and createComponents normally leaves reader wiring to the framework —
      // looks intentional in this project's usage, but confirm against the caller.
      source.setReader(new StringReader(field));
      ShingleFilter shingleFilter = new ShingleFilter(defaultTokenFilter(source), maxShingles);
      shingleFilter.setOutputUnigrams(true);
      return new TokenStreamComponents(source, shingleFilter);
    }
  };
}
Example #10
Source File: ContentAnalyzer.java From modernmt with Apache License 2.0 | 5 votes |
/**
 * Builds the analysis chain for a field: whitespace tokenization, punctuation
 * stripping, and — when shingleSize is positive — fixed-width shingling.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
    TokenStream stream = new PunctuationFilter(tokenizer);
    if (shingleSize > 0) {
        // min == max, so every shingle has exactly shingleSize tokens.
        ShingleFilter shingles = new ShingleFilter(stream, shingleSize, shingleSize);
        shingles.setOutputUnigrams(outputUnigrams);
        stream = shingles;
    }
    return new TokenStreamComponents(tokenizer, stream);
}
Example #11
Source File: ConcatenateFilter.java From SolrTextTagger with Apache License 2.0 | 5 votes |
/**
 * Concatenates every token from the input stream into a single output token,
 * joined by {@code separator}. Emits exactly one token, then signals end of
 * stream on subsequent calls.
 */
@Override
public final boolean incrementToken() throws IOException {
  // Single-shot: the one concatenated token has already been emitted.
  if (done) return false;
  done = true;

  // Drain the upstream tokens into buf, separator-joined.
  buf.setLength(0);
  boolean firstTerm = true;
  while (input.incrementToken()) {
    if (!firstTerm) {
      buf.append(separator);
    }
    //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13
    buf.append(termAtt);
    firstTerm = false;
  }
  input.end();//call here so we can see end of stream offsets

  termAtt.setEmpty().append(buf);
  //Setting the other attributes ultimately won't have much effect but lets be thorough
  offsetAtt.setOffset(0, offsetAtt.endOffset());
  posIncrAtt.setPositionIncrement(1);
  posLenAtt.setPositionLength(1);//or do we add up the positions? Probably not used any way.
  typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle"
  return true;
}
Example #12
Source File: CommonAnalysisPlugin.java From crate with Apache License 2.0 | 5 votes |
/**
 * Registers the pre-configured (no-settings) token filters this plugin provides.
 * The boolean passed to {@code singleton} indicates whether the filter may be
 * used in multi-term (e.g. wildcard/prefix) analysis.
 */
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton(
        "common_grams", false, input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    // Two names for the same delimited-payload filter (old and new spelling).
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton(
        "edge_ngram",
        false,
        input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    filters.add(PreConfiguredTokenFilter.singleton(
        "elision", true, input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))
    );
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    // Length bounds of [0, MAX_VALUE] pass everything through.
    filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
            new LengthFilter(input, 0, Integer.MAX_VALUE))); // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton(
        "limit",
        false,
        input -> new LimitTokenCountFilter(
            input,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)
        )
    );
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /*
         * We disable the graph analysis on this token stream
         * because it produces shingles of different size.
         * Graph analysis on such token stream is useless and dangerous as it may create too many paths
         * since shingles of different size are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
            new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                    | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                    | WordDelimiterFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
            new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                    | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                    | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                    | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                    | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
Example #13
Source File: ShingleTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 4 votes |
/**
 * Creates a factory with the ShingleFilter defaults: default min/max shingle
 * sizes, unigram output enabled, no unigrams-if-no-shingles, and the default
 * token separator and filler token.
 */
public Factory(String name) {
    this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
Example #14
Source File: LenientImapSearchAnalyzer.java From james-project with Apache License 2.0 | 4 votes |
/**
 * Builds the lenient search stream: whitespace tokenization, upper-casing,
 * then shingles from 2 tokens up to maxTokenLength tokens.
 */
@Override
public TokenStream tokenStream(String arg0, Reader reader) {
    Tokenizer tokenizer = new WhitespaceTokenizer(Version.LUCENE_31, reader);
    TokenStream upperCased = new UpperCaseFilter(tokenizer);
    return new ShingleFilter(upperCased, 2, maxTokenLength);
}
Example #15
Source File: ShingleTokenFilterFactory.java From crate with Apache License 2.0 | 4 votes |
/**
 * Creates a factory with the ShingleFilter defaults: default min/max shingle
 * sizes, unigram output enabled, no unigrams-if-no-shingles, and the default
 * token separator and filler token.
 */
public Factory(String name) {
    this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}