org.apache.lucene.analysis.shingle.ShingleFilter Java Examples

The following examples show how to use org.apache.lucene.analysis.shingle.ShingleFilter. They are drawn from open source projects; the originating project and source file are noted above each example.
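Before the project examples, here is a minimal, self-contained sketch (illustrative only, not taken from any of the projects below): with a maximum shingle size of 2 and unigram output enabled by default, ShingleFilter emits each token plus every adjacent pair.

import java.io.StringReader;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.shingle.ShingleFilter;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class ShingleDemo {
  public static void main(String[] args) throws Exception {
    WhitespaceTokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("one two three"));
    // max shingle size 2, unigrams on by default
    try (TokenStream ts = new ShingleFilter(tokenizer, 2)) {
      CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
      ts.reset();
      while (ts.incrementToken()) {
        System.out.println(term); // one, "one two", two, "two three", three
      }
      ts.end();
    }
  }
}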
Example #1
Source File: TestConditionalTokenFilter.java    From lucene-solr with Apache License 2.0
public void testReadaheadWithNoFiltering() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new ClassicTokenizer();
      TokenStream sink = new ConditionalTokenFilter(source, in -> new ShingleFilter(in, 2)) {
        @Override
        protected boolean shouldFilter() throws IOException {
          return true;
        }
      };
      return new TokenStreamComponents(source, sink);
    }
  };

  String input = "one two three four";

  try (TokenStream ts = analyzer.tokenStream("", input)) {
    assertTokenStreamContents(ts, new String[]{
        "one", "one two",
        "two", "two three",
        "three", "three four",
        "four"
    });
  }
}
 
Example #2
Source File: PhrasesIdentificationComponent.java    From lucene-solr with Apache License 2.0
/** 
 * Helper method, public for testing purposes only.
 * <p>
 * Given an analyzer, inspects it to determine if:
 * <ul>
 *  <li>it is a {@link TokenizerChain}</li>
 *  <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
 * </ul>
 * <p>
 * If these conditions are met, then this method returns the <code>maxShingleSize</code>
 * in effect for this analyzer; otherwise it returns -1.
 * </p>
 * 
 * @param analyzer the analyzer to inspect
 * @return the effective <code>maxShingleSize</code>, or -1 if not available
 * @lucene.internal
 */
public static int getMaxShingleSize(Analyzer analyzer) {
  if (!TokenizerChain.class.isInstance(analyzer)) {
    return -1;
  }
  
  final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
  if (0 == factories.length) {
    return -1;
  }
  int result = -1;
  for (TokenFilterFactory tff : factories) {
    if (ShingleFilterFactory.class.isInstance(tff)) {
      if (0 < result) {
      // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
        return -1;
      }
      // would be nice if there was an easy way to just ask a factory for the effective value
      // of an argument...
      final Map<String,String> args = tff.getOriginalArgs();
      result = args.containsKey("maxShingleSize")
        ? Integer.parseInt(args.get("maxShingleSize")) : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
    }
  }
  return result;
}
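A hedged usage sketch of the helper above, assuming Solr's two-argument TokenizerChain constructor and Lucene's map-based factory constructors (the argument map and its values are illustrative):

Map<String, String> shingleArgs = new HashMap<>();
shingleArgs.put("maxShingleSize", "4");
Analyzer analyzer = new TokenizerChain(
    new WhitespaceTokenizerFactory(new HashMap<>()),
    new TokenFilterFactory[] { new ShingleFilterFactory(shingleArgs) });
// reads the configured value back out of the chain
int maxShingleSize = PhrasesIdentificationComponent.getMaxShingleSize(analyzer); // 4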
 
Example #3
Source File: ShingleTokenFilterFactory.java    From crate with Apache License 2.0
public ShingleTokenFilterFactory(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);
    int maxAllowedShingleDiff = indexSettings.getMaxShingleDiff();
    Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
    Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);

    int shingleDiff = maxShingleSize - minShingleSize + (outputUnigrams ? 1 : 0);
    if (shingleDiff > maxAllowedShingleDiff) {
        deprecationLogger.deprecated("Deprecated big difference between maxShingleSize and minShingleSize in Shingle TokenFilter, "
            + "expected difference must be less than or equal to: [" + maxAllowedShingleDiff + "]");
    }
    String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
}
 
Example #4
Source File: ShingleTokenFilterFactory.java    From crate with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    filter.setOutputUnigrams(outputUnigrams);
    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    filter.setTokenSeparator(tokenSeparator);
    filter.setFillerToken(fillerToken);
    if (outputUnigrams || (minShingleSize != maxShingleSize)) {
        /*
         * Disable graph analysis on this token stream because it produces
         * shingles of different sizes. Graph analysis on such a stream is
         * useless and dangerous, as it may create too many paths, since
         * shingles of different sizes are not aligned in terms of positions.
         */
        filter.addAttribute(DisableGraphAttribute.class);
    }
    return filter;
}
 
Example #5
Source File: ShingleTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
@Inject
public ShingleTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    Integer maxShingleSize = settings.getAsInt("max_shingle_size", ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE);
    Integer minShingleSize = settings.getAsInt("min_shingle_size", ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE);
    Boolean outputUnigrams = settings.getAsBoolean("output_unigrams", true);
    Boolean outputUnigramsIfNoShingles = settings.getAsBoolean("output_unigrams_if_no_shingles", false);
    String tokenSeparator = settings.get("token_separator", ShingleFilter.DEFAULT_TOKEN_SEPARATOR);
    String fillerToken = settings.get("filler_token", ShingleFilter.DEFAULT_FILLER_TOKEN);
    factory = new Factory("shingle", minShingleSize, maxShingleSize, outputUnigrams, outputUnigramsIfNoShingles, tokenSeparator, fillerToken);
}
 
Example #6
Source File: ShingleTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
@Override
public TokenStream create(TokenStream tokenStream) {
    ShingleFilter filter = new ShingleFilter(tokenStream, minShingleSize, maxShingleSize);
    filter.setOutputUnigrams(outputUnigrams);
    filter.setOutputUnigramsIfNoShingles(outputUnigramsIfNoShingles);
    filter.setTokenSeparator(tokenSeparator);
    filter.setFillerToken(fillerToken);
    return filter;
}
 
Example #7
Source File: TestConditionalTokenFilter.java    From lucene-solr with Apache License 2.0
public void testReadaheadWithFiltering() throws IOException {

    CharArraySet protectedTerms = new CharArraySet(2, true);
    protectedTerms.add("three");

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new ClassicTokenizer();
        TokenStream sink = new ProtectedTermFilter(protectedTerms, source, in -> new ShingleFilter(in, 2));
        sink = new ValidatingTokenFilter(sink, "1");
        return new TokenStreamComponents(source, sink);
      }
    };

    String input = "one two three four";

    try (TokenStream ts = analyzer.tokenStream("", input)) {
      assertTokenStreamContents(ts, new String[]{
          "one", "one two", "two", "three", "four"
      }, new int[]{
           0,     0,         4,     8,       14
      }, new int[]{
           3,     7,         7,     13,      18
      }, new int[]{
           1,     0,         1,     1,       1
      }, new int[]{
           1,     2,         1,     1,       1
      }, 18);
    }
  }
 
Example #8
Source File: TestConditionalTokenFilter.java    From lucene-solr with Apache License 2.0
public void testFilteringWithReadahead() throws IOException {

    CharArraySet protectedTerms = new CharArraySet(2, true);
    protectedTerms.add("two");
    protectedTerms.add("two three");

    Analyzer analyzer = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        Tokenizer source = new StandardTokenizer();
        TokenStream sink = new ShingleFilter(source, 3);
        sink = new ProtectedTermFilter(protectedTerms, sink, in -> new TypeTokenFilter(in, Collections.singleton("ALL"), true));
        return new TokenStreamComponents(source, sink);
      }
    };

    String input = "one two three four";

    try (TokenStream ts = analyzer.tokenStream("", input)) {
      assertTokenStreamContents(ts, new String[]{
          "two", "two three"
      }, new int[]{
           4,     4
      }, new int[]{
           7,     13
      }, new int[]{
           2,     0
      }, new int[]{
           1,     2
      }, 18);
    }

  }
 
Example #9
Source File: TreatmentCurator.java    From hmftools with GNU General Public License v3.0
@NotNull
private static Analyzer createShingleAnalyzer(int maxShingles) {
    return new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(@NotNull String field) {
            Tokenizer source = new WhitespaceTokenizer();
            source.setReader(new StringReader(field));
            ShingleFilter shingleFilter = new ShingleFilter(defaultTokenFilter(source), maxShingles);
            shingleFilter.setOutputUnigrams(true);
            return new TokenStreamComponents(source, shingleFilter);
        }
    };
}
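A hedged usage sketch for the analyzer above (the field name and input text here are made up for illustration): collect every emitted shingle into a list.

List<String> shingles = new ArrayList<>();
try (TokenStream ts = createShingleAnalyzer(3).tokenStream("treatment", "dabrafenib plus trametinib")) {
  CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
  ts.reset();
  while (ts.incrementToken()) {
    shingles.add(term.toString());
  }
  ts.end();
}
// With unigrams enabled, shingles contains the three single words plus
// the two- and three-word combinations.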
 
Example #10
Source File: ContentAnalyzer.java    From modernmt with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName, Reader reader) {
    Tokenizer tokenizer = new WhitespaceTokenizer(reader);
    TokenStream filter;
    filter = new PunctuationFilter(tokenizer);

    if (shingleSize > 0) {
        ShingleFilter shingleFilter = new ShingleFilter(filter, shingleSize, shingleSize);
        shingleFilter.setOutputUnigrams(outputUnigrams);

        filter = shingleFilter;
    }

    return new TokenStreamComponents(tokenizer, filter);
}
 
Example #11
Source File: ConcatenateFilter.java    From SolrTextTagger with Apache License 2.0
@Override
public final boolean incrementToken() throws IOException {
  if (done)
    return false;
  done = true;

  buf.setLength(0);
  boolean firstTerm = true;
  while (input.incrementToken()) {
    if (!firstTerm) {
      buf.append(separator);
    }
    //TODO consider indexing special chars when posInc > 1 (stop words). We ignore for now. #13
    buf.append(termAtt);
    firstTerm = false;
  }
  input.end();//call here so we can see end of stream offsets

  termAtt.setEmpty().append(buf);
  //Setting the other attributes ultimately won't have much effect, but let's be thorough
  offsetAtt.setOffset(0, offsetAtt.endOffset());
  posIncrAtt.setPositionIncrement(1);
  posLenAtt.setPositionLength(1);//or do we add up the positions? Probably not used anyway.
  typeAtt.setType(ShingleFilter.DEFAULT_TOKEN_TYPE);//"shingle"

  return true;
}
 
Example #12
Source File: CommonAnalysisPlugin.java    From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenFilter> getPreConfiguredTokenFilters() {
    List<PreConfiguredTokenFilter> filters = new ArrayList<>();
    filters.add(PreConfiguredTokenFilter.singleton("apostrophe", false, ApostropheFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_normalization", true, ArabicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("arabic_stem", false, ArabicStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("asciifolding", true, ASCIIFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("bengali_normalization", true, BengaliNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("brazilian_stem", false, BrazilianStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_bigram", false, CJKBigramFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("cjk_width", true, CJKWidthFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("classic", false, ClassicFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton(
        "common_grams",
        false,
        input -> new CommonGramsFilter(input, CharArraySet.EMPTY_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("czech_stem", false, CzechStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("decimal_digit", true, DecimalDigitFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload_filter", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("delimited_payload", false, input ->
            new DelimitedPayloadTokenFilter(input,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_DELIMITER,
                    DelimitedPayloadTokenFilterFactory.DEFAULT_ENCODER)));
    filters.add(PreConfiguredTokenFilter.singleton("dutch_stem", false, input -> new SnowballFilter(input, new DutchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton(
        "edge_ngram",
        false,
        input -> new EdgeNGramTokenFilter(input, EdgeNGramTokenFilterFactory.SIDE_FRONT, EdgeNGramTokenFilterFactory.SIDE_BACK, EdgeNGramTokenFilter.DEFAULT_PRESERVE_ORIGINAL)));
    filters.add(PreConfiguredTokenFilter.singleton(
        "elision",
        true,
        input -> new ElisionFilter(input, FrenchAnalyzer.DEFAULT_ARTICLES))
    );
    filters.add(PreConfiguredTokenFilter.singleton("french_stem", false, input -> new SnowballFilter(input, new FrenchStemmer())));
    filters.add(PreConfiguredTokenFilter.singleton("german_normalization", true, GermanNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("german_stem", false, GermanStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("hindi_normalization", true, HindiNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("indic_normalization", true, IndicNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("keyword_repeat", false, KeywordRepeatFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("kstem", false, KStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("length", false, input ->
        new LengthFilter(input, 0, Integer.MAX_VALUE)));  // TODO this one seems useless
    filters.add(PreConfiguredTokenFilter.singleton(
        "limit",
        false,
        input -> new LimitTokenCountFilter(
            input,
            LimitTokenCountFilterFactory.DEFAULT_MAX_TOKEN_COUNT,
            LimitTokenCountFilterFactory.DEFAULT_CONSUME_ALL_TOKENS)
        )
    );
    filters.add(PreConfiguredTokenFilter.singleton("ngram", false, reader -> new NGramTokenFilter(reader, 1, 2, false)));
    filters.add(PreConfiguredTokenFilter.singleton("persian_normalization", true, PersianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("porter_stem", false, PorterStemFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("reverse", false, ReverseStringFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("russian_stem", false, input -> new SnowballFilter(input, "Russian")));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_folding", true, ScandinavianFoldingFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("scandinavian_normalization", true, ScandinavianNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("shingle", false, input -> {
        TokenStream ts = new ShingleFilter(input);
        /*
         * Disable graph analysis on this token stream because it produces
         * shingles of different sizes. Graph analysis on such a stream is
         * useless and dangerous, as it may create too many paths, since
         * shingles of different sizes are not aligned in terms of positions.
         */
        ts.addAttribute(DisableGraphAttribute.class);
        return ts;
    }));
    filters.add(PreConfiguredTokenFilter.singleton("snowball", false, input -> new SnowballFilter(input, "English")));
    filters.add(PreConfiguredTokenFilter.singleton("sorani_normalization", true, SoraniNormalizationFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("stemmer", false, PorterStemFilter::new));
    // The stop filter is in lucene-core but the English stop words set is in lucene-analyzers-common
    filters.add(PreConfiguredTokenFilter.singleton("stop", false, input -> new StopFilter(input, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET)));
    filters.add(PreConfiguredTokenFilter.singleton("trim", true, TrimFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("truncate", false, input -> new TruncateTokenFilter(input, 10)));
    filters.add(PreConfiguredTokenFilter.singleton("type_as_payload", false, TypeAsPayloadTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("unique", false, UniqueTokenFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("uppercase", true, UpperCaseFilter::new));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter", false, input ->
            new WordDelimiterFilter(input,
                    WordDelimiterFilter.GENERATE_WORD_PARTS
                  | WordDelimiterFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterFilter.STEM_ENGLISH_POSSESSIVE, null)));
    filters.add(PreConfiguredTokenFilter.singleton("word_delimiter_graph", false, input ->
            new WordDelimiterGraphFilter(input,
                    WordDelimiterGraphFilter.GENERATE_WORD_PARTS
                  | WordDelimiterGraphFilter.GENERATE_NUMBER_PARTS
                  | WordDelimiterGraphFilter.SPLIT_ON_CASE_CHANGE
                  | WordDelimiterGraphFilter.SPLIT_ON_NUMERICS
                  | WordDelimiterGraphFilter.STEM_ENGLISH_POSSESSIVE, null)));
    return filters;
}
 
Example #13
Source File: ShingleTokenFilterFactory.java    From Elasticsearch with Apache License 2.0
public Factory(String name) {
    this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}
 
Example #14
Source File: LenientImapSearchAnalyzer.java    From james-project with Apache License 2.0
@Override
public TokenStream tokenStream(String arg0, Reader reader) {
    return new ShingleFilter(new UpperCaseFilter(new WhitespaceTokenizer(Version.LUCENE_31, reader)), 2, maxTokenLength);
}
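This example targets the pre-4.0 Analyzer API (Version.LUCENE_31 and the tokenStream(String, Reader) override). A hedged sketch of the same chain against the current Analyzer API, keeping the maxTokenLength field from the original:

// Sketch only: the modern equivalent overrides createComponents instead.
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer source = new WhitespaceTokenizer();
    TokenStream sink = new UpperCaseFilter(source);
    sink = new ShingleFilter(sink, 2, maxTokenLength);
    return new TokenStreamComponents(source, sink);
}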
 
Example #15
Source File: ShingleTokenFilterFactory.java    From crate with Apache License 2.0
public Factory(String name) {
    this(name, ShingleFilter.DEFAULT_MIN_SHINGLE_SIZE, ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE, true, false, ShingleFilter.DEFAULT_TOKEN_SEPARATOR, ShingleFilter.DEFAULT_FILLER_TOKEN);
}