org.apache.lucene.analysis.TokenFilter Java Examples
The following examples show how to use
org.apache.lucene.analysis.TokenFilter.
You can vote up the examples you find useful or vote down those you don't,
and you can visit the original project or source file by following the links above each example. Related API usage can be found on the sidebar.
Example #1
Source File: ThrowingMockTokenFilterFactory.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Wraps {@code input} in an anonymous TokenFilter that throws a freshly
 * constructed instance of {@code exceptionClass} on the first produced token.
 */
@Override
public TokenStream create(TokenStream input) {
  return new TokenFilter(input) {
    @Override
    public boolean incrementToken() throws IOException {
      if (!input.incrementToken()) {
        return false;
      }
      try {
        // Deliberately fail: instantiate the configured exception via its no-arg ctor and throw it.
        throw exceptionClass.getConstructor().newInstance();
      } catch (IllegalAccessException | InstantiationException
          | InvocationTargetException | NoSuchMethodException reflectionFailure) {
        // Reflection problems (missing/inaccessible no-arg ctor) surface as runtime errors.
        throw new RuntimeException(reflectionFailure);
      }
    }
  };
}
Example #2
Source File: TestConcatenateGraphFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
@Test public void testWithStopword() throws Exception { for (boolean preservePosInc : new boolean[]{true, false}) { Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, true); String input = "a mykeyword a keyword"; //LUCENE-8344 add "a" tokenStream.setReader(new StringReader(input)); TokenFilter tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("a")); ConcatenateGraphFilter concatStream = new ConcatenateGraphFilter(tokenFilter, SEP_LABEL, preservePosInc, 10); CharsRefBuilder builder = new CharsRefBuilder(); if (preservePosInc) { builder.append(SEP_LABEL); } builder.append("mykeyword"); builder.append(SEP_LABEL); if (preservePosInc) { builder.append(SEP_LABEL); } builder.append("keyword"); // if (preservePosInc) { LUCENE-8344 uncomment // builder.append(SEP_LABEL); // } assertTokenStreamContents(concatStream, new String[]{builder.toCharsRef().toString()}); } }
Example #3
Source File: NGramTokenFilterTest.java From lucene-solr with Apache License 2.0 | 6 votes |
/** Verifies n-gram offsets when an upstream filter (ASCII folding) changes token length. */
public void testInvalidOffsets() throws Exception {
  Analyzer analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      TokenFilter chain = new ASCIIFoldingFilter(tokenizer);
      chain = new NGramTokenFilter(chain, 2, 2, false);
      return new TokenStreamComponents(tokenizer, chain);
    }
  };
  // "æ" folds to "ae", so every bigram keeps the original token's offsets [0, 11).
  assertAnalyzesTo(analyzer, "mosfellsbær",
      new String[] { "mo", "os", "sf", "fe", "el", "ll", "ls", "sb", "ba", "ae", "er" },
      new int[] { 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 },
      new int[] { 11, 11, 11, 11, 11, 11, 11, 11, 11, 11, 11 },
      new int[] { 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 });
  analyzer.close();
}
Example #4
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Test that CommonGramsFilter works correctly in case-sensitive mode
 * (the tokenizer does not lower-case, so "How"/"The"/"A"/"B" keep their casing).
 */
public void testCaseSensitive() throws Exception {
  final String input = "How The s a brown s cow d like A B thing?";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  TokenFilter cgf = new CommonGramsFilter(wt, commonWords);
  assertTokenStreamContents(cgf, new String[] {
      "How", "The", "The_s", "s", "s_a", "a", "a_brown", "brown", "brown_s",
      "s", "s_cow", "cow", "cow_d", "d", "d_like", "like", "A", "B", "thing?"});
}
Example #5
Source File: TestElision.java From lucene-solr with Apache License 2.0 | 5 votes |
/** French elision: "l'" and "M'" are stripped, while the apostrophe in "O'brian" survives. */
public void testElision() throws Exception {
  String test = "Plop, juste pour voir l'embrouille avec O'brian. M'enfin.";
  Tokenizer tokenizer = new StandardTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(test));
  CharArraySet articles = new CharArraySet(asSet("l", "M"), false);
  TokenFilter filter = new ElisionFilter(tokenizer, articles);
  List<String> tokens = filter(filter);
  assertEquals("embrouille", tokens.get(4));
  assertEquals("O'brian", tokens.get(6));
  assertEquals("enfin", tokens.get(7));
}
Example #6
Source File: SegmentationIcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/** Builds an analyzer that ICU-tokenizes and then applies NFKC case-folding normalization. */
private static Analyzer createAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new IcuTokenizer(
          AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY,
          new DefaultIcuTokenizerConfig(false, true));
      TokenFilter normalizer = new IcuNormalizerFilter(
          tokenizer,
          Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));
      return new TokenStreamComponents(tokenizer, normalizer);
    }
  };
}
Example #7
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Test CommonGramsQueryFilter in the case that the last word is a stopword:
 * only the bigram "dog_the" should be emitted.
 */
public void testLastWordisStopWord() throws Exception {
  final String input = "dog the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter queryFilter = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(queryFilter, new String[] { "dog_the" });
}
Example #8
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0 | 5 votes |
/** Analyzer that applies the default token filter and re-joins the tokens with single spaces. */
@NotNull
private static Analyzer concatenatingAnalyzer() {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      TokenFilter joined = new ConcatenatingFilter(defaultTokenFilter(source), ' ');
      return new TokenStreamComponents(source, joined);
    }
  };
}
Example #9
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0 | 5 votes |
/** Analyzer that spell-corrects the default-filtered tokens, then re-joins them with spaces. */
@NotNull
private static Analyzer spellcheckAnalyzer(@NotNull SpellChecker spellChecker) {
  return new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(@NotNull String field) {
      Tokenizer source = new WhitespaceTokenizer();
      source.setReader(new StringReader(field));
      SpellCheckerTokenFilter corrected =
          new SpellCheckerTokenFilter(defaultTokenFilter(source), spellChecker);
      TokenFilter joined = new ConcatenatingFilter(corrected, ' ');
      return new TokenStreamComponents(source, joined);
    }
  };
}
Example #10
Source File: TestDocInverterPerFieldErrorInfo.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * For the field named "distinctiveFieldName", installs a filter that always throws
 * BadNews so indexing-error reporting can be exercised; other fields get a plain tokenizer.
 */
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tokenizer = new MockTokenizer();
  if (!fieldName.equals("distinctiveFieldName")) {
    return new TokenStreamComponents(tokenizer);
  }
  TokenFilter throwing = new TokenFilter(tokenizer) {
    @Override
    public boolean incrementToken() throws IOException {
      throw new BadNews("Something is icky.");
    }
  };
  return new TokenStreamComponents(tokenizer, throwing);
}
Example #11
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Test CommonGramsQueryFilter in the case that the first word is a stopword:
 * only the bigram "the_dog" should be emitted.
 */
public void testFirstWordisStopWord() throws Exception {
  final String input = "the dog";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter queryFilter = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(queryFilter, new String[] { "the_dog" });
}
Example #12
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Test CommonGramsQueryFilter in the case of a single (stop)word query:
 * with nothing to pair it with, the stopword passes through unchanged.
 */
public void testOneWordQueryStopWord() throws Exception {
  final String input = "the";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter queryFilter = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(queryFilter, new String[] { "the" });
}
Example #13
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Test CommonGramsQueryFilter when first and last words are stopwords:
 * the two stopwords collapse into the single bigram "the_of".
 *
 * <p>Fix: the method was named {@code TestFirstAndLastStopWord} (capital T).
 * Test frameworks that discover JUnit-3-style tests by the lowercase
 * {@code test} prefix would never execute it, silently skipping the assertion.
 * Renamed to start with lowercase {@code test} so it is actually run.
 */
public void testFirstAndLastStopWord() throws Exception {
  final String input = "the of";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter nsf = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(nsf, new String[] { "the_of" });
}
Example #14
Source File: CommonGramsFilterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Test CommonGramsQueryFilter in the case of a single word query:
 * a non-stopword passes through untouched.
 */
public void testOneWordQuery() throws Exception {
  final String input = "monster";
  MockTokenizer wt = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wt.setReader(new StringReader(input));
  CommonGramsFilter cgf = new CommonGramsFilter(wt, commonWords);
  TokenFilter queryFilter = new CommonGramsQueryFilter(cgf);
  assertTokenStreamContents(queryFilter, new String[] { "monster" });
}
Example #15
Source File: TestRandomChains.java From lucene-solr with Apache License 2.0 | 4 votes |
// Reflectively scans every class in the org.apache.lucene.analysis package and collects
// the public constructors of concrete Tokenizer / TokenFilter / CharFilter subclasses
// into the tokenizers / tokenfilters / charfilters lists used by the random-chain tests.
// Skips: abstract/non-public/synthetic/anonymous/member/interface/@Deprecated classes,
// synthetic or @Deprecated or known-broken constructors, and ConditionalTokenFilter
// subclasses (tested elsewhere). Asserts each kept constructor only takes parameter
// types the test harness knows how to fabricate, then sorts each list by the
// constructor's generic signature so iteration order is deterministic across JVMs.
// NOTE(review): as scraped onto one physical line, the inline // comments would
// comment out the remainder of the statement — in the original source this body
// spans many lines; code below is kept byte-identical to the scrape.
@BeforeClass public static void beforeClass() throws Exception { List<Class<?>> analysisClasses = getClassesForPackage("org.apache.lucene.analysis"); tokenizers = new ArrayList<>(); tokenfilters = new ArrayList<>(); charfilters = new ArrayList<>(); for (final Class<?> c : analysisClasses) { final int modifiers = c.getModifiers(); if ( // don't waste time with abstract classes or deprecated known-buggy ones Modifier.isAbstract(modifiers) || !Modifier.isPublic(modifiers) || c.isSynthetic() || c.isAnonymousClass() || c.isMemberClass() || c.isInterface() || c.isAnnotationPresent(Deprecated.class) || !(Tokenizer.class.isAssignableFrom(c) || TokenFilter.class.isAssignableFrom(c) || CharFilter.class.isAssignableFrom(c)) ) { continue; } for (final Constructor<?> ctor : c.getConstructors()) { // don't test synthetic or deprecated ctors, they likely have known bugs: if (ctor.isSynthetic() || ctor.isAnnotationPresent(Deprecated.class) || brokenConstructors.get(ctor) == ALWAYS) { continue; } // conditional filters are tested elsewhere if (ConditionalTokenFilter.class.isAssignableFrom(c)) { continue; } if (Tokenizer.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", allowedTokenizerArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); tokenizers.add(castConstructor(Tokenizer.class, ctor)); } else if (TokenFilter.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", allowedTokenFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); tokenfilters.add(castConstructor(TokenFilter.class, ctor)); } else if (CharFilter.class.isAssignableFrom(c)) { assertTrue(ctor.toGenericString() + " has unsupported parameter types", allowedCharFilterArgs.containsAll(Arrays.asList(ctor.getParameterTypes()))); charfilters.add(castConstructor(CharFilter.class, ctor)); } else { fail("Cannot get here"); } } } final Comparator<Constructor<?>> ctorComp = (arg0, arg1) -> 
arg0.toGenericString().compareTo(arg1.toGenericString()); Collections.sort(tokenizers, ctorComp); Collections.sort(tokenfilters, ctorComp); Collections.sort(charfilters, ctorComp); if (VERBOSE) { System.out.println("tokenizers = " + tokenizers); System.out.println("tokenfilters = " + tokenfilters); System.out.println("charfilters = " + charfilters); } }
Example #16
Source File: PatternCaptureGroupTokenFilterFactory.java From crate with Apache License 2.0 | 4 votes |
/** Wraps the stream in a PatternCaptureGroupTokenFilter using the configured patterns. */
@Override
public TokenFilter create(TokenStream tokenStream) {
  return new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
Example #17
Source File: TestRandomChains.java From lucene-solr with Apache License 2.0 | 4 votes |
// Builds a randomized filter chain on top of the given tokenizer: up to 4 randomly
// chosen TokenFilter constructors are stacked, each preceded by a ValidatingTokenFilter
// so failures are attributed to the filter that caused them. With 50% probability a
// chosen filter (unless its class is in avoidConditionals) is wrapped in an anonymous
// ConditionalTokenFilter whose shouldFilter() flips a seeded Random — reset() reseeds
// it so the stream replays identically. Constructors flagged broken() are skipped and
// redrawn. Returns the spec with the final stream (capped by a "last stage" validator)
// and a description string accumulated by createComponent.
// NOTE(review): as scraped onto one physical line, the inline // comments would
// comment out the remainder; in the original source this body spans many lines.
// Code below is kept byte-identical to the scrape.
private TokenFilterSpec newFilterChain(Random random, Tokenizer tokenizer) { TokenFilterSpec spec = new TokenFilterSpec(); spec.stream = tokenizer; StringBuilder descr = new StringBuilder(); int numFilters = random.nextInt(5); for (int i = 0; i < numFilters; i++) { // Insert ValidatingTF after each stage so we can // catch problems right after the TF that "caused" // them: spec.stream = new ValidatingTokenFilter(spec.stream, "stage " + i); while (true) { final Constructor<? extends TokenFilter> ctor = tokenfilters.get(random.nextInt(tokenfilters.size())); if (random.nextBoolean() && avoidConditionals.contains(ctor.getDeclaringClass()) == false) { long seed = random.nextLong(); spec.stream = new ConditionalTokenFilter(spec.stream, in -> { final Object args[] = newFilterArgs(random, in, ctor.getParameterTypes()); if (broken(ctor, args)) { return in; } TokenStream ts = createComponent(ctor, args, descr, true); if (ts == null) { return in; } return ts; }) { Random random = new Random(seed); @Override public void reset() throws IOException { super.reset(); random = new Random(seed); } @Override protected boolean shouldFilter() throws IOException { return random.nextBoolean(); } }; break; } else { final Object args[] = newFilterArgs(random, spec.stream, ctor.getParameterTypes()); if (broken(ctor, args)) { continue; } final TokenFilter flt = createComponent(ctor, args, descr, false); if (flt != null) { spec.stream = flt; break; } } } } // Insert ValidatingTF after each stage so we can // catch problems right after the TF that "caused" // them: spec.stream = new ValidatingTokenFilter(spec.stream, "last stage"); spec.toString = descr.toString(); return spec; }
Example #18
Source File: TreatmentCurator.java From hmftools with GNU General Public License v3.0 | 4 votes |
/** Standard filtering for this curator: lower-case, then split on numeric and word-part boundaries. */
@NotNull
private static TokenFilter defaultTokenFilter(@NotNull Tokenizer source) {
  TokenFilter lowerCased = new LowerCaseFilter(source);
  return new WordDelimiterGraphFilter(lowerCased,
      SPLIT_ON_NUMERICS | GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS,
      null);
}
Example #19
Source File: TextFieldMapper.java From crate with Apache License 2.0 | 4 votes |
/** Appends an edge-n-gram filter (minChars..maxChars, no original preserved) to the existing chain. */
@Override
protected TokenStreamComponents wrapComponents(String fieldName, TokenStreamComponents components) {
  TokenFilter edgeNGrams =
      new EdgeNGramTokenFilter(components.getTokenStream(), minChars, maxChars, false);
  return new TokenStreamComponents(components.getSource(), edgeNGrams);
}
Example #20
Source File: PatternCaptureGroupTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 4 votes |
/** Builds the PatternCaptureGroupTokenFilter from the factory's configured patterns. */
@Override
public TokenFilter create(TokenStream tokenStream) {
  return new PatternCaptureGroupTokenFilter(tokenStream, preserveOriginal, patterns);
}
Example #21
Source File: TestBengaliNormalizer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Runs {@code input} through whitespace tokenization + Bengali normalization and expects {@code output}. */
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter normalized = new BengaliNormalizationFilter(tokenizer);
  assertTokenStreamContents(normalized, new String[] { output });
}
Example #22
Source File: TestBengaliStemmer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Runs {@code input} through whitespace tokenization + Bengali stemming and expects {@code output}. */
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter stemmed = new BengaliStemFilter(tokenizer);
  assertTokenStreamContents(stemmed, new String[] { output });
}
Example #23
Source File: TestHindiNormalizer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Runs {@code input} through whitespace tokenization + Hindi normalization and expects {@code output}. */
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter normalized = new HindiNormalizationFilter(tokenizer);
  assertTokenStreamContents(normalized, new String[] { output });
}
Example #24
Source File: TestHindiStemmer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Runs {@code input} through whitespace tokenization + Hindi stemming and expects {@code output}. */
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = whitespaceMockTokenizer(input);
  TokenFilter stemmed = new HindiStemFilter(tokenizer);
  assertTokenStreamContents(stemmed, new String[] { output });
}
Example #25
Source File: TestIndicNormalizer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Runs {@code input} through a whitespace tokenizer + IndicNormalizationFilter
 * and asserts the stream yields exactly {@code output}.
 *
 * <p>Fix: removed the stray empty statement (doubled semicolon) that followed
 * the tokenizer construction.
 */
private void check(String input, String output) throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenizer.setReader(new StringReader(input));
  TokenFilter tf = new IndicNormalizationFilter(tokenizer);
  assertTokenStreamContents(tf, new String[] { output });
}
Example #26
Source File: WordDelimiterGraphFilterFactory.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Builds a WordDelimiterGraphFilter with the factory's configured flags, falling back to
 * the default delimiter table when no custom type table was supplied.
 */
@Override
public TokenFilter create(TokenStream input) {
  if (typeTable == null) {
    return new WordDelimiterGraphFilter(input, adjustOffsets,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, protectedWords);
  }
  return new WordDelimiterGraphFilter(input, adjustOffsets, typeTable, flags, protectedWords);
}
Example #27
Source File: WordDelimiterFilterFactory.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Builds a WordDelimiterFilter with the factory's configured flags, falling back to
 * the default delimiter table when no custom type table was supplied.
 */
@Override
public TokenFilter create(TokenStream input) {
  if (typeTable == null) {
    return new WordDelimiterFilter(input,
        WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, flags, protectedWords);
  }
  return new WordDelimiterFilter(input, typeTable, flags, protectedWords);
}
Example #28
Source File: CommonGramsQueryFilterFactory.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Create a CommonGramsFilter (via the parent factory) and wrap it with a
 * CommonGramsQueryFilter.
 */
@Override
public TokenFilter create(TokenStream input) {
  return new CommonGramsQueryFilter((CommonGramsFilter) super.create(input));
}
Example #29
Source File: CommonGramsFilterFactory.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Wraps the incoming stream in a CommonGramsFilter over the configured common-words set. */
@Override
public TokenFilter create(TokenStream input) {
  return new CommonGramsFilter(input, commonWords);
}
Example #30
Source File: KStemFilterFactory.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Wraps the incoming stream with the Krovetz stemmer. */
@Override
public TokenFilter create(TokenStream input) {
  return new KStemFilter(input);
}