org.apache.lucene.analysis.util.TokenFilterFactory Java Examples
The following examples show how to use
org.apache.lucene.analysis.util.TokenFilterFactory.
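Before the examples, here is a minimal, self-contained sketch of the basic pattern most of them build on: look a factory up by its SPI name, instantiate it with an argument map, and wrap a TokenStream with the filter it creates. It assumes a Lucene 7.x/8.x classpath (lucene-core plus lucene-analyzers-common), where this class still lives in the org.apache.lucene.analysis.util package and the SPI name "lowercase" resolves to LowerCaseFilterFactory; the class name is illustrative.

import java.io.StringReader;
import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.util.TokenFilterFactory;

public class TokenFilterFactoryBasics {
  public static void main(String[] args) throws Exception {
    // Look up the factory registered under the SPI name "lowercase".
    // The factory consumes entries from the args map; unknown leftovers are an error.
    Map<String, String> factoryArgs = new HashMap<>();
    TokenFilterFactory factory = TokenFilterFactory.forName("lowercase", factoryArgs);

    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Foo BAR baz"));

    // Wrap the tokenizer's output with the filter the factory produces.
    try (TokenStream stream = factory.create(tokenizer)) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        System.out.println(term); // prints: foo, bar, baz
      }
      stream.end();
    }
  }
}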
Example #1
Source File: TestSynonymFilterFactory.java From lucene-solr with Apache License 2.0
/** Test that analyzer and tokenizerFactory cannot both be specified */
public void testAnalyzer() throws Exception {
  final String analyzer = CJKAnalyzer.class.getName();
  final String tokenizerFactory = PatternTokenizerFactory.class.getName();
  TokenFilterFactory factory = null;

  factory = tokenFilterFactory("Synonym",
      "synonyms", "synonyms2.txt",
      "analyzer", analyzer);
  assertNotNull(factory);

  IllegalArgumentException expected = expectThrows(IllegalArgumentException.class, () -> {
    tokenFilterFactory("Synonym",
        "synonyms", "synonyms.txt",
        "analyzer", analyzer,
        "tokenizerFactory", tokenizerFactory);
  });
  assertTrue(expected.getMessage().contains("Analyzer and TokenizerFactory can't be specified both"));
}
Example #2
Source File: ResourceLoaderTest.java From lucene-solr with Apache License 2.0
public void testCacheWrongType() throws Exception {
  clearCache();

  SolrResourceLoader loader = new SolrResourceLoader();
  @SuppressWarnings({"rawtypes"})
  Class[] params = { Map.class };
  Map<String,String> args = Map.of("minGramSize", "1", "maxGramSize", "2");

  final String className = "solr.NGramTokenizerFactory";

  // We could fail here since the class name and expected type don't match, but
  // instead we try to infer what the user actually meant
  TokenFilterFactory tff = loader.newInstance(className, TokenFilterFactory.class,
      new String[0], params, new Object[]{new HashMap<>(args)});
  assertNotNull("Did not load TokenFilter when asking for corresponding Tokenizer", tff);

  // This should work, but won't if the earlier call succeeded in corrupting the cache
  TokenizerFactory tf = loader.newInstance(className, TokenizerFactory.class,
      new String[0], params, new Object[]{new HashMap<>(args)});
  assertNotNull("Did not load Tokenizer after bad call earlier", tf);
  loader.close();
}
Example #3
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testWhitespaceFactoryWithFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .build();

  assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), a.getCharFilterFactories());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(0, a.getPositionIncrementGap("dummy"));
  assertEquals(1, a.getOffsetGap("dummy"));
  assertSame(Version.LATEST, a.getVersion());

  assertAnalyzesTo(a, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
Example #4
Source File: CustomAnalyzerStrField.java From lucene-solr with Apache License 2.0
public CustomAnalyzerStrField() {
  Random r = LuceneTestCase.random();

  // two arg constructor
  Analyzer a2 = new TokenizerChain(
      new KeywordTokenizerFactory(new HashMap<>()),
      r.nextBoolean() ? null : new TokenFilterFactory[0]);

  // three arg constructor
  Analyzer a3 = new TokenizerChain(
      r.nextBoolean() ? null : new CharFilterFactory[0],
      new KeywordTokenizerFactory(new HashMap<>()),
      r.nextBoolean() ? null : new TokenFilterFactory[0]);

  if (r.nextBoolean()) {
    indexAnalyzer = a2;
    queryAnalyzer = a3;
  } else {
    queryAnalyzer = a2;
    indexAnalyzer = a3;
  }
}
Example #5
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testWhitespaceWithFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .build();

  assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), a.getCharFilterFactories());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(0, a.getPositionIncrementGap("dummy"));
  assertEquals(1, a.getOffsetGap("dummy"));
  assertSame(Version.LATEST, a.getVersion());

  assertAnalyzesTo(a, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
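As a quick aside, an analyzer assembled this way can also be run directly, outside the test harness. The following is a hedged sketch (same Lucene 8.x classpath assumption as above; the class name is illustrative) that prints what the whitespace → asciifolding → lowercase chain emits:

import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.custom.CustomAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class CustomAnalyzerDemo {
  public static void main(String[] args) throws Exception {
    CustomAnalyzer analyzer = CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("asciifolding", "preserveOriginal", "true")
        .addTokenFilter("lowercase")
        .build();

    try (TokenStream stream = analyzer.tokenStream("f", "föó bär")) {
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posInc = stream.addAttribute(PositionIncrementAttribute.class);
      stream.reset();
      while (stream.incrementToken()) {
        // preserveOriginal emits the folded form and the original at the same position,
        // so the original arrives with a position increment of 0.
        System.out.println(term + " (posInc=" + posInc.getPositionIncrement() + ")");
      }
      stream.end();
    }
    analyzer.close();
  }
}

This prints "foo (posInc=1)", "föó (posInc=0)", "bar (posInc=1)", "bär (posInc=0)", matching the increments asserted in the test above.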
Example #6
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0
@Override
public void addExternalJars(List<String> jarFiles) {
  List<URL> urls = new ArrayList<>();

  for (String jarFile : jarFiles) {
    Path path = FileSystems.getDefault().getPath(jarFile);
    if (!Files.exists(path) || !jarFile.endsWith(".jar")) {
      throw new LukeException(String.format(Locale.ENGLISH, "Invalid jar file path: %s", jarFile));
    }
    try {
      URL url = path.toUri().toURL();
      urls.add(url);
    } catch (IOException e) {
      throw new LukeException(e.getMessage(), e);
    }
  }

  // reload available tokenizers, charfilters, and tokenfilters
  URLClassLoader classLoader = new URLClassLoader(
      urls.toArray(new URL[0]), this.getClass().getClassLoader());
  CharFilterFactory.reloadCharFilters(classLoader);
  TokenizerFactory.reloadTokenizers(classLoader);
  TokenFilterFactory.reloadTokenFilters(classLoader);
}
Example #7
Source File: PayloadUtils.java From lucene-solr with Apache License 2.0
public static String getPayloadEncoder(FieldType fieldType) {
  // TODO: support custom payload encoding fields too somehow - maybe someone has a
  // custom component that encodes payloads as floats
  String encoder = null;
  Analyzer a = fieldType.getIndexAnalyzer();
  if (a instanceof TokenizerChain) {
    // examine the indexing analysis chain for DelimitedPayloadTokenFilterFactory
    // or NumericPayloadTokenFilterFactory
    TokenizerChain tc = (TokenizerChain) a;
    TokenFilterFactory[] factories = tc.getTokenFilterFactories();
    for (TokenFilterFactory factory : factories) {
      if (factory instanceof DelimitedPayloadTokenFilterFactory) {
        encoder = factory.getOriginalArgs().get(DelimitedPayloadTokenFilterFactory.ENCODER_ATTR);
        break;
      }
      if (factory instanceof NumericPayloadTokenFilterFactory) {
        // encodes using `PayloadHelper.encodeFloat(payload)`
        encoder = "float";
        break;
      }
    }
  }
  return encoder;
}
Example #8
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testStopWordsFromClasspath() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter("stop",
          "ignoreCase", "true",
          "words", "org/apache/lucene/analysis/custom/teststop.txt",
          "format", "wordset")
      .build();

  assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), a.getCharFilterFactories());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(1, tokenFilters.size());
  assertSame(StopFilterFactory.class, tokenFilters.get(0).getClass());
  assertEquals(0, a.getPositionIncrementGap("dummy"));
  assertEquals(1, a.getOffsetGap("dummy"));
  assertSame(Version.LATEST, a.getVersion());

  assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  a.close();
}
Example #9
Source File: TestAsciiFoldingFilterFactory.java From lucene-solr with Apache License 2.0
public void testMultiTermAnalysis() throws IOException {
  TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
  TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });

  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.normalize(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });

  factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete", "Été" });

  // note: normalize() folds without preserving the original token,
  // even when preserveOriginal=true - normalization must yield a single form
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.normalize(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });
}
Example #10
Source File: PhrasesIdentificationComponent.java From lucene-solr with Apache License 2.0
/**
 * Helper method, public for testing purposes only.
 * <p>
 * Given an analyzer, inspects it to determine if:
 * <ul>
 * <li>it is a {@link TokenizerChain}</li>
 * <li>it contains exactly one instance of {@link ShingleFilterFactory}</li>
 * </ul>
 * <p>
 * If these conditions are met, then this method returns the <code>maxShingleSize</code>
 * in effect for this analyzer, otherwise returns -1.
 * </p>
 *
 * @param analyzer The analyzer to inspect
 * @return <code>maxShingleSize</code> if available
 * @lucene.internal
 */
public static int getMaxShingleSize(Analyzer analyzer) {
  if (!TokenizerChain.class.isInstance(analyzer)) {
    return -1;
  }

  final TokenFilterFactory[] factories = ((TokenizerChain) analyzer).getTokenFilterFactories();
  if (0 == factories.length) {
    return -1;
  }

  int result = -1;
  for (TokenFilterFactory tff : factories) {
    if (ShingleFilterFactory.class.isInstance(tff)) {
      if (0 < result) {
        // more than one shingle factory in our analyzer, which is weird, so make no assumptions...
        return -1;
      }
      // would be nice if there was an easy way to just ask a factory for the effective value
      // of an argument...
      final Map<String,String> args = tff.getOriginalArgs();
      result = args.containsKey("maxShingleSize")
          ? Integer.parseInt(args.get("maxShingleSize"))
          : ShingleFilter.DEFAULT_MAX_SHINGLE_SIZE;
    }
  }
  return result;
}
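To make the contract concrete, a caller could exercise this helper along the following lines. This is an illustrative sketch, not code from the Solr code base; it assumes Solr's org.apache.solr.analysis.TokenizerChain and Lucene's ShingleFilterFactory are on the classpath.

import java.util.HashMap;
import java.util.Map;

import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
import org.apache.lucene.analysis.shingle.ShingleFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;
import org.apache.solr.handler.component.PhrasesIdentificationComponent;

public class MaxShingleSizeDemo {
  public static void main(String[] args) {
    // Factories consume their args map, so pass a fresh mutable copy.
    Map<String, String> shingleArgs = new HashMap<>();
    shingleArgs.put("maxShingleSize", "3");

    Analyzer chain = new TokenizerChain(
        new WhitespaceTokenizerFactory(new HashMap<>()),
        new TokenFilterFactory[] { new ShingleFilterFactory(shingleArgs) });

    // Reports 3; a plain (non-TokenizerChain) analyzer would report -1.
    System.out.println(PhrasesIdentificationComponent.getMaxShingleSize(chain));
  }
}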
Example #11
Source File: SolrQueryParserBase.java From lucene-solr with Apache License 2.0
protected ReversedWildcardFilterFactory getReversedWildcardFilterFactory(FieldType fieldType) {
  if (leadingWildcards == null) leadingWildcards = new HashMap<>();
  ReversedWildcardFilterFactory fac = leadingWildcards.get(fieldType);
  if (fac != null || leadingWildcards.containsKey(fieldType)) {
    return fac;
  }

  Analyzer a = fieldType.getIndexAnalyzer();
  if (a instanceof TokenizerChain) {
    // examine the indexing analysis chain to see if it supports leading wildcards
    TokenizerChain tc = (TokenizerChain) a;
    TokenFilterFactory[] factories = tc.getTokenFilterFactories();
    for (TokenFilterFactory factory : factories) {
      if (factory instanceof ReversedWildcardFilterFactory) {
        fac = (ReversedWildcardFilterFactory) factory;
        break;
      }
    }
  }

  leadingWildcards.put(fieldType, fac);
  return fac;
}
Example #12
Source File: TaggerRequestHandler.java From SolrTextTagger with Apache License 2.0
private boolean fieldHasIndexedStopFilter(String field, SolrQueryRequest req) {
  FieldType fieldType = req.getSchema().getFieldType(field);
  Analyzer analyzer = fieldType.getIndexAnalyzer(); // index analyzer
  if (analyzer instanceof TokenizerChain) {
    TokenizerChain tokenizerChain = (TokenizerChain) analyzer;
    TokenFilterFactory[] tokenFilterFactories = tokenizerChain.getTokenFilterFactories();
    for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
      if (tokenFilterFactory instanceof StopFilterFactory)
        return true;
    }
  }
  return false;
}
Example #13
Source File: TestFactories.java From lucene-solr with Apache License 2.0
public void test() throws IOException {
  for (String tokenizer : TokenizerFactory.availableTokenizers()) {
    doTestTokenizer(tokenizer);
  }

  for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
    doTestTokenFilter(tokenFilter);
  }

  for (String charFilter : CharFilterFactory.availableCharFilters()) {
    doTestCharFilter(charFilter);
  }
}
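The same enumeration methods are useful outside tests, for example in a scratch main() to see which SPI names are registered on the current classpath. A small sketch under the same Lucene 8.x assumption (the class name is illustrative):

import org.apache.lucene.analysis.util.TokenFilterFactory;

public class ListTokenFilters {
  public static void main(String[] args) {
    // Print every registered SPI name along with the factory class it resolves to.
    for (String name : TokenFilterFactory.availableTokenFilters()) {
      System.out.println(name + " -> " + TokenFilterFactory.lookupClass(name).getName());
    }
  }
}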
Example #14
Source File: ResourceLoaderTest.java From lucene-solr with Apache License 2.0
@SuppressWarnings({"rawtypes", "deprecation"}) public void testLoadDeprecatedFactory() throws Exception { SolrResourceLoader loader = new SolrResourceLoader(Paths.get("solr/collection1").toAbsolutePath()); // ensure we get our exception loader.newInstance(DeprecatedTokenFilterFactory.class.getName(), TokenFilterFactory.class, null, new Class[] { Map.class }, new Object[] { new HashMap<String,String>() }); // TODO: How to check that a warning was printed to log file? loader.close(); }
Example #15
Source File: TokenizerChainTest.java From lucene-solr with Apache License 2.0
@Test
@SuppressWarnings({"unchecked"})
public void testNormalization() throws Exception {
  String fieldName = "f";
  TokenFilterFactory[] tff = new TokenFilterFactory[2];
  tff[0] = new LowerCaseFilterFactory(Collections.EMPTY_MAP);
  tff[1] = new ASCIIFoldingFilterFactory(Collections.EMPTY_MAP);
  TokenizerChain tokenizerChain = new TokenizerChain(
      new MockTokenizerFactory(Collections.EMPTY_MAP), tff);
  assertEquals(new BytesRef("fooba"), tokenizerChain.normalize(fieldName, "FOOB\u00c4"));
  tokenizerChain.close();
}
Example #16
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
/**
 * Creates a new TokenizerChain.
 *
 * @param charFilters Factories for the CharFilters to use, if any - if null, will be treated as if empty.
 * @param tokenizer   Factory for the Tokenizer to use, must not be null.
 * @param filters     Factories for the TokenFilters to use, if any - if null, will be treated as if empty.
 */
public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
  charFilters = null == charFilters ? EMPTY_CHAR_FITLERS : charFilters;
  filters = null == filters ? EMPTY_TOKEN_FITLERS : filters;
  if (null == tokenizer) {
    throw new NullPointerException("TokenizerFactory must not be null");
  }

  this.charFilters = charFilters;
  this.tokenizer = tokenizer;
  this.filters = filters;
}
Example #17
Source File: SolrStopwordsCarrot2LexicalDataFactory.java From lucene-solr with Apache License 2.0
/**
 * Obtains stop words for a field from the associated
 * {@link StopFilterFactory}, if any.
 */
private List<CharArraySet> getSolrStopWordsForField(String fieldName) {
  // No need to synchronize here, Carrot2 ensures that instances
  // of this class are not used by multiple threads at a time.
  synchronized (solrStopWords) {
    if (!solrStopWords.containsKey(fieldName)) {
      solrStopWords.put(fieldName, new ArrayList<>());

      IndexSchema schema = core.getLatestSchema();
      final Analyzer fieldAnalyzer = schema.getFieldType(fieldName).getIndexAnalyzer();
      if (fieldAnalyzer instanceof TokenizerChain) {
        final TokenFilterFactory[] filterFactories =
            ((TokenizerChain) fieldAnalyzer).getTokenFilterFactories();
        for (TokenFilterFactory factory : filterFactories) {
          if (factory instanceof StopFilterFactory) {
            // StopFilterFactory holds the stop words in a CharArraySet
            CharArraySet stopWords = ((StopFilterFactory) factory).getStopWords();
            solrStopWords.get(fieldName).add(stopWords);
          }

          if (factory instanceof CommonGramsFilterFactory) {
            CharArraySet commonWords = ((CommonGramsFilterFactory) factory).getCommonWords();
            solrStopWords.get(fieldName).add(commonWords);
          }
        }
      }
    }
    return solrStopWords.get(fieldName);
  }
}
Example #18
Source File: TestSynonymFilterFactory.java From lucene-solr with Apache License 2.0
/** checks for synonyms of "second" in synonyms-wordnet.txt */ private void checkWordnetSynonyms(TokenFilterFactory factory) throws Exception { Reader reader = new StringReader("second"); TokenStream stream = whitespaceMockTokenizer(reader); stream = factory.create(stream); assertTrue(stream instanceof SynonymFilter); assertTokenStreamContents(stream, new String[] { "second", "2nd", "two" }, new int[] { 1, 0, 0 }); }
Example #19
Source File: TestTypeTokenFilterFactory.java From lucene-solr with Apache License 2.0
public void testCreationWithWhiteList() throws Exception {
  TokenFilterFactory factory = tokenFilterFactory("Type",
      "types", "stoptypes-1.txt, stoptypes-2.txt",
      "useWhitelist", "true");
  CannedTokenStream input = new CannedTokenStream();
  factory.create(input);
}
Example #20
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
@Override
protected TokenStreamComponents createComponents(String fieldName) {
  Tokenizer tk = tokenizer.create(attributeFactory(fieldName));
  TokenStream ts = tk;
  for (TokenFilterFactory filter : filters) {
    ts = filter.create(ts);
  }
  return new TokenStreamComponents(tk, ts);
}
Example #21
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
@Override
protected TokenStream normalize(String fieldName, TokenStream in) {
  TokenStream result = in;
  for (TokenFilterFactory filter : filters) {
    result = filter.normalize(result);
  }
  return result;
}
Example #22
Source File: PluginsService.java From crate with Apache License 2.0
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
  // do NOT change the order of these method calls!

  // Codecs:
  PostingsFormat.reloadPostingsFormats(loader);
  DocValuesFormat.reloadDocValuesFormats(loader);
  Codec.reloadCodecs(loader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(loader);
  TokenFilterFactory.reloadTokenFilters(loader);
  TokenizerFactory.reloadTokenizers(loader);
}
Example #23
Source File: AnalysisFactoryTestCase.java From crate with Apache License 2.0
public void testTokenFilters() {
  Set<String> missing = new TreeSet<String>();
  missing.addAll(org.apache.lucene.analysis.util.TokenFilterFactory.availableTokenFilters()
      .stream().map(key -> key.toLowerCase(Locale.ROOT)).collect(Collectors.toSet()));
  missing.removeAll(getTokenFilters().keySet());
  assertTrue("new tokenfilters found, please update KNOWN_TOKENFILTERS: " + missing.toString(),
      missing.isEmpty());
}
Example #24
Source File: FieldAnalysisRequestHandlerTest.java From lucene-solr with Apache License 2.0
@Test //See SOLR-8460
public void testCustomAttribute() throws Exception {
  FieldAnalysisRequest request = new FieldAnalysisRequest();
  request.addFieldType("skutype1");
  request.setFieldValue("hi, 3456-12 a Test");
  request.setShowMatch(false);

  FieldType fieldType = new TextField();
  Analyzer analyzer = new TokenizerChain(
      new TokenizerFactory(Collections.emptyMap()) {
        @Override
        public Tokenizer create(AttributeFactory factory) {
          return new CustomTokenizer(factory);
        }
      },
      new TokenFilterFactory[] {
          new TokenFilterFactory(Collections.emptyMap()) {
            @Override
            public TokenStream create(TokenStream input) {
              return new CustomTokenFilter(input);
            }
          }
      }
  );
  fieldType.setIndexAnalyzer(analyzer);

  @SuppressWarnings({"rawtypes"})
  NamedList<NamedList> result = handler.analyzeValues(request, fieldType, "fieldNameUnused");
  // just test that we see "900" in the flags attribute here
  @SuppressWarnings({"unchecked", "rawtypes"})
  List<NamedList> tokenInfoList = (List<NamedList>) result.findRecursive("index",
      CustomTokenFilter.class.getName());
  // '1' from CustomTokenFilter plus 900 from CustomFlagsAttributeImpl.
  assertEquals(901, tokenInfoList.get(0).get("org.apache.lucene.analysis.tokenattributes.FlagsAttribute#flags"));
}
Example #25
Source File: TestFactories.java From lucene-solr with Apache License 2.0
private void doTestTokenFilter(String tokenfilter) throws IOException {
  Class<? extends TokenFilterFactory> factoryClazz = TokenFilterFactory.lookupClass(tokenfilter);
  TokenFilterFactory factory = (TokenFilterFactory) initialize(factoryClazz);
  if (factory != null) {
    // we managed to fully create an instance. check a few more things:
    if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
      // beast it just a little, it shouldn't throw exceptions:
      // (it should have thrown them in initialize)
      Analyzer a = new FactoryAnalyzer(assertingTokenizer, factory, null);
      checkRandomData(random(), a, 3, 20, false, false);
      a.close();
    }
  }
}
Example #26
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testHtmlStripClassicFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withDefaultMatchVersion(LUCENE_8_0_0)
      .addCharFilter("htmlstrip")
      .withTokenizer("classic")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .withPositionIncrementGap(100)
      .withOffsetGap(1000)
      .build();

  assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
  List<CharFilterFactory> charFilters = a.getCharFilterFactories();
  assertEquals(1, charFilters.size());
  assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(100, a.getPositionIncrementGap("dummy"));
  assertEquals(1000, a.getOffsetGap("dummy"));
  assertSame(LUCENE_8_0_0, a.getVersion());

  assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "<p><b>föó</b> bär FÖÖ BAR</p>",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
Example #27
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testFactoryHtmlStripClassicFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withDefaultMatchVersion(LUCENE_8_0_0)
      .addCharFilter(HTMLStripCharFilterFactory.class)
      .withTokenizer(ClassicTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .withPositionIncrementGap(100)
      .withOffsetGap(1000)
      .build();

  assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
  List<CharFilterFactory> charFilters = a.getCharFilterFactories();
  assertEquals(1, charFilters.size());
  assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(100, a.getPositionIncrementGap("dummy"));
  assertEquals(1000, a.getOffsetGap("dummy"));
  assertSame(LUCENE_8_0_0, a.getVersion());

  assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "<p><b>föó</b> bär FÖÖ BAR</p>",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
Example #28
Source File: TestSynonymFilterFactory.java From lucene-solr with Apache License 2.0
/** Test that we can parse TokenizerFactory's arguments */
public void testTokenizerFactoryArguments() throws Exception {
  final String clazz = PatternTokenizerFactory.class.getName();
  TokenFilterFactory factory = null;

  // simple arg form
  factory = tokenFilterFactory("Synonym",
      "synonyms", "synonyms.txt",
      "tokenizerFactory", clazz,
      "pattern", "(.*)",
      "group", "0");
  assertNotNull(factory);
  // prefix
  factory = tokenFilterFactory("Synonym",
      "synonyms", "synonyms.txt",
      "tokenizerFactory", clazz,
      "tokenizerFactory.pattern", "(.*)",
      "tokenizerFactory.group", "0");
  assertNotNull(factory);

  // sanity check that sub-PatternTokenizerFactory fails w/o pattern
  expectThrows(Exception.class, () -> {
    tokenFilterFactory("Synonym",
        "synonyms", "synonyms.txt",
        "tokenizerFactory", clazz);
  });

  // sanity check that sub-PatternTokenizerFactory fails on an unexpected arg
  expectThrows(Exception.class, () -> {
    tokenFilterFactory("Synonym",
        "synonyms", "synonyms.txt",
        "tokenizerFactory", clazz,
        "tokenizerFactory.pattern", "(.*)",
        "tokenizerFactory.bogusbogusbogus", "bogus",
        "tokenizerFactory.group", "0");
  });
}
Example #29
Source File: TestSynonymFilterFactory.java From lucene-solr with Apache License 2.0
/** checks for synonyms of "GB" in synonyms.txt */ private void checkSolrSynonyms(TokenFilterFactory factory) throws Exception { Reader reader = new StringReader("GB"); TokenStream stream = whitespaceMockTokenizer(reader); stream = factory.create(stream); assertTrue(stream instanceof SynonymFilter); assertTokenStreamContents(stream, new String[] { "GB", "gib", "gigabyte", "gigabytes" }, new int[] { 1, 0, 0, 0 }); }