org.apache.lucene.analysis.util.CharFilterFactory Java Examples
The following examples show how to use
org.apache.lucene.analysis.util.CharFilterFactory.
Each example is taken from an open-source project; the source file, project, and license are noted above its code.
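Before the examples, here is a minimal sketch of the SPI-based pattern most of them build on: a CharFilterFactory is resolved by name and then used to wrap a Reader. It assumes the "htmlstrip" char filter is on the classpath and that an empty, modifiable parameter map is acceptable for it; the class name is illustrative only.

import java.io.Reader;
import java.io.StringReader;
import java.util.HashMap;
import org.apache.lucene.analysis.util.CharFilterFactory;

public class CharFilterFactoryLookupSketch {
  public static void main(String[] args) throws Exception {
    // List the SPI names of all char filters visible on the classpath.
    System.out.println(CharFilterFactory.availableCharFilters());

    // Resolve a factory by its SPI name, then wrap a Reader with the resulting CharFilter.
    CharFilterFactory factory = CharFilterFactory.forName("htmlstrip", new HashMap<>());
    Reader wrapped = factory.create(new StringReader("<p>foo bar</p>"));
    // 'wrapped' now yields the input with the HTML markup stripped.
  }
}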
Example #1
Source File: CustomAnalyzerStrField.java From lucene-solr with Apache License 2.0
public CustomAnalyzerStrField() {
  Random r = LuceneTestCase.random();

  // two arg constructor
  Analyzer a2 = new TokenizerChain(
      new KeywordTokenizerFactory(new HashMap<>()),
      r.nextBoolean() ? null : new TokenFilterFactory[0]);

  // three arg constructor
  Analyzer a3 = new TokenizerChain(
      r.nextBoolean() ? null : new CharFilterFactory[0],
      new KeywordTokenizerFactory(new HashMap<>()),
      r.nextBoolean() ? null : new TokenFilterFactory[0]);

  if (r.nextBoolean()) {
    indexAnalyzer = a2;
    queryAnalyzer = a3;
  } else {
    queryAnalyzer = a2;
    indexAnalyzer = a3;
  }
}
Example #2
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0
@Override
public void addExternalJars(List<String> jarFiles) {
  List<URL> urls = new ArrayList<>();

  for (String jarFile : jarFiles) {
    Path path = FileSystems.getDefault().getPath(jarFile);
    if (!Files.exists(path) || !jarFile.endsWith(".jar")) {
      throw new LukeException(String.format(Locale.ENGLISH, "Invalid jar file path: %s", jarFile));
    }

    try {
      URL url = path.toUri().toURL();
      urls.add(url);
    } catch (IOException e) {
      throw new LukeException(e.getMessage(), e);
    }
  }

  // reload available tokenizers, charfilters, and tokenfilters
  URLClassLoader classLoader = new URLClassLoader(
      urls.toArray(new URL[0]), this.getClass().getClassLoader());
  CharFilterFactory.reloadCharFilters(classLoader);
  TokenizerFactory.reloadTokenizers(classLoader);
  TokenFilterFactory.reloadTokenFilters(classLoader);
}
Example #3
Source File: AlfrescoFieldType.java From SearchServices with GNU Lesser General Public License v3.0
public void add(Object current) {
  if (!(current instanceof MultiTermAwareComponent)) {
    return;
  }
  AbstractAnalysisFactory newComponent = ((MultiTermAwareComponent) current).getMultiTermComponent();
  if (newComponent instanceof TokenFilterFactory) {
    if (filters == null) {
      filters = new ArrayList<TokenFilterFactory>(2);
    }
    filters.add((TokenFilterFactory) newComponent);
  } else if (newComponent instanceof TokenizerFactory) {
    tokenizer = (TokenizerFactory) newComponent;
  } else if (newComponent instanceof CharFilterFactory) {
    if (charFilters == null) {
      charFilters = new ArrayList<CharFilterFactory>(1);
    }
    charFilters.add((CharFilterFactory) newComponent);
  } else {
    throw new SolrException(SolrException.ErrorCode.SERVER_ERROR,
        "Unknown analysis component from MultiTermAwareComponent: " + newComponent);
  }
}
Example #4
Source File: PluginsService.java From crate with Apache License 2.0
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
  // do NOT change the order of these method calls!
  // Codecs:
  PostingsFormat.reloadPostingsFormats(loader);
  DocValuesFormat.reloadDocValuesFormats(loader);
  Codec.reloadCodecs(loader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(loader);
  TokenFilterFactory.reloadTokenFilters(loader);
  TokenizerFactory.reloadTokenizers(loader);
}
Example #5
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
  if (charFilters != null && charFilters.length > 0) {
    for (CharFilterFactory charFilter : charFilters) {
      reader = charFilter.normalize(reader);
    }
  }
  return reader;
}
Example #6
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
@Override
public Reader initReader(String fieldName, Reader reader) {
  if (charFilters != null && charFilters.length > 0) {
    Reader cs = reader;
    for (CharFilterFactory charFilter : charFilters) {
      cs = charFilter.create(cs);
    }
    reader = cs;
  }
  return reader;
}
Example #7
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
/**
 * Creates a new TokenizerChain.
 *
 * @param charFilters Factories for the CharFilters to use, if any - if null, will be treated as if empty.
 * @param tokenizer Factory for the Tokenizer to use, must not be null.
 * @param filters Factories for the TokenFilters to use, if any - if null, will be treated as if empty.
 */
public TokenizerChain(CharFilterFactory[] charFilters, TokenizerFactory tokenizer, TokenFilterFactory[] filters) {
  charFilters = null == charFilters ? EMPTY_CHAR_FITLERS : charFilters;
  filters = null == filters ? EMPTY_TOKEN_FITLERS : filters;
  if (null == tokenizer) {
    throw new NullPointerException("TokenizerFactory must not be null");
  }
  this.charFilters = charFilters;
  this.tokenizer = tokenizer;
  this.filters = filters;
}
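Examples #1 and #7 together suggest how a chain can be assembled by hand. The sketch below illustrates that pattern; it assumes the Solr TokenizerChain class and the stock Lucene factories (HTMLStripCharFilterFactory, KeywordTokenizerFactory, LowerCaseFilterFactory) live in their usual packages, and that an empty, modifiable parameter map is acceptable for each factory. The class name is illustrative only.

import java.util.HashMap;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.charfilter.HTMLStripCharFilterFactory;
import org.apache.lucene.analysis.core.KeywordTokenizerFactory;
import org.apache.lucene.analysis.core.LowerCaseFilterFactory;
import org.apache.lucene.analysis.util.CharFilterFactory;
import org.apache.lucene.analysis.util.TokenFilterFactory;
import org.apache.solr.analysis.TokenizerChain;

public class TokenizerChainSketch {
  public static void main(String[] args) {
    // Strip HTML before tokenizing, then lowercase the resulting tokens.
    CharFilterFactory[] charFilters = { new HTMLStripCharFilterFactory(new HashMap<>()) };
    KeywordTokenizerFactory tokenizer = new KeywordTokenizerFactory(new HashMap<>());
    TokenFilterFactory[] filters = { new LowerCaseFilterFactory(new HashMap<>()) };

    // Passing null for char filters or token filters would be treated as empty (see Example #7).
    Analyzer analyzer = new TokenizerChain(charFilters, tokenizer, filters);
    analyzer.close();
  }
}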
Example #8
Source File: TestFactories.java From lucene-solr with Apache License 2.0
private void doTestCharFilter(String charfilter) throws IOException {
  Class<? extends CharFilterFactory> factoryClazz = CharFilterFactory.lookupClass(charfilter);
  CharFilterFactory factory = (CharFilterFactory) initialize(factoryClazz);
  if (factory != null) {
    // we managed to fully create an instance. check a few more things:
    if (!EXCLUDE_FACTORIES_RANDOM_DATA.contains(factory.getClass())) {
      // beast it just a little, it shouldnt throw exceptions:
      // (it should have thrown them in initialize)
      Analyzer a = new FactoryAnalyzer(assertingTokenizer, null, factory);
      checkRandomData(random(), a, 3, 20, false, false);
      a.close();
    }
  }
}
Example #9
Source File: TestFactories.java From lucene-solr with Apache License 2.0
public void test() throws IOException {
  for (String tokenizer : TokenizerFactory.availableTokenizers()) {
    doTestTokenizer(tokenizer);
  }

  for (String tokenFilter : TokenFilterFactory.availableTokenFilters()) {
    doTestTokenFilter(tokenFilter);
  }

  for (String charFilter : CharFilterFactory.availableCharFilters()) {
    doTestCharFilter(charFilter);
  }
}
Example #10
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testHtmlStripClassicFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withDefaultMatchVersion(LUCENE_8_0_0)
      .addCharFilter("htmlstrip")
      .withTokenizer("classic")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .withPositionIncrementGap(100)
      .withOffsetGap(1000)
      .build();
  assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
  List<CharFilterFactory> charFilters = a.getCharFilterFactories();
  assertEquals(1, charFilters.size());
  assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(100, a.getPositionIncrementGap("dummy"));
  assertEquals(1000, a.getOffsetGap("dummy"));
  assertSame(LUCENE_8_0_0, a.getVersion());

  assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "<p><b>föó</b> bär FÖÖ BAR</p>",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
Example #11
Source File: TestCustomAnalyzer.java From lucene-solr with Apache License 2.0
public void testFactoryHtmlStripClassicFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withDefaultMatchVersion(LUCENE_8_0_0)
      .addCharFilter(HTMLStripCharFilterFactory.class)
      .withTokenizer(ClassicTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .withPositionIncrementGap(100)
      .withOffsetGap(1000)
      .build();
  assertSame(ClassicTokenizerFactory.class, a.getTokenizerFactory().getClass());
  List<CharFilterFactory> charFilters = a.getCharFilterFactories();
  assertEquals(1, charFilters.size());
  assertEquals(HTMLStripCharFilterFactory.class, charFilters.get(0).getClass());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(100, a.getPositionIncrementGap("dummy"));
  assertEquals(1000, a.getOffsetGap("dummy"));
  assertSame(LUCENE_8_0_0, a.getVersion());

  assertAnalyzesTo(a, "<p>foo bar</p> FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "<p><b>föó</b> bär FÖÖ BAR</p>",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
Example #12
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
/** Builds the analyzer. */
public CustomAnalyzer build() {
  if (tokenizer.get() == null) {
    throw new IllegalStateException("You have to set at least a tokenizer.");
  }
  return new CustomAnalyzer(
      defaultMatchVersion.get(),
      charFilters.toArray(new CharFilterFactory[charFilters.size()]),
      tokenizer.get(),
      tokenFilters.toArray(new TokenFilterFactory[tokenFilters.size()]),
      posIncGap.get(),
      offsetGap.get());
}
Example #13
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
/** Adds the given char filter.
 * @param name is used to look up the factory with {@link CharFilterFactory#forName(String, Map)}.
 *  The list of possible names can be looked up with {@link CharFilterFactory#availableCharFilters()}.
 * @param params the map of parameters to be passed to factory. The map must be modifiable.
 */
public Builder addCharFilter(String name, Map<String,String> params) throws IOException {
  Objects.requireNonNull(name, "CharFilter name may not be null");
  charFilters.add(applyResourceLoader(CharFilterFactory.forName(name, applyDefaultParams(params))));
  componentsAdded = true;
  return this;
}
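A hedged usage sketch of this builder method. It assumes a "patternReplace" char filter is registered and accepts "pattern" and "replacement" parameters, and that a "whitespace" tokenizer is available; note the parameter map must be modifiable (the factory removes entries as it consumes them), so a HashMap is used. The class name is illustrative only.

import java.util.HashMap;
import java.util.Map;
import org.apache.lucene.analysis.custom.CustomAnalyzer;

public class AddCharFilterByNameSketch {
  public static void main(String[] args) throws Exception {
    // Parameters are handed to the factory through a modifiable map.
    Map<String, String> params = new HashMap<>();
    params.put("pattern", "\\s+");
    params.put("replacement", " ");

    CustomAnalyzer analyzer = CustomAnalyzer.builder()
        .addCharFilter("patternReplace", params)  // collapse runs of whitespace before tokenizing
        .withTokenizer("whitespace")
        .build();
    analyzer.close();
  }
}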
Example #14
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
/** Adds the given char filter.
 * @param factory class that is used to create the char filter.
 * @param params the map of parameters to be passed to factory. The map must be modifiable.
 */
public Builder addCharFilter(Class<? extends CharFilterFactory> factory, Map<String,String> params) throws IOException {
  Objects.requireNonNull(factory, "CharFilter name may not be null");
  charFilters.add(applyResourceLoader(newFactoryClassInstance(factory, applyDefaultParams(params))));
  componentsAdded = true;
  return this;
}
Example #15
Source File: PluginsService.java From Elasticsearch with Apache License 2.0
/**
 * Reloads all Lucene SPI implementations using the new classloader.
 * This method must be called after the new classloader has been created to
 * register the services for use.
 */
static void reloadLuceneSPI(ClassLoader loader) {
  // do NOT change the order of these method calls!
  // Codecs:
  PostingsFormat.reloadPostingsFormats(loader);
  DocValuesFormat.reloadDocValuesFormats(loader);
  Codec.reloadCodecs(loader);
  // Analysis:
  CharFilterFactory.reloadCharFilters(loader);
  TokenFilterFactory.reloadTokenFilters(loader);
  TokenizerFactory.reloadTokenizers(loader);
}
Example #16
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected Reader initReader(String fieldName, Reader reader) {
  for (final CharFilterFactory charFilter : charFilters) {
    reader = charFilter.create(reader);
  }
  return reader;
}
Example #17
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
CustomAnalyzer(Version defaultMatchVersion, CharFilterFactory[] charFilters, TokenizerFactory tokenizer,
    TokenFilterFactory[] tokenFilters, Integer posIncGap, Integer offsetGap) {
  this.charFilters = charFilters;
  this.tokenizer = tokenizer;
  this.tokenFilters = tokenFilters;
  this.posIncGap = posIncGap;
  this.offsetGap = offsetGap;
  if (defaultMatchVersion != null) {
    setVersion(defaultMatchVersion);
  }
}
Example #18
Source File: AnalyzerFactoryTask.java From lucene-solr with Apache License 2.0
/**
 * This method looks up a class with its fully qualified name (FQN), or a short-name
 * class-simplename, or with a package suffix, assuming "org.apache.lucene.analysis."
 * as the package prefix (e.g. "standard.ClassicTokenizerFactory" ->
 * "org.apache.lucene.analysis.standard.ClassicTokenizerFactory").
 *
 * If className contains a period, the class is first looked up as-is, assuming that it
 * is an FQN. If this fails, lookup is retried after prepending the Lucene analysis
 * package prefix to the class name.
 *
 * If className does not contain a period, the analysis SPI *Factory.lookupClass()
 * methods are used to find the class.
 *
 * @param className The name or the short name of the class.
 * @param expectedType The superclass className is expected to extend
 * @return the loaded class.
 * @throws ClassNotFoundException if lookup fails
 */
public <T> Class<? extends T> lookupAnalysisClass(String className, Class<T> expectedType)
    throws ClassNotFoundException {
  if (className.contains(".")) {
    try {
      // First, try className == FQN
      return Class.forName(className).asSubclass(expectedType);
    } catch (ClassNotFoundException e) {
      try {
        // Second, retry lookup after prepending the Lucene analysis package prefix
        return Class.forName(LUCENE_ANALYSIS_PACKAGE_PREFIX + className).asSubclass(expectedType);
      } catch (ClassNotFoundException e1) {
        throw new ClassNotFoundException("Can't find class '" + className
            + "' or '" + LUCENE_ANALYSIS_PACKAGE_PREFIX + className + "'");
      }
    }
  }
  // No dot - use analysis SPI lookup
  final String analysisComponentName = ANALYSIS_COMPONENT_SUFFIX_PATTERN.matcher(className).replaceFirst("");
  if (CharFilterFactory.class.isAssignableFrom(expectedType)) {
    return CharFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenizerFactory.class.isAssignableFrom(expectedType)) {
    return TokenizerFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  } else if (TokenFilterFactory.class.isAssignableFrom(expectedType)) {
    return TokenFilterFactory.lookupClass(analysisComponentName).asSubclass(expectedType);
  }
  throw new ClassNotFoundException("Can't find class '" + className + "'");
}
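The no-dot branch of lookupAnalysisClass ends in CharFilterFactory.lookupClass, the same SPI call exercised by Example #8. Below is a minimal standalone sketch of that lookup, resolving every registered char filter name back to its factory class; the class name and the main wrapper are illustrative only.

import org.apache.lucene.analysis.util.CharFilterFactory;

public class CharFilterSpiLookupSketch {
  public static void main(String[] args) {
    // Resolve each registered char filter name to its factory class,
    // mirroring the SPI branch of lookupAnalysisClass above.
    for (String name : CharFilterFactory.availableCharFilters()) {
      Class<? extends CharFilterFactory> clazz = CharFilterFactory.lookupClass(name);
      System.out.println(name + " -> " + clazz.getName());
    }
  }
}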
Example #19
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
@Override
protected Reader initReaderForNormalization(String fieldName, Reader reader) {
  for (CharFilterFactory charFilter : charFilters) {
    reader = charFilter.normalize(reader);
  }
  return reader;
}
Example #20
Source File: AnalyzerFactory.java From lucene-solr with Apache License 2.0
public AnalyzerFactory(List<CharFilterFactory> charFilterFactories,
                       TokenizerFactory tokenizerFactory,
                       List<TokenFilterFactory> tokenFilterFactories) {
  this.charFilterFactories = charFilterFactories;
  assert null != tokenizerFactory;
  this.tokenizerFactory = tokenizerFactory;
  this.tokenFilterFactories = tokenFilterFactories;
}
Example #21
Source File: AnalyzerFactory.java From lucene-solr with Apache License 2.0
@Override
public String toString() {
  StringBuilder sb = new StringBuilder("AnalyzerFactory(");
  if (null != name) {
    sb.append("name:");
    sb.append(name);
    sb.append(", ");
  }
  if (null != positionIncrementGap) {
    sb.append("positionIncrementGap:");
    sb.append(positionIncrementGap);
    sb.append(", ");
  }
  if (null != offsetGap) {
    sb.append("offsetGap:");
    sb.append(offsetGap);
    sb.append(", ");
  }
  for (CharFilterFactory charFilterFactory : charFilterFactories) {
    sb.append(charFilterFactory);
    sb.append(", ");
  }
  sb.append(tokenizerFactory);
  for (TokenFilterFactory tokenFilterFactory : tokenFilterFactories) {
    sb.append(", ");
    sb.append(tokenFilterFactory);
  }
  sb.append(')');
  return sb.toString();
}
Example #22
Source File: AnalysisImpl.java From lucene-solr with Apache License 2.0
@Override
public Collection<String> getAvailableCharFilters() {
  return CharFilterFactory.availableCharFilters().stream().sorted().collect(Collectors.toList());
}
Example #23
Source File: AlfrescoFieldType.java From SearchServices with GNU Lesser General Public License v3.0
public TokenizerChain build() {
  CharFilterFactory[] charFilterArr = charFilters == null
      ? null
      : charFilters.toArray(new CharFilterFactory[charFilters.size()]);
  TokenFilterFactory[] filterArr = filters == null
      ? new TokenFilterFactory[0]
      : filters.toArray(new TokenFilterFactory[filters.size()]);
  return new TokenizerChain(charFilterArr, tokenizer, filterArr);
}
Example #24
Source File: TokenizerChain.java From lucene-solr with Apache License 2.0
/** @return array of CharFilterFactories, may be empty but never null */
public CharFilterFactory[] getCharFilterFactories() {
  return charFilters;
}
Example #25
Source File: TestFactories.java From lucene-solr with Apache License 2.0
FactoryAnalyzer(TokenizerFactory tokenizer, TokenFilterFactory tokenfilter, CharFilterFactory charFilter) {
  assert tokenizer != null;
  this.tokenizer = tokenizer;
  this.charFilter = charFilter;
  this.tokenfilter = tokenfilter;
}
Example #26
Source File: AnalysisChainDialogFactory.java From lucene-solr with Apache License 2.0
private JPanel analysisChain() {
  JPanel panel = new JPanel(new GridBagLayout());
  panel.setOpaque(false);

  GridBagConstraints c = new GridBagConstraints();
  c.fill = GridBagConstraints.HORIZONTAL;
  c.insets = new Insets(5, 5, 5, 5);

  c.gridx = 0;
  c.gridy = 0;
  c.weightx = 0.1;
  c.weighty = 0.5;
  panel.add(new JLabel(MessageUtils.getLocalizedMessage("analysis.dialog.chain.label.charfilters")), c);

  String[] charFilters = analyzer.getCharFilterFactories().stream()
      .map(f -> CharFilterFactory.findSPIName(f.getClass()))
      .toArray(String[]::new);
  JList<String> charFilterList = new JList<>(charFilters);
  charFilterList.setVisibleRowCount(charFilters.length == 0 ? 1 : Math.min(charFilters.length, 5));
  c.gridx = 1;
  c.gridy = 0;
  c.weightx = 0.5;
  c.weighty = 0.5;
  panel.add(new JScrollPane(charFilterList), c);

  c.gridx = 0;
  c.gridy = 1;
  c.weightx = 0.1;
  c.weighty = 0.1;
  panel.add(new JLabel(MessageUtils.getLocalizedMessage("analysis.dialog.chain.label.tokenizer")), c);

  String tokenizer = TokenizerFactory.findSPIName(analyzer.getTokenizerFactory().getClass());
  JTextField tokenizerTF = new JTextField(tokenizer);
  tokenizerTF.setColumns(30);
  tokenizerTF.setEditable(false);
  tokenizerTF.setPreferredSize(new Dimension(300, 25));
  tokenizerTF.setBorder(BorderFactory.createLineBorder(Color.gray));
  c.gridx = 1;
  c.gridy = 1;
  c.weightx = 0.5;
  c.weighty = 0.1;
  panel.add(tokenizerTF, c);

  c.gridx = 0;
  c.gridy = 2;
  c.weightx = 0.1;
  c.weighty = 0.5;
  panel.add(new JLabel(MessageUtils.getLocalizedMessage("analysis.dialog.chain.label.tokenfilters")), c);

  String[] tokenFilters = analyzer.getTokenFilterFactories().stream()
      .map(f -> TokenFilterFactory.findSPIName(f.getClass()))
      .toArray(String[]::new);
  JList<String> tokenFilterList = new JList<>(tokenFilters);
  tokenFilterList.setVisibleRowCount(tokenFilters.length == 0 ? 1 : Math.min(tokenFilters.length, 5));
  tokenFilterList.setMinimumSize(new Dimension(300, 25));
  c.gridx = 1;
  c.gridy = 2;
  c.weightx = 0.5;
  c.weighty = 0.5;
  panel.add(new JScrollPane(tokenFilterList), c);

  return panel;
}
Example #27
Source File: CustomAnalyzer.java From lucene-solr with Apache License 2.0
/** Returns the list of char filters that are used in this analyzer. */
public List<CharFilterFactory> getCharFilterFactories() {
  return Collections.unmodifiableList(Arrays.asList(charFilters));
}