org.apache.lucene.analysis.core.StopAnalyzer Java Examples

Source File: StandardAnalyzerProvider.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Creates the provider and configures the {@link StandardAnalyzer} it holds.
 *
 * <p>The fallback stop-word set depends on the version the index was created with:
 * {@link CharArraySet#EMPTY_SET} on or after {@code V_1_0_0_Beta1}, otherwise
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}. That fallback is passed to
 * {@code Analysis.parseStopWords} together with the analyzer settings.
 *
 * @param index         the index this analyzer belongs to
 * @param indexSettings index-level settings, also used to read the creation version
 * @param env           environment passed to the stop-word parser
 * @param name          analyzer name
 * @param settings      analyzer settings; {@code max_token_length} is read from here
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    // Version-dependent fallback used when parsing the stop-word settings.
    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);
    final int tokenLengthLimit = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(configuredStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}

Source File: PatternAnalyzerProvider.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Creates the provider and builds the {@link PatternAnalyzer} it holds.
 *
 * <p>Reads {@code lowercase} (default {@code true}), {@code pattern} (default {@code \W+})
 * and {@code flags} from the analyzer settings. The fallback stop-word set is
 * {@link CharArraySet#EMPTY_SET} for indices created on or after {@code V_1_0_0_RC1} and
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} otherwise.
 *
 * @throws IllegalArgumentException if the resolved {@code pattern} setting is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet fallbackStopWords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final boolean lowercase = settings.getAsBoolean("lowercase", true);
    final CharArraySet stopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    final String patternSource = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    // NOTE(review): a default is supplied above, so this guard may be unreachable - confirm against Settings.get.
    if (patternSource == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }

    final Pattern compiledPattern = Regex.compile(patternSource, settings.get("flags"));
    analyzer = new PatternAnalyzer(compiledPattern, lowercase, stopWords);
}

Source File: TestTextField.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Checks {@code TextField.analyzeMultiTerm} for an input that yields no term, exactly one
 * term, and more than one term.
 */
@Test
public void testAnalyzeMultiTerm() {
  // Zero terms: the only word is a stop word, so the StopFilter removes it.
  // This is supported; the result must be null rather than an exception.
  final BytesRef noTerm =
      TextField.analyzeMultiTerm("field", "the", new StopAnalyzer(EnglishAnalyzer.ENGLISH_STOP_WORDS_SET));
  assertNull(noTerm);

  // One term: the regular case, the single whitespace-delimited term is returned as-is.
  final BytesRef singleTerm = TextField.analyzeMultiTerm("field", "Sol", new WhitespaceAnalyzer());
  assertEquals("Sol", singleTerm.utf8ToString());

  // Two terms: not allowed for the multi-term part, a BAD_REQUEST SolrException is expected.
  final SolrException thrown = expectThrows(
      SolrException.class,
      () -> TextField.analyzeMultiTerm("field", "term1 term2", new WhitespaceAnalyzer()));
  assertEquals("Unexpected error code", SolrException.ErrorCode.BAD_REQUEST.code, thrown.code());
}

Source File: StandardHtmlStripAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Creates the provider and builds the {@link StandardHtmlStripAnalyzer} it holds.
 *
 * <p>The fallback stop-word set is {@link CharArraySet#EMPTY_SET} for indices created on or
 * after {@code V_1_0_0_RC1} and {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} otherwise; it is
 * passed to {@code Analysis.parseStopWords} together with the analyzer settings.
 */
@Inject
public StandardHtmlStripAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.esVersion = Version.indexCreated(indexSettingsService.getSettings());

    final CharArraySet fallbackStopWords = esVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;
    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, fallbackStopWords);

    analyzer = new StandardHtmlStripAnalyzer(configuredStopWords);
    analyzer.setVersion(version);
}

Source File: StopAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Creates the provider and builds its {@link StopAnalyzer}.
 *
 * <p>Stop words are obtained from {@code Analysis.parseStopWords}, with
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} passed as the default set.
 */
@Inject
public StopAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final CharArraySet configuredStopWords =
            Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
    final StopAnalyzer built = new StopAnalyzer(configuredStopWords);
    built.setVersion(version);
    this.stopAnalyzer = built;
}

Source File: StopTokenFilterFactory.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public StopTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    this.ignoreCase = settings.getAsBoolean("ignore_case", false);
    this.removeTrailing = settings.getAsBoolean("remove_trailing", true);
    this.stopWords = Analysis.parseStopWords(env, settings, StopAnalyzer.ENGLISH_STOP_WORDS_SET, ignoreCase);
    if (version.onOrAfter(Version.LUCENE_4_4) && settings.get("enable_position_increments") != null) {
        throw new IllegalArgumentException("enable_position_increments is not supported anymore as of Lucene 4.4 as it can create broken token streams."
                + " Please fix your analysis chain or use an older compatibility version (<= 4.3).");
    }
    this.enablePositionIncrements = settings.getAsBoolean("enable_position_increments", true);
}

Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0

5 votes

/**
 * Resolves the default stop-word set of the analyzer class with the given name.
 *
 * <p>The class is first asked for its defaults through a static, no-argument
 * {@code getDefaultStopSet()} method invoked reflectively. If that fails for any reason,
 * the known constants of {@link StandardAnalyzer} and {@link StopAnalyzer} are used as a
 * fallback.
 *
 * @param clazzName name of the analyzer class, resolved via {@code getAnalyzerClass}
 * @return the default stop-word set of that analyzer class
 * @throws RuntimeException if no stop-word set can be determined; the reflective failure
 *         is attached as the cause
 */
protected Set<?> getStopWordsForClass(String clazzName) {
	Class<? extends Analyzer> analyzerClass = getAnalyzerClass(clazzName);
	try {
		// invoke(null): the method is expected to be static.
		return (Set<?>) analyzerClass.getMethod("getDefaultStopSet").invoke(null);
	} catch (Exception e) {
		// Deliberately broad: any lookup, invocation or cast failure falls back to the known constants.
		if (StandardAnalyzer.class.equals(analyzerClass)) {
			return StandardAnalyzer.STOP_WORDS_SET;
		}
		if (StopAnalyzer.class.equals(analyzerClass)) {
			return StopAnalyzer.ENGLISH_STOP_WORDS_SET;
		}
		// Keep the original failure as the cause so the reason for the lookup failure is not lost.
		throw new RuntimeException("Failed to find stop words from " + clazzName + " for language range "+languageRange, e);
	}
}

Source File: GraphUtil.java From SciGraph with Apache License 2.0

5 votes

/**
 * Decides whether a property value should be ignored.
 *
 * <p>TODO: This and every spot that uses it is a bit of a hack.
 * This should ideally be handled by the index.
 *
 * @param value the property value to inspect; anything that is not a {@link String} is
 *        never ignored
 * @return {@code true} if {@code value} is a string made up only of whitespace, or a string
 *         whose lower-cased form is contained in {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET};
 *         {@code false} otherwise
 */
public static boolean ignoreProperty(Object value) {
  if (!(value instanceof String)) {
    return false;
  }
  String text = (String) value;
  // Locale.ROOT keeps the lower-casing independent of the JVM default locale; with e.g. a
  // Turkish default locale "IT" would otherwise not lower-case to the stop word "it".
  return CharMatcher.WHITESPACE.matchesAllOf(text)
      || StopAnalyzer.ENGLISH_STOP_WORDS_SET.contains(text.toLowerCase(java.util.Locale.ROOT));
}

Source File: LuceneUtils.java From SciGraph with Apache License 2.0

5 votes

/**
 * Tells whether the given word equals, ignoring case, any entry of
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET}.
 *
 * @param word the word to look up
 * @return {@code true} if some entry of the set matches {@code word} case-insensitively
 */
public static boolean isStopword(String word) {
  for (Object entry : StopAnalyzer.ENGLISH_STOP_WORDS_SET) {
    // Each entry is treated as a char[] and turned into a String for the comparison.
    final String candidate = String.valueOf((char[]) entry);
    if (candidate.equalsIgnoreCase(word)) {
      return true;
    }
  }
  return false;
}

Source File: StopAnalyzerProvider.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Returns the {@link StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}

Source File: StandardHtmlStripAnalyzer.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Creates the analyzer, passing {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} to the
 * superclass constructor.
 *
 * @deprecated use {@link StandardHtmlStripAnalyzer#StandardHtmlStripAnalyzer(CharArraySet)} instead
 */
@Deprecated
public StandardHtmlStripAnalyzer() {
    super(StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}

Source File: DefaultAnalyzer.java From modernmt with Apache License 2.0

4 votes

/**
 * Creates the analyzer from the given configuration, passing
 * {@link StopAnalyzer#ENGLISH_STOP_WORDS_SET} to the superclass constructor.
 *
 * @param config the analyzer configuration, forwarded unchanged to the superclass
 */
protected DefaultAnalyzer(AnalyzerConfig config) {
    super(config, StopAnalyzer.ENGLISH_STOP_WORDS_SET);
}

Source File: PreBuiltAnalyzersTest.java From stratio-cassandra with Apache License 2.0

4 votes

/**
 * Verifies that {@code PreBuiltAnalyzers.STOP} supplies an analyzer whose runtime class is
 * exactly {@link StopAnalyzer}.
 */
@Test
public void testGetStop() {
    final Class<?> actualType = PreBuiltAnalyzers.STOP.get().getClass();
    Assert.assertEquals(StopAnalyzer.class, actualType);
}

Source File: LuceneAnalyzerIntegrationTest.java From tutorials with MIT License

4 votes

/**
 * Analyzes {@code SAMPLE_TEXT} with a default-constructed {@link StopAnalyzer} and checks the
 * exact sequence of resulting tokens.
 */
@Test
public void whenUseStopAnalyzer_thenAnalyzed() throws IOException {
    final StopAnalyzer stopAnalyzer = new StopAnalyzer();

    final List<String> tokens = analyze(SAMPLE_TEXT, stopAnalyzer);

    assertThat(tokens, contains("baeldung", "com", "lucene", "analyzers", "test"));
}

Source File: StopAnalyzerProvider.java From crate with Apache License 2.0

4 votes

/**
 * Creates the provider and builds its {@link StopAnalyzer}.
 *
 * <p>Stop words are obtained from {@code Analysis.parseStopWords}, with
 * {@code EnglishAnalyzer.ENGLISH_STOP_WORDS_SET} passed as the default set.
 */
public StopAnalyzerProvider(IndexSettings indexSettings, Environment env, String name, Settings settings) {
    super(indexSettings, name, settings);

    final CharArraySet configuredStopWords =
            Analysis.parseStopWords(env, settings, EnglishAnalyzer.ENGLISH_STOP_WORDS_SET);
    final StopAnalyzer built = new StopAnalyzer(configuredStopWords);
    built.setVersion(version);
    this.stopAnalyzer = built;
}

Source File: StopAnalyzerProvider.java From crate with Apache License 2.0

4 votes

/**
 * Returns the {@link StopAnalyzer} instance held by this provider.
 */
@Override
public StopAnalyzer get() {
    return stopAnalyzer;
}

org.apache.lucene.analysis.core.StopAnalyzer Java Examples