org.apache.lucene.analysis.util.CharArraySet Java Examples

Source File: SnowballAnalyzerBuilder.java From stratio-cassandra with Apache License 2.0

6 votes

/**
 * Builds a new {@link SnowballAnalyzerBuilder} for the specified language and stopwords.
 *
 * @param language  The language. The supported languages are English, French, Spanish, Portuguese, Italian,
 *                  Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian,
 *                  Turkish, Armenian, Basque and Catalan.
 * @param stopwords The comma separated stopwords {@code String}.
 */
@JsonCreator
public SnowballAnalyzerBuilder(@JsonProperty("language") final String language,
                               @JsonProperty("stopwords") String stopwords) {

    // Check language
    if (language == null || language.trim().isEmpty()) {
        throw new IllegalArgumentException("Language must be specified");
    }

    // Setup stopwords
    CharArraySet stops = stopwords == null ? getDefaultStopwords(language) : getStopwords(stopwords);

    // Setup analyzer
    this.analyzer = buildAnalyzer(language, stops);

    // Force analysis validation
    AnalysisUtils.analyzeAsText("test", analyzer);
}

Source File: Analysis.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Resolves the stem exclusion set from the {@code stem_exclusion} setting.
 * <p>
 * The setting is first read as a single string: the literal {@code _none_} yields the empty set, any other
 * value is split as a comma delimited list. If no string value is present the setting is read as an array.
 * All sets built here are case sensitive.
 *
 * @param settings             the settings to read {@code stem_exclusion} from
 * @param defaultStemExclusion the set returned when the setting is absent in both forms
 * @return the resolved stem exclusion set, or {@code defaultStemExclusion} when nothing is configured
 */
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) {
    final String raw = settings.get("stem_exclusion");
    if (raw != null) {
        // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
        return "_none_".equals(raw)
                ? CharArraySet.EMPTY_SET
                : new CharArraySet(Strings.commaDelimitedListToSet(raw), false);
    }

    final String[] entries = settings.getAsArray("stem_exclusion", null);
    if (entries == null) {
        return defaultStemExclusion;
    }
    // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)?
    return new CharArraySet(Arrays.asList(entries), false);
}

Source File: QueryParserImpl.java From AdSearch_Endpoints with Apache License 2.0

6 votes

/**
 * Splits a query string into terms with a {@link StandardTokenizer} and removes the default English stop
 * words. No stemming or other normalization is applied here.
 *
 * @param queryStr the raw query text; {@code null} is treated as an empty query
 * @return the surviving terms in the order they were produced; empty when there are none. If an
 *         {@link IOException} occurs while reading, the terms collected up to that point are returned.
 */
@Override
public List<String> parseQuery(String queryStr) {
    List<String> tokens = new ArrayList<String>();
    if (queryStr == null) {
        // new StringReader(null) would throw a NullPointerException
        return tokens;
    }

    Tokenizer tokenizer = new StandardTokenizer(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    tokenizer.setReader(new StringReader(queryStr));
    CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet();

    // try-with-resources closes the filter (and, through it, the wrapped tokenizer) even when
    // reset()/incrementToken()/end() throws, which the previous manual close() calls did not.
    try (TokenStream tokenStream = new StopFilter(tokenizer, stopWords)) {
        CharTermAttribute termAttribute = tokenStream.addAttribute(CharTermAttribute.class);
        tokenStream.reset();
        while (tokenStream.incrementToken()) {
            tokens.add(termAttribute.toString());
        }
        tokenStream.end();
    } catch (IOException e) {
        // Best effort, as before: report the failure and return whatever was tokenized so far.
        e.printStackTrace();
    }
    return tokens;
}

Source File: StandardAnalyzerProvider.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Creates the provider and configures its {@link StandardAnalyzer} from the given settings.
 * <p>
 * The default stop words depend on the version the index was created with: indices created on or after
 * {@code V_1_0_0_Beta1} default to no stop words, older ones to the English stop word set. The
 * {@code max_token_length} setting overrides {@link StandardAnalyzer#DEFAULT_MAX_TOKEN_LENGTH}.
 *
 * @param index         the index this analyzer belongs to
 * @param indexSettings the index level settings, used to determine the index creation version
 * @param env           the environment handed to the stop word parsing
 * @param name          the analyzer name
 * @param settings      the analyzer settings
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    // Pick the version dependent fallback stop word set
    final CharArraySet defaultStopwords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    final int tokenLengthLimit = settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(configuredStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}

Source File: PatternAnalyzerProvider.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Creates the provider and builds its {@link PatternAnalyzer} from the given settings.
 * <p>
 * Reads {@code lowercase} (default {@code true}), the stop words, {@code pattern} (default {@code \W+}) and
 * {@code flags}. Indices created on or after {@code V_1_0_0_RC1} default to no stop words, older ones to the
 * English stop word set.
 *
 * @param index                the index this analyzer belongs to
 * @param indexSettingsService source of the index level settings
 * @param env                  the environment handed to the stop word parsing
 * @param name                 the analyzer name, used in the error message
 * @param settings             the analyzer settings
 * @throws IllegalArgumentException if the resolved pattern is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    // Pick the version dependent fallback stop word set
    final CharArraySet defaultStopwords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final boolean toLowerCase = settings.getAsBoolean("lowercase", true);
    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

    final String patternText = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (patternText == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }
    final Pattern compiled = Regex.compile(patternText, settings.get("flags"));

    analyzer = new PatternAnalyzer(compiled, toLowerCase, configuredStopWords);
}

Source File: KuromojiUDF.java From incubator-hivemall with Apache License 2.0

5 votes

/**
 * Analyzes the first argument with a lazily created {@link JapaneseAnalyzer}.
 * <p>
 * The analyzer is built on the first call from the configured stop words, user dictionary, mode and stop
 * tags, and is reused afterwards.
 *
 * @param arguments the deferred UDF arguments; only the first one is read
 * @return {@code null} when the first argument is {@code null}, otherwise the result of {@code parseLine}
 * @throws HiveException if evaluation fails
 */
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
    // Build the analyzer once, on first use
    if (_analyzer == null) {
        final CharArraySet stopWordSet = stopWords(_stopWordsArray);

        // The user dictionary may be configured as a String[] or as a single String
        final UserDictionary dictionary;
        if (_userDictObj instanceof String[]) {
            dictionary = userDictionary((String[]) _userDictObj);
        } else if (_userDictObj instanceof String) {
            dictionary = userDictionary((String) _userDictObj);
        } else {
            dictionary = null;
        }

        this._analyzer = new JapaneseAnalyzer(dictionary, _mode, stopWordSet, _stopTags);
    }

    final Object firstArg = arguments[0].get();
    if (firstArg == null) {
        return null;
    }
    final String text = firstArg.toString();

    if (!_returnPos) {
        return parseLine(_analyzer, text);
    }
    return parseLine(_analyzer, text, _result);
}

Source File: Analysis.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Resolves a word set for the setting {@code name}.
 * <p>
 * If the setting has a string value, {@code _none_} yields the empty set and any other value is split as a
 * comma delimited list. Otherwise the list returned by {@code getWordList} is used when it is not
 * {@code null}. Entries are passed through {@code resolveNamedWords}.
 *
 * @param env          the environment handed to {@code getWordList}
 * @param settings     the settings to read from
 * @param name         the setting name
 * @param defaultWords the set returned when nothing is configured
 * @param namedWords   named word sets that entries may refer to; may be {@code null}
 * @param ignoreCase   whether the resulting set ignores case
 * @return the resolved word set, or {@code defaultWords} when nothing is configured
 */
public static CharArraySet parseWords(Environment env, Settings settings, String name, CharArraySet defaultWords, Map<String, Set<?>> namedWords, boolean ignoreCase) {
    final String inlineValue = settings.get(name);
    if (inlineValue == null) {
        // No inline value: fall back to the word list, then to the defaults
        final List<String> listedWords = getWordList(env, settings, name);
        return listedWords == null
                ? defaultWords
                : resolveNamedWords(listedWords, namedWords, ignoreCase);
    }
    if ("_none_".equals(inlineValue)) {
        return CharArraySet.EMPTY_SET;
    }
    return resolveNamedWords(Strings.commaDelimitedListToSet(inlineValue), namedWords, ignoreCase);
}

Source File: SnowballAnalyzerBuilder.java From stratio-cassandra with Apache License 2.0

5 votes

/**
 * Builds a case-insensitive stopwords {@link CharArraySet} from a comma separated {@code String}.
 * <p>
 * The text is split on commas and every resulting piece is trimmed before being added.
 *
 * @param stopwords A {@code String} comma separated stopwords list.
 * @return The stopwords {@link CharArraySet} containing the trimmed pieces of {@code stopwords}.
 */
private static CharArraySet getStopwords(String stopwords) {
    final String[] pieces = stopwords.split(",");
    final List<String> trimmed = new ArrayList<>(pieces.length);
    for (int i = 0; i < pieces.length; i++) {
        trimmed.add(pieces[i].trim());
    }
    // 'true' makes the set ignore case
    return new CharArraySet(trimmed, true);
}

Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0

5 votes

/**
 * Selects the stop word set for this configuration.
 * <p>
 * This is called only when we have already identified that
 * the class does support stopwords.
 *
 * @return the empty set when stop words are disabled; otherwise the result of
 *         {@code getStopWordsForClass} for {@code className} when the defaults are requested,
 *         or for {@code stopwords} when they are not
 */
public Set<?> getStopWords() {

	if (doNotUseStopWords()) {
		return CharArraySet.EMPTY_SET;
	}

	if (!useDefaultStopWords()) {
		return getStopWordsForClass(stopwords);
	}

	return getStopWordsForClass(className);
}

Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0

5 votes

/**
 * Creates a pattern based analyzer pair for the language range of {@code lro}.
 * <p>
 * Two {@code PatternAnalyzerImpl} instances are handed to the superclass constructor: one built with the
 * supplied stop words and one built with {@link CharArraySet#EMPTY_SET}.
 *
 * @param lro       the configuration whose {@code languageRange} is passed to the superclass
 * @param pattern   the pattern given to both {@code PatternAnalyzerImpl} instances
 * @param stopWords the stop words used by the first {@code PatternAnalyzerImpl}
 * @throws Exception declared by this constructor; NOTE(review): the source is not visible here
 */
public PatternAnalyzer(ConfigOptionsToAnalyzer lro, Pattern pattern, CharArraySet stopWords) throws Exception {
	// Earlier variant, kept for reference: passed a constructor lookup plus its arguments
	// instead of ready-made analyzer instances.
	/*
	super(lro.languageRange, getConstructor(PatternAnalyzerImpl.class,Pattern.class, CharArraySet.class), 
		pattern, stopWords);
		*/
	super(lro.languageRange, new PatternAnalyzerImpl(pattern, stopWords), new PatternAnalyzerImpl(pattern, CharArraySet.EMPTY_SET));
}

Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0

5 votes

/**
 * Copies a parameter array, replacing every {@link Set} element with {@link CharArraySet#EMPTY_SET}.
 *
 * @param params the original parameters; not modified
 * @return a new array of the same length with all {@code Set} elements swapped for the empty set
 */
private static Object[] useEmptyStopWordSet(Object[] params) {
	final Object[] replaced = new Object[params.length];
	int idx = 0;
	for (final Object param : params) {
		replaced[idx++] = (param instanceof Set) ? CharArraySet.EMPTY_SET : param;
	}
	return replaced;
}

Source File: LanguageAnalyzer.java From modernmt with Apache License 2.0

5 votes

protected LanguageAnalyzer(AnalyzerConfig config, CharArraySet defaultStopWordsSet) {
    super(config.stopWordsSet == null ? defaultStopWordsSet : config.stopWordsSet);

    if (config.enableStemming && config.stemmingExclusionSet != null)
        this.stemmingExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(config.stemmingExclusionSet));
    else
        this.stemmingExclusionSet = null;

    this.config = config;
}

Source File: KuromojiUDF.java From incubator-hivemall with Apache License 2.0

5 votes

/**
 * Converts an optional stop word array into a {@link CharArraySet}.
 *
 * @param array the stop words; {@code null} selects the default set of {@link JapaneseAnalyzer}
 * @return the default stop set for {@code null}, the empty set for an empty array, otherwise a
 *         case-insensitive set holding the array elements
 * @throws UDFArgumentException declared for callers; not thrown by the code in this method
 */
@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array)
        throws UDFArgumentException {
    if (array != null) {
        return array.length == 0
                ? CharArraySet.EMPTY_SET
                : new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
    }
    return JapaneseAnalyzer.getDefaultStopSet();
}

Source File: CommonMMSeg4jSegmenter.java From linden with Apache License 2.0

5 votes

/**
 * Loads the stop words from a file, one entry per line, into {@code stopWords}.
 * <p>
 * Does nothing when {@code stopWordsPath} is {@code null}. Duplicate lines are collapsed before the set
 * is copied into a {@link CharArraySet}.
 *
 * @param stopWordsPath path of the stop word file, or {@code null} to leave {@code stopWords} untouched
 * @throws RuntimeException if the file cannot be read; the underlying {@link IOException} is the cause
 */
private void initStopWords(String stopWordsPath) {
  if (stopWordsPath == null) {
    return;
  }
  try {
    List<String> lines = FileUtils.readLines(new File(stopWordsPath));
    Set<String> uniqueLines = new HashSet<>(lines);
    stopWords = CharArraySet.copy(uniqueLines);
  } catch (IOException e) {
    // Chain the IOException so the real reason (missing file, permissions, ...) is not lost
    throw new RuntimeException("Read stop words failed path : " + stopWordsPath, e);
  }
}

Source File: KeywordMarkerTokenFilterFactory.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Creates the factory and loads its keyword lookup set.
 * <p>
 * The keywords are read through {@code Analysis.getWordSet} with the {@code keywords} prefix; the
 * {@code ignore_case} setting (default {@code false}) controls the case sensitivity of the lookup set.
 *
 * @param index                the index this filter belongs to
 * @param indexSettingsService source of the index level settings
 * @param env                  the environment handed to the word set loading
 * @param name                 the filter name
 * @param settings             the filter settings
 * @throws IllegalArgumentException if no keywords are configured
 */
@Inject
public KeywordMarkerTokenFilterFactory(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final boolean caseInsensitive = settings.getAsBoolean("ignore_case", false);
    final Set<?> keywords = Analysis.getWordSet(env, settings, "keywords");
    if (keywords != null) {
        keywordLookup = new CharArraySet(keywords, caseInsensitive);
    } else {
        throw new IllegalArgumentException("keyword filter requires either `keywords` or `keywords_path` to be configured");
    }
}

Source File: BasqueAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public BasqueAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: IrishAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public IrishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new IrishAnalyzer(Analysis.parseStopWords(env, settings, IrishAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: LindenStandardAnalyzerFactory.java From linden with Apache License 2.0

5 votes

/**
 * Creates a {@link StandardAnalyzer}, optionally without stop words.
 *
 * @param params factory parameters; when {@code STOPWORDS_EMPTY} is present and parses to {@code true},
 *               the analyzer is created with an empty stop word set
 * @return an analyzer with an empty stop word set when requested, otherwise one created with the
 *         no-argument constructor
 * @throws IOException declared by the overridden method
 */
@Override
public StandardAnalyzer getInstance(Map<String, String> params) throws IOException {
  final boolean emptyStopWords =
      params.containsKey(STOPWORDS_EMPTY) && Boolean.parseBoolean(params.get(STOPWORDS_EMPTY));
  return emptyStopWords ? new StandardAnalyzer(CharArraySet.EMPTY_SET) : new StandardAnalyzer();
}

Source File: Analysis.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Builds a {@link CharArraySet} from the word list configured under {@code settingsPrefix}.
 * <p>
 * Case handling is taken from the boolean setting {@code settingsPrefix + "_case"} (default {@code false}).
 *
 * @param env            the environment handed to {@code getWordList}
 * @param settings       the settings to read from
 * @param settingsPrefix the setting name prefix of the word list
 * @return the word set, or {@code null} when {@code getWordList} returns {@code null}
 */
public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix) {
    final List<String> words = getWordList(env, settings, settingsPrefix);
    return words == null
            ? null
            : new CharArraySet(words, settings.getAsBoolean(settingsPrefix + "_case", false));
}

Source File: Analysis.java From Elasticsearch with Apache License 2.0

5 votes

/**
 * Builds a {@link CharArraySet} from {@code words}, expanding entries that name a predefined word set.
 * <p>
 * An entry that is a key of {@code namedWords} is replaced by all words of the mapped set; any other entry
 * is added as is. When {@code namedWords} is {@code null}, the words are used unchanged.
 *
 * @param words      the configured entries
 * @param namedWords named word sets keyed by name; may be {@code null}
 * @param ignoreCase whether the resulting set ignores case
 * @return the resolved word set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords, boolean ignoreCase) {
    if (namedWords != null) {
        final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
        for (final String entry : words) {
            final boolean isNamedSet = namedWords.containsKey(entry);
            if (isNamedSet) {
                resolved.addAll(namedWords.get(entry));
            } else {
                resolved.add(entry);
            }
        }
        return resolved;
    }
    // Nothing to expand
    return new CharArraySet(words, ignoreCase);
}

Source File: LatvianAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public LatvianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new LatvianAnalyzer(Analysis.parseStopWords(env, settings, LatvianAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: LithuanianAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public LithuanianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new LithuanianAnalyzer(Analysis.parseStopWords(env, settings, LithuanianAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: HindiAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public HindiAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new HindiAnalyzer(Analysis.parseStopWords(env, settings, HindiAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: SoraniAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public SoraniAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new SoraniAnalyzer(Analysis.parseStopWords(env, settings, SoraniAnalyzer.getDefaultStopSet()),
                                  Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: CatalanAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public CatalanAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new CatalanAnalyzer(Analysis.parseStopWords(env, settings, CatalanAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: TurkishAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public TurkishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new TurkishAnalyzer(Analysis.parseStopWords(env, settings, TurkishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: BulgarianAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public BulgarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new BulgarianAnalyzer(Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()),
                                     Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: RussianAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public RussianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new RussianAnalyzer(Analysis.parseStopWords(env, settings, RussianAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: DutchAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public DutchAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new DutchAnalyzer(Analysis.parseStopWords(env, settings, DutchAnalyzer.getDefaultStopSet()),
                                 Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

Source File: FinnishAnalyzerProvider.java From Elasticsearch with Apache License 2.0

5 votes

@Inject
public FinnishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);
    analyzer = new FinnishAnalyzer(Analysis.parseStopWords(env, settings, FinnishAnalyzer.getDefaultStopSet()),
                                   Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET));
    analyzer.setVersion(version);
}

org.apache.lucene.analysis.util.CharArraySet Java Examples