org.apache.lucene.analysis.util.CharArraySet Java Examples
The following examples show how to use
org.apache.lucene.analysis.util.CharArraySet.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SnowballAnalyzerBuilder.java From stratio-cassandra with Apache License 2.0 | 6 votes |
/** * Builds a new {@link SnowballAnalyzerBuilder} for the specified language and stopwords. * * @param language The language. The supported languages are English, French, Spanish, Portuguese, Italian, * Romanian, German, Dutch, Swedish, Norwegian, Danish, Russian, Finnish, Irish, Hungarian, * Turkish, Armenian, Basque and Catalan. * @param stopwords The comma separated stopwords {@code String}. */ @JsonCreator public SnowballAnalyzerBuilder(@JsonProperty("language") final String language, @JsonProperty("stopwords") String stopwords) { // Check language if (language == null || language.trim().isEmpty()) { throw new IllegalArgumentException("Language must be specified"); } // Setup stopwords CharArraySet stops = stopwords == null ? getDefaultStopwords(language) : getStopwords(stopwords); // Setup analyzer this.analyzer = buildAnalyzer(language, stops); // Force analysis validation AnalysisUtils.analyzeAsText("test", analyzer); }
Example #2
Source File: Analysis.java From Elasticsearch with Apache License 2.0 | 6 votes |
public static CharArraySet parseStemExclusion(Settings settings, CharArraySet defaultStemExclusion) { String value = settings.get("stem_exclusion"); if (value != null) { if ("_none_".equals(value)) { return CharArraySet.EMPTY_SET; } else { // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)? return new CharArraySet(Strings.commaDelimitedListToSet(value), false); } } String[] stemExclusion = settings.getAsArray("stem_exclusion", null); if (stemExclusion != null) { // LUCENE 4 UPGRADE: Should be settings.getAsBoolean("stem_exclusion_case", false)? return new CharArraySet(Arrays.asList(stemExclusion), false); } else { return defaultStemExclusion; } }
Example #3
Source File: QueryParserImpl.java From AdSearch_Endpoints with Apache License 2.0 | 6 votes |
@Override public List<String> parseQuery(String queryStr) { // tokenize queryStr, remove stop word, stemming List<String> tokens = new ArrayList<String>(); AttributeFactory factory = AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY; Tokenizer tokenizer = new StandardTokenizer(factory); tokenizer.setReader(new StringReader(queryStr)); CharArraySet stopWords = EnglishAnalyzer.getDefaultStopSet(); TokenStream tokenStream = new StopFilter(tokenizer, stopWords); // StringBuilder sb = new StringBuilder(); CharTermAttribute charTermAttribute = tokenizer.addAttribute(CharTermAttribute.class); try { tokenStream.reset(); while (tokenStream.incrementToken()) { String term = charTermAttribute.toString(); tokens.add(term); // sb.append(term + " "); } tokenStream.end(); tokenStream.close(); tokenizer.close(); } catch (IOException e) { e.printStackTrace(); } // System.out.println("QU="+ sb.toString()); return tokens; }
Example #4
Source File: StandardAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 6 votes |
/**
 * Creates the provider and configures its {@link StandardAnalyzer}.
 * <p>
 * The default stop word set depends on the version the index was created with:
 * {@link CharArraySet#EMPTY_SET} on or after {@code Version.V_1_0_0_Beta1}, otherwise
 * {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}. The maximum token length is read from
 * {@code max_token_length}.
 *
 * @param index         the index this provider belongs to
 * @param indexSettings the index level settings
 * @param env           the environment passed to {@code Analysis.parseStopWords}
 * @param name          the analyzer name
 * @param settings      the analyzer settings
 */
public StandardAnalyzerProvider(Index index, Settings indexSettings, Environment env, String name,
                                Settings settings) {
    super(index, indexSettings, name, settings);
    this.esVersion = Version.indexCreated(indexSettings);

    final CharArraySet defaultStopwords = esVersion.onOrAfter(Version.V_1_0_0_Beta1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, defaultStopwords);
    final int tokenLengthLimit =
            settings.getAsInt("max_token_length", StandardAnalyzer.DEFAULT_MAX_TOKEN_LENGTH);

    standardAnalyzer = new StandardAnalyzer(configuredStopWords);
    standardAnalyzer.setVersion(version);
    standardAnalyzer.setMaxTokenLength(tokenLengthLimit);
}
Example #5
Source File: PatternAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 6 votes |
/**
 * Creates the provider and builds its {@link PatternAnalyzer} from the analyzer settings.
 * <p>
 * Reads {@code lowercase} (default {@code true}), the stop words, {@code pattern}
 * (default {@code \W+}) and {@code flags}. The default stop word set is
 * {@link CharArraySet#EMPTY_SET} for indices created on or after
 * {@code Version.V_1_0_0_RC1}, otherwise {@code StopAnalyzer.ENGLISH_STOP_WORDS_SET}.
 *
 * @param index                the index this provider belongs to
 * @param indexSettingsService source of the index level settings
 * @param env                  the environment passed to {@code Analysis.parseStopWords}
 * @param name                 the analyzer name, used in the error message
 * @param settings             the analyzer settings
 * @throws IllegalArgumentException if the resolved pattern is {@code null}
 */
@Inject
public PatternAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env,
                               @Assisted String name, @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final Version createdVersion = Version.indexCreated(indexSettingsService.getSettings());
    final CharArraySet defaultStopwords = createdVersion.onOrAfter(Version.V_1_0_0_RC1)
            ? CharArraySet.EMPTY_SET
            : StopAnalyzer.ENGLISH_STOP_WORDS_SET;

    final boolean toLowerCase = settings.getAsBoolean("lowercase", true);
    final CharArraySet configuredStopWords = Analysis.parseStopWords(env, settings, defaultStopwords);

    final String patternText = settings.get("pattern", "\\W+" /*PatternAnalyzer.NON_WORD_PATTERN*/);
    if (patternText == null) {
        throw new IllegalArgumentException("Analyzer [" + name + "] of type pattern must have a `pattern` set");
    }

    final Pattern compiled = Regex.compile(patternText, settings.get("flags"));
    analyzer = new PatternAnalyzer(compiled, toLowerCase, configuredStopWords);
}
Example #6
Source File: KuromojiUDF.java From incubator-hivemall with Apache License 2.0 | 5 votes |
/**
 * Analyzes the first argument with a {@link JapaneseAnalyzer} that is created on the
 * first call and reused afterwards.
 *
 * @param arguments the deferred UDF arguments; only {@code arguments[0]} is read here
 * @return {@code null} when the first argument is {@code null}; otherwise the result of
 *         {@code parseLine}, using the three argument overload when {@code _returnPos} is set
 * @throws HiveException declared by the UDF contract
 */
@Override
public Object evaluate(DeferredObject[] arguments) throws HiveException {
    // Create the analyzer on first use
    if (_analyzer == null) {
        final CharArraySet stopWordSet = stopWords(_stopWordsArray);

        final UserDictionary dictionary;
        if (_userDictObj instanceof String[]) {
            dictionary = userDictionary((String[]) _userDictObj);
        } else if (_userDictObj instanceof String) {
            dictionary = userDictionary((String) _userDictObj);
        } else {
            dictionary = null;
        }

        this._analyzer = new JapaneseAnalyzer(dictionary, _mode, stopWordSet, _stopTags);
    }

    final Object firstArg = arguments[0].get();
    if (firstArg == null) {
        return null;
    }

    final String text = firstArg.toString();
    if (!_returnPos) {
        return parseLine(_analyzer, text);
    }
    return parseLine(_analyzer, text, _result);
}
Example #7
Source File: Analysis.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Resolves a word set from the setting called {@code name}.
 * <p>
 * A string value of {@code _none_} yields {@link CharArraySet#EMPTY_SET}; any other
 * string value is split as a comma delimited list. Without a string value the words
 * are taken from {@code getWordList(env, settings, name)}. Both kinds of list are
 * passed through {@code resolveNamedWords}. If nothing is configured the default is returned.
 *
 * @param env          the environment passed to {@code getWordList}
 * @param settings     the settings to read from
 * @param name         the setting name
 * @param defaultWords the set returned when nothing is configured
 * @param namedWords   named word sets handed to {@code resolveNamedWords}
 * @param ignoreCase   case flag handed to {@code resolveNamedWords}
 * @return the resolved word set
 */
public static CharArraySet parseWords(Environment env, Settings settings, String name,
                                      CharArraySet defaultWords, Map<String, Set<?>> namedWords,
                                      boolean ignoreCase) {
    final String inlineValue = settings.get(name);
    if (inlineValue != null) {
        return "_none_".equals(inlineValue)
                ? CharArraySet.EMPTY_SET
                : resolveNamedWords(Strings.commaDelimitedListToSet(inlineValue), namedWords, ignoreCase);
    }

    final List<String> loadedWords = getWordList(env, settings, name);
    if (loadedWords == null) {
        return defaultWords;
    }
    return resolveNamedWords(loadedWords, namedWords, ignoreCase);
}
Example #8
Source File: SnowballAnalyzerBuilder.java From stratio-cassandra with Apache License 2.0 | 5 votes |
/**
 * Builds the stopwords {@link CharArraySet} from a comma separated {@code String}.
 * Each entry is trimmed before it is added; the set is created with its
 * {@code ignoreCase} argument set to {@code true}.
 *
 * @param stopwords A {@code String} comma separated stopwords list.
 * @return The stopwords {@link CharArraySet} for the specified comma separated stopwords {@code String}.
 */
private static CharArraySet getStopwords(String stopwords) {
    final String[] rawEntries = stopwords.split(",");
    final List<String> trimmedEntries = new ArrayList<>(rawEntries.length);
    for (int i = 0; i < rawEntries.length; i++) {
        trimmedEntries.add(rawEntries[i].trim());
    }
    return new CharArraySet(trimmedEntries, true);
}
Example #9
Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0 | 5 votes |
/**
 * This is called only when we have already identified that
 * the class does support stopwords.
 *
 * @return {@link CharArraySet#EMPTY_SET} when stop words are disabled; otherwise the
 *         result of {@code getStopWordsForClass}, called with {@code className} when the
 *         defaults are requested and with {@code stopwords} in every other case
 */
public Set<?> getStopWords() {
    if (doNotUseStopWords()) {
        return CharArraySet.EMPTY_SET;
    }

    final Set<?> resolved;
    if (useDefaultStopWords()) {
        resolved = getStopWordsForClass(className);
    } else {
        resolved = getStopWordsForClass(stopwords);
    }
    return resolved;
}
Example #10
Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0 | 5 votes |
/**
 * Creates the pair of pattern analyzers for a language range: one built with the
 * supplied stop words and one built with {@link CharArraySet#EMPTY_SET}.
 *
 * @param lro       configuration whose {@code languageRange} is passed to the superclass
 * @param pattern   the pattern handed to both {@code PatternAnalyzerImpl} instances
 * @param stopWords the stop words for the first analyzer
 * @throws Exception declared by this constructor
 */
public PatternAnalyzer(ConfigOptionsToAnalyzer lro, Pattern pattern, CharArraySet stopWords)
        throws Exception {
    super(
            lro.languageRange,
            new PatternAnalyzerImpl(pattern, stopWords),
            new PatternAnalyzerImpl(pattern, CharArraySet.EMPTY_SET));
}
Example #11
Source File: NeedsConfiguringAnalyzerFactory.java From database with GNU General Public License v2.0 | 5 votes |
/**
 * Returns a copy of the parameter array in which every element that is a
 * {@link Set} is replaced by {@link CharArraySet#EMPTY_SET}. All other elements
 * are carried over unchanged; the input array is not modified.
 *
 * @param params the original parameters
 * @return a new array of the same length
 */
private static Object[] useEmptyStopWordSet(Object[] params) {
    final Object[] replaced = new Object[params.length];
    int position = 0;
    for (Object param : params) {
        replaced[position++] = (param instanceof Set) ? CharArraySet.EMPTY_SET : param;
    }
    return replaced;
}
Example #12
Source File: LanguageAnalyzer.java From modernmt with Apache License 2.0 | 5 votes |
protected LanguageAnalyzer(AnalyzerConfig config, CharArraySet defaultStopWordsSet) { super(config.stopWordsSet == null ? defaultStopWordsSet : config.stopWordsSet); if (config.enableStemming && config.stemmingExclusionSet != null) this.stemmingExclusionSet = CharArraySet.unmodifiableSet(CharArraySet.copy(config.stemmingExclusionSet)); else this.stemmingExclusionSet = null; this.config = config; }
Example #13
Source File: KuromojiUDF.java From incubator-hivemall with Apache License 2.0 | 5 votes |
/**
 * Converts an optional array of stop words into a {@link CharArraySet}.
 *
 * @param array the stop words; may be {@code null}
 * @return {@link JapaneseAnalyzer#getDefaultStopSet()} for {@code null},
 *         {@link CharArraySet#EMPTY_SET} for an empty array, otherwise a new set
 *         of the array elements created with {@code ignoreCase} set to {@code true}
 * @throws UDFArgumentException declared by this method
 */
@Nonnull
private static CharArraySet stopWords(@Nullable final String[] array) throws UDFArgumentException {
    final CharArraySet result;
    if (array == null) {
        result = JapaneseAnalyzer.getDefaultStopSet();
    } else if (array.length == 0) {
        result = CharArraySet.EMPTY_SET;
    } else {
        result = new CharArraySet(Arrays.asList(array), /* ignoreCase */true);
    }
    return result;
}
Example #14
Source File: CommonMMSeg4jSegmenter.java From linden with Apache License 2.0 | 5 votes |
/**
 * Loads the stop words from a file, one entry per line, into {@code stopWords}.
 * Duplicate lines are collapsed. Nothing happens when the path is {@code null}.
 *
 * @param stopWordsPath path of the stop word file; may be {@code null}
 * @throws RuntimeException if the file cannot be read; the underlying
 *                          {@link IOException} is attached as the cause
 */
private void initStopWords(String stopWordsPath) {
    if (stopWordsPath == null) {
        return;
    }
    try {
        List<String> lines = FileUtils.readLines(new File(stopWordsPath));
        Set<String> uniqueWords = new HashSet<>(lines);
        stopWords = CharArraySet.copy(uniqueWords);
    } catch (IOException e) {
        // Keep the original exception as the cause so the real I/O failure stays visible
        throw new RuntimeException("Read stop words failed path : " + stopWordsPath, e);
    }
}
Example #15
Source File: KeywordMarkerTokenFilterFactory.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Creates the factory and loads the keyword lookup set.
 * <p>
 * The words come from {@code Analysis.getWordSet} with the {@code keywords} prefix;
 * case handling is taken from the {@code ignore_case} setting (default {@code false}).
 *
 * @param index                the index this factory belongs to
 * @param indexSettingsService source of the index level settings
 * @param env                  the environment passed to {@code Analysis.getWordSet}
 * @param name                 the filter name
 * @param settings             the filter settings
 * @throws IllegalArgumentException if no keyword set could be loaded
 */
@Inject
public KeywordMarkerTokenFilterFactory(Index index, IndexSettingsService indexSettingsService,
                                       Environment env, @Assisted String name,
                                       @Assisted Settings settings) {
    super(index, indexSettingsService.getSettings(), name, settings);

    final boolean caseInsensitive = settings.getAsBoolean("ignore_case", false);
    final Set<?> keywords = Analysis.getWordSet(env, settings, "keywords");
    if (keywords == null) {
        throw new IllegalArgumentException("keyword filter requires either `keywords` or `keywords_path` to be configured");
    }
    keywordLookup = new CharArraySet(keywords, caseInsensitive);
}
Example #16
Source File: BasqueAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public BasqueAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new BasqueAnalyzer(Analysis.parseStopWords(env, settings, BasqueAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #17
Source File: IrishAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public IrishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new IrishAnalyzer(Analysis.parseStopWords(env, settings, IrishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #18
Source File: LindenStandardAnalyzerFactory.java From linden with Apache License 2.0 | 5 votes |
/**
 * Creates a {@link StandardAnalyzer} from the factory parameters.
 *
 * @param params the factory parameters
 * @return an analyzer built with {@link CharArraySet#EMPTY_SET} when the
 *         {@code STOPWORDS_EMPTY} parameter is present and parses to {@code true};
 *         otherwise an analyzer built with the no-argument constructor
 * @throws IOException declared by the factory contract
 */
@Override
public StandardAnalyzer getInstance(Map<String, String> params) throws IOException {
    final boolean emptyStopWordsRequested =
            params.containsKey(STOPWORDS_EMPTY) && Boolean.parseBoolean(params.get(STOPWORDS_EMPTY));
    if (!emptyStopWordsRequested) {
        return new StandardAnalyzer();
    }
    return new StandardAnalyzer(CharArraySet.EMPTY_SET);
}
Example #19
Source File: Analysis.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Loads the word list for a settings prefix and wraps it in a {@link CharArraySet}.
 * The case flag of the set is read from the setting {@code settingsPrefix + "_case"}
 * (default {@code false}).
 *
 * @param env            the environment passed to {@code getWordList}
 * @param settings       the settings to read from
 * @param settingsPrefix the prefix identifying the word list
 * @return the word set, or {@code null} when {@code getWordList} returns {@code null}
 */
public static CharArraySet getWordSet(Environment env, Settings settings, String settingsPrefix) {
    final List<String> loadedWords = getWordList(env, settings, settingsPrefix);
    if (loadedWords != null) {
        final boolean caseFlag = settings.getAsBoolean(settingsPrefix + "_case", false);
        return new CharArraySet(loadedWords, caseFlag);
    }
    return null;
}
Example #20
Source File: Analysis.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Builds a {@link CharArraySet} from a collection of words, expanding entries that
 * are keys of {@code namedWords} into the set mapped to that key.
 *
 * @param words      the configured words, possibly containing keys of {@code namedWords}
 * @param namedWords the named word sets; when {@code null} the words are used as they are
 * @param ignoreCase case flag passed to the {@link CharArraySet} constructor
 * @return the resulting word set
 */
private static CharArraySet resolveNamedWords(Collection<String> words, Map<String, Set<?>> namedWords,
                                              boolean ignoreCase) {
    if (namedWords == null) {
        return new CharArraySet(words, ignoreCase);
    }

    final CharArraySet resolved = new CharArraySet(words.size(), ignoreCase);
    for (String entry : words) {
        if (!namedWords.containsKey(entry)) {
            // Not a named set: keep the word itself
            resolved.add(entry);
            continue;
        }
        resolved.addAll(namedWords.get(entry));
    }
    return resolved;
}
Example #21
Source File: LatvianAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public LatvianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new LatvianAnalyzer(Analysis.parseStopWords(env, settings, LatvianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #22
Source File: LithuanianAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public LithuanianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new LithuanianAnalyzer(Analysis.parseStopWords(env, settings, LithuanianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #23
Source File: HindiAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public HindiAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new HindiAnalyzer(Analysis.parseStopWords(env, settings, HindiAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #24
Source File: SoraniAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public SoraniAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new SoraniAnalyzer(Analysis.parseStopWords(env, settings, SoraniAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #25
Source File: CatalanAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public CatalanAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new CatalanAnalyzer(Analysis.parseStopWords(env, settings, CatalanAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #26
Source File: TurkishAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public TurkishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new TurkishAnalyzer(Analysis.parseStopWords(env, settings, TurkishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #27
Source File: BulgarianAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public BulgarianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new BulgarianAnalyzer(Analysis.parseStopWords(env, settings, BulgarianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #28
Source File: RussianAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public RussianAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new RussianAnalyzer(Analysis.parseStopWords(env, settings, RussianAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #29
Source File: DutchAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public DutchAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new DutchAnalyzer(Analysis.parseStopWords(env, settings, DutchAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }
Example #30
Source File: FinnishAnalyzerProvider.java From Elasticsearch with Apache License 2.0 | 5 votes |
@Inject public FinnishAnalyzerProvider(Index index, IndexSettingsService indexSettingsService, Environment env, @Assisted String name, @Assisted Settings settings) { super(index, indexSettingsService.getSettings(), name, settings); analyzer = new FinnishAnalyzer(Analysis.parseStopWords(env, settings, FinnishAnalyzer.getDefaultStopSet()), Analysis.parseStemExclusion(settings, CharArraySet.EMPTY_SET)); analyzer.setVersion(version); }