org.apache.lucene.analysis.ngram.NGramTokenizer Java Examples
The following examples show how to use
org.apache.lucene.analysis.ngram.NGramTokenizer.
Each example is taken from a real open-source project; the header above each snippet names the source file, the originating project, and its license.
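Before the project-specific examples, here is a minimal, self-contained sketch of the reset()/incrementToken()/end() loop that most of them build on. It is not taken from any of the projects below; the class name, input string, and gram sizes (2, 3) are arbitrary choices for illustration.

import java.io.IOException;
import java.io.StringReader;
import java.util.ArrayList;
import java.util.List;

import org.apache.lucene.analysis.ngram.NGramTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

public class NGramDemo {
    public static void main(String[] args) throws IOException {
        List<String> grams = new ArrayList<>();
        // Emit every 2-gram and 3-gram of the input string.
        try (NGramTokenizer tokenizer = new NGramTokenizer(2, 3)) {
            tokenizer.setReader(new StringReader("lucene"));
            CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
            tokenizer.reset();                 // required before the first incrementToken()
            while (tokenizer.incrementToken()) {
                grams.add(term.toString());
            }
            tokenizer.end();                   // finalizes end-of-stream offsets
        }
        // e.g. [lu, luc, uc, uce, ce, cen, en, ene, ne] (ordering per recent Lucene)
        System.out.println(grams);
    }
}

Forgetting reset() before the first incrementToken() call is the most common mistake with this API; in recent Lucene releases the tokenizer then fails with an IllegalStateException describing the contract violation.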
Example #1
Source File: Utils.java From fuzzy-matcher with Apache License 2.0
public static Stream<String> getNGrams(String value, int size) {
    Stream.Builder<String> stringStream = Stream.builder();
    if (value.length() <= size) {
        stringStream.add(value);
    } else {
        NGramTokenizer nGramTokenizer = new NGramTokenizer(size, size);
        CharTermAttribute charTermAttribute = nGramTokenizer.addAttribute(CharTermAttribute.class);
        nGramTokenizer.setReader(new StringReader(value));
        try {
            nGramTokenizer.reset();
            while (nGramTokenizer.incrementToken()) {
                stringStream.add(charTermAttribute.toString());
            }
            nGramTokenizer.end();
            nGramTokenizer.close();
        } catch (IOException io) {
            throw new MatchException("Failure in creating tokens : ", io);
        }
    }
    return stringStream.build();
}
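For instance, getNGrams("jolly", 3) yields the trigrams "jol", "oll" and "lly", while an input no longer than size, such as getNGrams("hi", 3), is passed through whole as a single token.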
Example #2
Source File: NGramTokenizationStrategy.java From datawave with Apache License 2.0
/**
 * Increments the tokenizer and returns the next n-gram in the stream, or null at some
 * termination state, such as EOS.
 *
 * @param tokenizer
 *            The tokenizer responsible for generating the next available n-gram
 * @return the next n-gram in the stream, or null at some termination state, such as EOS
 */
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    String ngram = super.increment(tokenizer);
    if (null == ngram) {
        try {
            if ((null != tokenizer) && tokenizer.incrementToken()) {
                final CharTermAttribute charTermAttribute = tokenizer.getAttribute(CharTermAttribute.class);
                if (null != charTermAttribute) {
                    ngram = charTermAttribute.toString();
                    charTermAttribute.resizeBuffer(0);
                } else {
                    ngram = null;
                }
            } else {
                ngram = null;
            }
        } catch (final IOException e) {
            throw new TokenizationException("Could not get next n-gram from NGramTokenizer", e);
        }
    }
    return ngram;
}
Example #3
Source File: TestICUNormalizer2CharFilter.java From lucene-solr with Apache License 2.0
public void testTokenStream2() throws IOException {
    // '㌰', '<<'゙, '5', '℃', '№', '㈱', '㌘', 'サ', '<<', 'ソ', '<<'
    String input = "㌰゙5℃№㈱㌘ザゾ";

    CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
        Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

    Tokenizer tokenStream = new NGramTokenizer(newAttributeFactory(), 1, 1);
    tokenStream.setReader(reader);

    assertTokenStreamContents(tokenStream,
        new String[] {"ピ", "ゴ", "5", "°", "c", "n", "o", "(", "株", ")", "グ", "ラ", "ム", "ザ", "ゾ"},
        new int[]{0, 1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9},
        new int[]{1, 2, 3, 3, 4, 4, 5, 5, 5, 6, 6, 6, 7, 9, 11},
        input.length());
}
Example #4
Source File: TestConditionalTokenFilter.java From lucene-solr with Apache License 2.0
public void testConsistentOffsets() throws IOException {
    long seed = random().nextLong();
    Analyzer analyzer = new Analyzer() {
        @Override
        protected TokenStreamComponents createComponents(String fieldName) {
            Tokenizer source = new NGramTokenizer();
            TokenStream sink = new ValidatingTokenFilter(new KeywordRepeatFilter(source), "stage 0");
            sink = new ValidatingTokenFilter(sink, "stage 1");
            sink = new RandomSkippingFilter(sink, seed, in -> new TypeTokenFilter(in, Collections.singleton("word")));
            sink = new ValidatingTokenFilter(sink, "last stage");
            return new TokenStreamComponents(source, sink);
        }
    };
    checkRandomData(random(), analyzer, 1);
}
Example #5
Source File: CommonAnalysisPlugin.java From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
        () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE, EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern",
        () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new, () -> new TokenFilterFactory() {
        @Override
        public String name() {
            return "lowercase";
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            return new LowerCaseFilter(tokenStream);
        }
    }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));
    return tokenizers;
}
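Note that the "ngram" entry uses NGramTokenizer's no-argument constructor, which falls back to NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE (1) and DEFAULT_MAX_NGRAM_SIZE (2), so the pre-configured tokenizer emits unigrams and bigrams.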
Example #6
Source File: WeightedValuePruningStrategy.java From datawave with Apache License 2.0
@Override
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    this.incrementCount++;
    final String ngram;
    if (this.incrementCount <= this.maxIncrementCount) {
        ngram = super.increment(tokenizer);
    } else {
        ngram = null;
    }
    return ngram;
}
Example #7
Source File: AbstractNGramTokenizationStrategy.java From datawave with Apache License 2.0
/**
 * Increments the tokenizer and returns the next n-gram in the stream, or null if no n-gram
 * was generated.
 *
 * @param tokenizer
 *            The tokenizer responsible for generating the next available n-gram
 * @return the next n-gram in the stream, or null if no n-gram was generated
 */
protected String increment(final NGramTokenizer tokenizer) throws TokenizationException {
    final AbstractNGramTokenizationStrategy source = this.getSourceStrategy();
    final String ngram;
    if (null != source) {
        ngram = source.increment(tokenizer);
    } else {
        ngram = null;
    }
    return ngram;
}
Example #8
Source File: NGramLuceneQuery.java From onedev with MIT License
private static PhraseQuery build(String fieldName, String fieldValue, int gramSize) {
    Preconditions.checkArgument(fieldValue.length() >= gramSize);

    PhraseQuery.Builder builder = new PhraseQuery.Builder();
    try (NGramTokenizer tokenizer = new NGramTokenizer(gramSize, gramSize)) {
        tokenizer.setReader(new StringReader(fieldValue.toLowerCase()));
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            builder.add(new Term(fieldName,
                tokenizer.getAttribute(CharTermAttribute.class).toString()));
        }
    } catch (IOException e) {
        throw new RuntimeException(e);
    }
    return builder.build();
}
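The resulting PhraseQuery matches documents whose field contains the same grams at consecutive positions. A brief sketch of executing it, assuming dir is a hypothetical, already-populated Directory whose field was indexed with the same gram size:

// query: the PhraseQuery returned by build("content", "needle", 3) above
try (DirectoryReader reader = DirectoryReader.open(dir)) {   // dir: hypothetical Directory
    IndexSearcher searcher = new IndexSearcher(reader);
    TopDocs hits = searcher.search(query, 10);               // top 10 documents with all grams in order
    System.out.println(hits.totalHits + " hit(s)");
}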
Example #9
Source File: EdgeNGramTokenizerFactory.java From Elasticsearch with Apache License 2.0
public EdgeNGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.side = Lucene43EdgeNGramTokenizer.Side.getSide(settings.get("side", Lucene43EdgeNGramTokenizer.DEFAULT_SIDE.getLabel()));
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
Example #10
Source File: NGramTokenizerFactory.java From Elasticsearch with Apache License 2.0
NGramTokenizerFactory(Index index, Settings indexSettings, String name, Settings settings) {
    super(index, indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsArray("token_chars"));
    this.esVersion = org.elasticsearch.Version.indexCreated(indexSettings);
}
Example #11
Source File: NGramTokenizerFactory.java From crate with Apache License 2.0
NGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    int maxAllowedNgramDiff = indexSettings.getMaxNgramDiff();
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    int ngramDiff = maxGram - minGram;
    if (ngramDiff > maxAllowedNgramDiff) {
        deprecationLogger.deprecated("Deprecated big difference between max_gram and min_gram in NGram Tokenizer,"
            + "expected difference must be less than or equal to: [" + maxAllowedNgramDiff + "]");
    }
    this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}
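The permitted spread between max_gram and min_gram comes from IndexSettings.getMaxNgramDiff(), which is backed by the index-level index.max_ngram_diff setting; here a configuration that exceeds it only draws a deprecation warning rather than an error.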
Example #12
Source File: NGramTokenizerFactory.java From crate with Apache License 2.0
@Override
public Tokenizer create() {
    if (matcher == null) {
        return new NGramTokenizer(minGram, maxGram);
    } else {
        return new NGramTokenizer(minGram, maxGram) {
            @Override
            protected boolean isTokenChar(int chr) {
                return matcher.isTokenChar(chr);
            }
        };
    }
}
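Overriding isTokenChar is NGramTokenizer's extension point for restricting which characters may appear inside a gram: code points that fail the test split the input, so no gram spans them. A standalone sketch (not part of the factory above) that restricts grams to letters and digits:

Tokenizer alnumGrams = new NGramTokenizer(2, 3) {
    @Override
    protected boolean isTokenChar(int chr) {
        // Punctuation and whitespace act as boundaries; grams never cross them.
        return Character.isLetterOrDigit(chr);
    }
};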
Example #13
Source File: NGramAnalyzer.java From onedev with MIT License
@Override
protected TokenStreamComponents createComponents(String fieldName) {
    Tokenizer src = new NGramTokenizer(minGram, maxGram);
    TokenStream stream = new LowerCaseFilter(src);
    return new TokenStreamComponents(src, stream);
}
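Once built, this analyzer is consumed like any other Lucene Analyzer. A brief usage sketch; the NGramAnalyzer(minGram, maxGram) constructor shown here is an assumption about the surrounding class, not visible in the excerpt:

Analyzer analyzer = new NGramAnalyzer(2, 3);       // constructor args assumed: minGram, maxGram
try (TokenStream ts = analyzer.tokenStream("content", "FooBar")) {
    CharTermAttribute term = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
        System.out.println(term.toString());       // lower-cased grams: fo, foo, oo, oob, ...
    }
    ts.end();
}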
Example #14
Source File: EdgeNGramTokenizerFactory.java From crate with Apache License 2.0
EdgeNGramTokenizerFactory(IndexSettings indexSettings, Environment environment, String name, Settings settings) {
    super(indexSettings, name, settings);
    this.minGram = settings.getAsInt("min_gram", NGramTokenizer.DEFAULT_MIN_NGRAM_SIZE);
    this.maxGram = settings.getAsInt("max_gram", NGramTokenizer.DEFAULT_MAX_NGRAM_SIZE);
    this.matcher = parseTokenChars(settings.getAsList("token_chars"));
}