org.elasticsearch.index.analysis.TokenFilterFactory Java Examples
The following examples show how to use
org.elasticsearch.index.analysis.TokenFilterFactory.
The originating project, source file, and license are noted above each example.
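Before the examples, a minimal sketch of the contract itself may help: as the anonymous implementations in Examples #18, #20, #21 and #25 show, a factory exposes a name and wraps an incoming Lucene TokenStream. The class, filter name, and import locations below are illustrative assumptions, not taken from any of the projects; in the Elasticsearch versions the crate examples target, TokenFilterFactory is an interface, while older releases used different base types.

import org.apache.lucene.analysis.LowerCaseFilter;
import org.apache.lucene.analysis.TokenStream;
import org.elasticsearch.index.analysis.TokenFilterFactory;

// Hypothetical factory: name() identifies the filter in analysis settings,
// create() decorates the incoming stream with the actual Lucene filter.
public class MyLowercaseTokenFilterFactory implements TokenFilterFactory {

    @Override
    public String name() {
        return "my_lowercase"; // hypothetical filter name
    }

    @Override
    public TokenStream create(TokenStream tokenStream) {
        return new LowerCaseFilter(tokenStream);
    }
}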
Example #1
Source File: IcuAnalysisTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testDefaultsIcuAnalysis() throws IOException {
    TestAnalysis analysis = createTestAnalysis(new Index("test", "_na_"), Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    CharFilterFactory charFilterFactory = analysis.charFilter.get("icu_normalizer");
    assertThat(charFilterFactory, instanceOf(IcuNormalizerCharFilterFactory.class));
    TokenizerFactory tf = analysis.tokenizer.get("icu_tokenizer");
    assertThat(tf, instanceOf(IcuTokenizerFactory.class));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("icu_normalizer");
    assertThat(filterFactory, instanceOf(IcuNormalizerTokenFilterFactory.class));
    filterFactory = analysis.tokenFilter.get("icu_folding");
    assertThat(filterFactory, instanceOf(IcuFoldingTokenFilterFactory.class));
    filterFactory = analysis.tokenFilter.get("icu_transform");
    assertThat(filterFactory, instanceOf(IcuTransformTokenFilterFactory.class));
    Analyzer analyzer = analysis.indexAnalyzers.get("icu_collation");
    assertThat(analyzer, instanceOf(NamedAnalyzer.class));
}
Example #2
Source File: HyphenTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testFive() throws Exception {
    String source = "978-1-4493-5854-9";
    String[] expected = { "978-1-4493-5854-9" };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example #3
Source File: HyphenTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTwo() throws Exception {
    String source = "Das E-Book muss dringend zum Buchbinder.";
    String[] expected = { "Das", "E-Book", "EBook", "Book", "muss", "dringend", "zum", "Buchbinder" };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example #4
Source File: HyphenTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOne() throws Exception {
    String source = "Das ist ein Bindestrich-Wort.";
    String[] expected = { "Das", "ist", "ein", "Bindestrich-Wort", "BindestrichWort", "Wort", "Bindestrich" };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #5
Source File: BaseformTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testThree() throws Exception {
    String source = "wurde zum tollen gemacht";
    String[] expected = { "wurde", "werden", "zum", "zum", "tollen", "tollen", "gemacht", "machen" };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY, new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example #6
Source File: BaseformTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTwo() throws Exception {
    String source = "Das sind Autos, die Nudeln transportieren.";
    String[] expected = { "Das", "Das", "sind", "sind", "Autos", "Auto", "die", "der",
            "Nudeln", "Nudel", "transportieren", "transportieren" };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY, new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example #7
Source File: HyphenTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSix() throws Exception {
    String source = "E-Book";
    String[] expected = { "E-Book", "EBook", "Book" };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example #8
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformRules() throws Exception {
    String source = "abacadaba";
    String[] expected = new String[] { "bcbcbdbcb" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_rules").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_rules");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #9
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformNFD() throws Exception {
    String source = "Alphabētikós Katálogos";
    String[] expected = new String[] { "Alphabetikos", "Katalogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_nfd").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_nfd");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #10
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformAnyLatin() throws Exception {
    String source = "Αλφαβητικός Κατάλογος";
    String[] expected = new String[] { "Alphabētikós", "Katálogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_any_latin").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_any_latin");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #11
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformCyrillicLatinReverse() throws Exception {
    String source = "Rossijskaâ Federaciâ";
    String[] expected = new String[] { "Российская", "Федерация" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_cyr_reverse");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #12
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformCyrillicLatin() throws Exception {
    String source = "Российская Федерация";
    String[] expected = new String[] { "Rossijskaâ", "Federaciâ" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_cyr");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #13
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformKatakanaHiragana() throws Exception {
    String source = "ヒラガナ";
    String[] expected = new String[] { "ひらがな" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_katakana").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_katakana");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #14
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformHanLatin() throws Exception {
    String source = "中国";
    String[] expected = new String[] { "zhōng guó" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_han").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_han");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #15
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformTraditionalSimplified() throws Exception {
    String source = "簡化字";
    String[] expected = new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_ch");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
Example #16
Source File: AnalysisModule.java From crate with Apache License 2.0
public AnalysisModule(Environment environment, List<AnalysisPlugin> plugins) throws IOException {
    NamedRegistry<AnalysisProvider<CharFilterFactory>> charFilters = setupCharFilters(plugins);
    NamedRegistry<org.apache.lucene.analysis.hunspell.Dictionary> hunspellDictionaries = setupHunspellDictionaries(plugins);
    HunspellService hunspellService = new HunspellService(environment.settings(), environment,
            hunspellDictionaries.getRegistry());
    NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = setupTokenFilters(plugins, hunspellService);
    NamedRegistry<AnalysisProvider<TokenizerFactory>> tokenizers = setupTokenizers(plugins);
    NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> analyzers = setupAnalyzers(plugins);
    NamedRegistry<AnalysisProvider<AnalyzerProvider<?>>> normalizers = setupNormalizers();
    Map<String, PreConfiguredCharFilter> preConfiguredCharFilters = setupPreConfiguredCharFilters(plugins);
    Map<String, PreConfiguredTokenFilter> preConfiguredTokenFilters = setupPreConfiguredTokenFilters(plugins);
    Map<String, PreConfiguredTokenizer> preConfiguredTokenizers = setupPreConfiguredTokenizers(plugins);
    Map<String, PreBuiltAnalyzerProviderFactory> preConfiguredAnalyzers = setupPreBuiltAnalyzerProviderFactories(plugins);
    analysisRegistry = new AnalysisRegistry(environment,
            charFilters.getRegistry(), tokenFilters.getRegistry(), tokenizers.getRegistry(),
            analyzers.getRegistry(), normalizers.getRegistry(),
            preConfiguredCharFilters, preConfiguredTokenFilters, preConfiguredTokenizers,
            preConfiguredAnalyzers);
}
Example #17
Source File: SuggestUtils.java From Elasticsearch with Apache License 2.0
public static ShingleTokenFilterFactory.Factory getShingleFilterFactory(Analyzer analyzer) {
    if (analyzer instanceof NamedAnalyzer) {
        analyzer = ((NamedAnalyzer) analyzer).analyzer();
    }
    if (analyzer instanceof CustomAnalyzer) {
        final CustomAnalyzer a = (CustomAnalyzer) analyzer;
        final TokenFilterFactory[] tokenFilters = a.tokenFilters();
        for (TokenFilterFactory tokenFilterFactory : tokenFilters) {
            if (tokenFilterFactory instanceof ShingleTokenFilterFactory) {
                return ((ShingleTokenFilterFactory) tokenFilterFactory).getInnerFactory();
            } else if (tokenFilterFactory instanceof ShingleTokenFilterFactory.Factory) {
                return (ShingleTokenFilterFactory.Factory) tokenFilterFactory;
            }
        }
    }
    return null;
}
Example #18
Source File: CommonAnalysisPlugin.java From crate with Apache License 2.0
@Override
public List<PreConfiguredTokenizer> getPreConfiguredTokenizers() {
    List<PreConfiguredTokenizer> tokenizers = new ArrayList<>();
    tokenizers.add(PreConfiguredTokenizer.singleton("keyword", KeywordTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("classic", ClassicTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("uax_url_email", UAX29URLEmailTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("path_hierarchy", PathHierarchyTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("letter", LetterTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("whitespace", WhitespaceTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("ngram", NGramTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("edge_ngram",
            () -> new EdgeNGramTokenizer(EdgeNGramTokenizer.DEFAULT_MIN_GRAM_SIZE,
                    EdgeNGramTokenizer.DEFAULT_MAX_GRAM_SIZE), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("pattern",
            () -> new PatternTokenizer(Regex.compile("\\W+", null), -1), null));
    tokenizers.add(PreConfiguredTokenizer.singleton("thai", ThaiTokenizer::new, null));
    tokenizers.add(PreConfiguredTokenizer.singleton("lowercase", XLowerCaseTokenizer::new,
            () -> new TokenFilterFactory() {
                @Override
                public String name() {
                    return "lowercase";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new LowerCaseFilter(tokenStream);
                }
            }));

    // Temporary shim for aliases. TODO deprecate after they are moved
    tokenizers.add(PreConfiguredTokenizer.singleton("PathHierarchy", PathHierarchyTokenizer::new, null));

    return tokenizers;
}
Example #19
Source File: SynonymPlugin.java From elasticsearch-analysis-synonym with Apache License 2.0
@Override
public Map<String, AnalysisProvider<TokenFilterFactory>> getTokenFilters() {
    final Map<String, AnalysisProvider<TokenFilterFactory>> extra = new HashMap<>();
    extra.put("synonym_filter", new AnalysisProvider<TokenFilterFactory>() {
        @Override
        public TokenFilterFactory get(final IndexSettings indexSettings, final Environment environment,
                                      final String name, final Settings settings) throws IOException {
            return new SynonymTokenFilterFactory(indexSettings, environment, name, settings,
                    pluginComponent.getAnalysisRegistry());
        }

        @Override
        public boolean requiresAnalysisSettings() {
            return true;
        }
    });
    return extra;
}
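Because the provider above declares requiresAnalysisSettings(), the filter has to be configured per index before it can be used. A hypothetical wiring of the registered "synonym_filter" type into a custom analyzer might look as follows; the setting keys follow the standard Elasticsearch analysis layout, and the filter name my_synonyms and the synonym rule are made up (the plugin's actual synonym options may differ).

// Hypothetical index settings: expose the plugin's "synonym_filter" type
// under a custom name and chain it into an analyzer after lowercasing.
Settings indexSettings = Settings.builder()
        .put("index.analysis.filter.my_synonyms.type", "synonym_filter")
        .putList("index.analysis.filter.my_synonyms.synonyms", "es => elasticsearch")
        .put("index.analysis.analyzer.my_analyzer.tokenizer", "standard")
        .putList("index.analysis.analyzer.my_analyzer.filter", "lowercase", "my_synonyms")
        .build();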
Example #20
Source File: MultiplexerTokenFilterFactory.java From crate with Apache License 2.0
private TokenFilterFactory chainFilters(String name, List<TokenFilterFactory> filters) {
    return new TokenFilterFactory() {
        @Override
        public String name() {
            return name;
        }

        @Override
        public TokenStream create(TokenStream tokenStream) {
            for (TokenFilterFactory tff : filters) {
                tokenStream = tff.create(tokenStream);
            }
            return tokenStream;
        }
    };
}
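A note on ordering: each delegate wraps the stream returned by the one before it, so the filters apply in list order. The call below is illustrative only (chainFilters is private to the factory, and both delegate factories are made up):

// Token flow: tokenizer -> lowercaseFactory's filter -> stopFactory's filter
TokenFilterFactory chained = chainFilters("my_chain", List.of(lowercaseFactory, stopFactory));
TokenStream filtered = chained.create(tokenizer);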
Example #21
Source File: ASCIIFoldingTokenFilterFactory.java From crate with Apache License 2.0
@Override
public Object getMultiTermComponent() {
    if (preserveOriginal == false) {
        return this;
    } else {
        // See https://issues.apache.org/jira/browse/LUCENE-7536 for the reasoning
        return new TokenFilterFactory() {
            @Override
            public String name() {
                return ASCIIFoldingTokenFilterFactory.this.name();
            }

            @Override
            public TokenStream create(TokenStream tokenStream) {
                return new ASCIIFoldingFilter(tokenStream, false);
            }
        };
    }
}
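For context: multi-term queries (prefixes, wildcards) need analysis that yields a single token per position, so when preserve_original is set the factory hands back a variant that folds unconditionally instead of emitting both tokens. The call site below is an assumption about how such a component is consumed, not code from the project.

// Hypothetical caller: fetch the multi-term-safe variant and cast it back.
Object component = asciiFoldingFactory.getMultiTermComponent();
if (component instanceof TokenFilterFactory) {
    TokenStream safeStream = ((TokenFilterFactory) component).create(tokenStream);
}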
Example #22
Source File: BaseformTokenFilterTests.java From elasticsearch-analysis-baseform with Apache License 2.0
@Test
public void testThree() throws IOException {
    String source = "wurde zum tollen gemacht";
    String[] expected = { "wurde", "werden", "zum", "zum", "tollen", "tollen", "gemacht", "machen" };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
Example #23
Source File: SymbolnameTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSimple() throws Exception {
    String source = "Programmieren mit C++";
    String[] expected = { "Programmieren", "mit", "C++", "C __PLUSSIGN__ __PLUSSIGN__",
            "C", "__PLUSSIGN__", "__PLUSSIGN__" };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY, new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("symbolname");
    Tokenizer tokenizer = analysis.tokenizer.get("whitespace").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example #24
Source File: BaseformTokenFilterTests.java From elasticsearch-analysis-baseform with Apache License 2.0
@Test
public void testTwo() throws IOException {
    String source = "Das sind Autos, die Nudeln transportieren.";
    String[] expected = { "Das", "Das", "sind", "sind", "Autos", "Auto", "die", "der",
            "Nudeln", "Nudel", "transportieren", "transportieren" };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
Example #25
Source File: AnnotationIndicesAnalysis.java From elasticsearch-analysis-annotation with Apache License 2.0
@Inject
public AnnotationIndicesAnalysis(Settings settings, IndicesAnalysisService indicesAnalysisService) {
    super(settings);
    indicesAnalysisService.analyzerProviderFactories().put("default",
            new PreBuiltAnalyzerProviderFactory("default", AnalyzerScope.INDICES,
                    new AnnotationAnalyzer(Lucene.ANALYZER_VERSION)));
    indicesAnalysisService.tokenFilterFactories().put("annotation_filter",
            new PreBuiltTokenFilterFactoryFactory(new TokenFilterFactory() {
                @Override
                public String name() {
                    return "annotation_filter";
                }

                @Override
                public TokenStream create(TokenStream tokenStream) {
                    return new InlineAnnotationFilter(tokenStream);
                }
            }));
}
Example #26
Source File: HyphenTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testNine() throws Exception {
    String source = "Das ist ein Punkt. Und noch ein Punkt für U.S.A. Oder? Nicht doch.";
    String[] expected = { "Das", "ist", "ein", "Punkt", "Und", "noch", "ein", "Punkt", "für",
            "U.S.A", "Oder", "Nicht", "doch" };
    String resource = "hyphen_tokenizer_without_subwords.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_hyphen_tokenfilter");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
Example #27
Source File: AnalysisModule.java From crate with Apache License 2.0
private NamedRegistry<AnalysisProvider<TokenFilterFactory>> setupTokenFilters(List<AnalysisPlugin> plugins,
                                                                               HunspellService hunspellService) {
    NamedRegistry<AnalysisProvider<TokenFilterFactory>> tokenFilters = new NamedRegistry<>("token_filter");
    tokenFilters.register("stop", StopTokenFilterFactory::new);
    tokenFilters.register("standard", (indexSettings, environment, name, settings) -> {
        DEPRECATION_LOGGER.deprecatedAndMaybeLog("standard_deprecation",
                "The [standard] token filter name is deprecated and will be removed in a future version.");
        return new AbstractTokenFilterFactory(indexSettings, name, settings) {
            @Override
            public TokenStream create(TokenStream tokenStream) {
                return tokenStream;
            }
        };
    });
    tokenFilters.register("shingle", ShingleTokenFilterFactory::new);
    tokenFilters.register("hunspell", requiresAnalysisSettings(
            (indexSettings, env, name, settings) ->
                    new HunspellTokenFilterFactory(indexSettings, name, settings, hunspellService)));
    tokenFilters.extractAndRegister(plugins, AnalysisPlugin::getTokenFilters);
    return tokenFilters;
}
Example #28
Source File: ESTestCase.java From crate with Apache License 2.0
public TestAnalysis(IndexAnalyzers indexAnalyzers,
                    Map<String, TokenFilterFactory> tokenFilter,
                    Map<String, TokenizerFactory> tokenizer,
                    Map<String, CharFilterFactory> charFilter) {
    this.indexAnalyzers = indexAnalyzers;
    this.tokenFilter = tokenFilter;
    this.tokenizer = tokenizer;
    this.charFilter = charFilter;
}
Example #29
Source File: BaseformTokenFilterTests.java From elasticsearch-analysis-baseform with Apache License 2.0
@Test
public void testOne() throws IOException {
    String source = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";
    String[] expected = { "Die", "Die", "Jahresfeier", "Jahresfeier", "der", "der",
            "Rechtsanwaltskanzleien", "Rechtsanwaltskanzlei", "auf", "auf", "dem", "der",
            "Donaudampfschiff", "Donaudampfschiff", "hat", "haben", "viel", "viel",
            "Ökosteuer", "Ökosteuer", "gekostet", "kosten" };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
Example #30
Source File: HyphenTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSeven() throws Exception {
    String source = "Procter & Gamble ist Procter&Gamble. Schwarz - weiss ist schwarz-weiss";
    String[] expected = { "Procter", "Gamble", "ist", "Procter&Gamble", "Schwarz", "weiss",
            "ist", "schwarz-weiss", "schwarzweiss", "weiss", "schwarz" };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}