Java Code Examples for org.elasticsearch.test.ESTestCase#TestAnalysis
The following examples show how to use
org.elasticsearch.test.ESTestCase#TestAnalysis.
You can vote up the examples you like or vote down the ones you don't like,
and you can go to the original project or source file by following the links above each example. You may also check out the related API usage on the sidebar.
Example 1
Source File: LemmagenAnalysisTest.java From elasticsearch-analysis-lemmagen with Apache License 2.0 | 6 votes |
/**
 * Verifies that every configured lemmagen filter variant lemmatizes a Czech
 * sentence: each factory must be a {@code LemmagenFilterFactory} and must map
 * the inflected tokens to their base forms.
 */
public void testLemmagenTokenFilter() throws IOException {
    ESTestCase.TestAnalysis analysis = createAnalysis();
    String input = "Děkuji, že jsi přišel.";
    String[] lemmas = { "Děkovat", "že", "být", "přijít" };
    String[] filterNames = { "lemmagen_lexicon", "lemmagen_lexicon_with_ext", "lemmagen_lexicon_path" };
    for (String name : filterNames) {
        TokenFilterFactory factory = analysis.tokenFilter.get(name);
        assertThat(factory, instanceOf(LemmagenFilterFactory.class));
        // Fresh tokenizer per filter: a Lucene Tokenizer is single-use per reader.
        Tokenizer tokenizer = new UAX29URLEmailTokenizer();
        tokenizer.setReader(new StringReader(input));
        assertTokenStreamContents(factory.create(tokenizer), lemmas);
    }
}
Example 2
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Checks the NFD ICU transform filter: accented Greek-transliterated words
 * are decomposed and stripped of their diacritics.
 */
public void testTransformNFD() throws Exception {
    String text = "Alphabētikós Katálogos";
    String[] folded = new String[] { "Alphabetikos", "Katalogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_nfd").create();
    tokenizer.setReader(new StringReader(text));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("my_icu_transformer_nfd");
    assertTokenStreamContents(filterFactory.create(tokenizer), folded);
}
Example 3
Source File: IcuCollationAnalyzerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * With {@code alternate=shifted}, the ICU collation analyzer treats
 * punctuation as ignorable: "foo-bar" and "foo bar" must yield the same
 * collation key bytes.
 */
public void testIgnorePunctuation() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .put("index.analysis.analyzer.myAnalyzer.alternate", "shifted")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    TokenStream punctuated = analyzer.tokenStream("content", "foo-bar");
    BytesRef keyWithPunctuation = bytesFromTokenStream(punctuated);
    TokenStream plain = analyzer.tokenStream("content", "foo bar");
    BytesRef keyWithoutPunctuation = bytesFromTokenStream(plain);
    assertTrue(compare(keyWithPunctuation.bytes, keyWithoutPunctuation.bytes) == 0);
}
Example 4
Source File: WordDelimiterFilter2Tests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Verifies the word-delimiter filter emits correct start/end offsets for the
 * split parts of "foo-bar" as well as for the concatenated token "foobar".
 */
public void testOffsets() throws Exception {
    String resource = "worddelimiter.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer keywordTokenizer = analysis.tokenizer.get("keyword").create();
    keywordTokenizer.setReader(new StringReader("foo-bar"));
    TokenStream stream = analysis.tokenFilter.get("wd").create(keywordTokenizer);
    // Expected: "foo"[0,3], "bar"[4,7], and the catenated "foobar"[0,7].
    assertTokenStreamContents(stream,
            new String[]{"foo", "bar", "foobar"},
            new int[]{0, 4, 0},
            new int[]{3, 7, 7},
            null, null, null, null, false);
}
Example 5
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Checks the Traditional-to-Simplified Chinese ICU transform: traditional
 * characters are converted to their simplified equivalents after tokenizing.
 */
public void testTransformTraditionalSimplified() throws Exception {
    String text = "簡化字";
    String[] simplified = new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(text));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("my_icu_transformer_ch");
    assertTokenStreamContents(filterFactory.create(tokenizer), simplified);
}
Example 6
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Checks the Han-to-Latin ICU transform: Chinese characters are transliterated
 * into pinyin with tone marks.
 */
public void testTransformHanLatin() throws Exception {
    String text = "中国";
    String[] pinyin = new String[] { "zhōng guó" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_han").create();
    tokenizer.setReader(new StringReader(text));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("my_icu_transformer_han");
    assertTokenStreamContents(filterFactory.create(tokenizer), pinyin);
}
Example 7
Source File: UnstemmedGermanNormalizationTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Runs the "default" unstemmed German analyzer over a sentence containing
 * hyphenated compounds and apostrophes; hyphenated terms are expected both
 * with and without the hyphen, stopword-like tokens dropped.
 */
public void testTwo() throws Exception {
    String text = "So wird's was: das Elasticsearch-Buch erscheint beim O'Reilly-Verlag.";
    String[] expectedTokens = {
            "wird's", "elasticsearch-buch", "elasticsearchbuch",
            "erscheint", "o'reilly-verlag", "o'reillyverlag"
    };
    String resource = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("default");
    assertTokenStreamContents(analyzer.tokenStream(null, new StringReader(text)), expectedTokens);
}
Example 8
Source File: SymbolnameTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Verifies the symbolname token filter expands "C++" into symbolic-name
 * variants (placeholder tokens for the plus signs) alongside the original.
 */
public void testSimple() throws Exception {
    String text = "Programmieren mit C++";
    String[] expectedTokens = {
            "Programmieren", "mit", "C++",
            "C __PLUSSIGN__ __PLUSSIGN__",
            "C", "__PLUSSIGN__", "__PLUSSIGN__"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), Settings.EMPTY, new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("symbolname");
    Tokenizer whitespaceTokenizer = analysis.tokenizer.get("whitespace").create();
    whitespaceTokenizer.setReader(new StringReader(text));
    assertTokenStreamContents(filterFactory.create(whitespaceTokenizer), expectedTokens);
}
Example 9
Source File: IcuTransformFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * Checks the reversed Cyrillic-Latin ICU transform: Latin transliteration is
 * converted back into Cyrillic.
 */
public void testTransformCyrillicLatinReverse() throws Exception {
    String text = "Rossijskaâ Federaciâ";
    String[] cyrillic = new String[] { "Российская", "Федерация"};
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(text));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("my_icu_transformer_cyr_reverse");
    assertTokenStreamContents(filterFactory.create(tokenizer), cyrillic);
}
Example 10
Source File: IcuCollationAnalyzerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 6 votes |
/**
 * With {@code numeric=true}, the ICU collation analyzer sorts embedded
 * numbers by numeric value, so "foobar-9" collates strictly before
 * "foobar-10" (lexicographic order would reverse them).
 */
public void testNumerics() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.numeric", true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    TokenStream nineStream = analyzer.tokenStream(null, "foobar-9");
    BytesRef nineKey = bytesFromTokenStream(nineStream);
    TokenStream tenStream = analyzer.tokenStream(null, "foobar-10");
    BytesRef tenKey = bytesFromTokenStream(tenStream);
    assertTrue(compare(nineKey.bytes, tenKey.bytes) == -1);
}
Example 11
Source File: IcuTokenizerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Verifies the hyphen-aware ICU tokenizer keeps a hyphenated identifier
 * (an ISBN) as a single token instead of splitting on the hyphens.
 */
public void testIdentifierNonBreak() throws Exception {
    String text = "ISBN 3-428-84350-9";
    String[] expectedTokens = {"ISBN", "3-428-84350-9"};
    String resource = "icu_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(text));
    assertTokenStreamContents(tokenizer, expectedTokens);
}
Example 12
Source File: IcuCollationAnalyzerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Builds a custom collator from the German base rules plus DIN 5007-2
 * tailorings (umlauts collate like their two-letter expansions), then checks
 * that "Töne" and "Toene" produce identical primary-strength sort keys.
 */
public void testCustomRules() throws Exception {
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
    // DIN 5007-2: treat ä/ö/ü as ae/oe/ue for sorting.
    String DIN5007_2_tailorings = "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308";
    RuleBasedCollator tailoredCollator =
            new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.rules", tailoredRules)
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    String germanUmlaut = "Töne";
    BytesRef umlautKey = bytesFromTokenStream(analyzer.tokenStream(null, germanUmlaut));
    String germanExpandedUmlaut = "Toene";
    BytesRef expandedKey = bytesFromTokenStream(analyzer.tokenStream(null, germanExpandedUmlaut));
    assertTrue(compare(umlautKey.bytes, expandedKey.bytes) == 0);
}
Example 13
Source File: IcuCollationAnalyzerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
public void testIgnoreAccentsButNotCase() throws Exception { Settings settings = Settings.builder() .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation") .put("index.analysis.analyzer.myAnalyzer.language", "en") .put("index.analysis.analyzer.myAnalyzer.strength", "primary") .put("index.analysis.analyzer.myAnalyzer.caseLevel", "true") .build(); ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY)); Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer"); String withAccents = "résumé"; String withoutAccents = "resume"; String withAccentsUpperCase = "Résumé"; String withoutAccentsUpperCase = "Resume"; TokenStream tsWithAccents = analyzer.tokenStream(null, withAccents); BytesRef b1 = bytesFromTokenStream(tsWithAccents); TokenStream tsWithoutAccents = analyzer.tokenStream(null, withoutAccents); BytesRef b2 = bytesFromTokenStream(tsWithoutAccents); assertTrue(compare(b1.bytes, b2.bytes) == 0); TokenStream tsWithAccentsUpperCase = analyzer.tokenStream(null, withAccentsUpperCase); BytesRef b3 = bytesFromTokenStream(tsWithAccentsUpperCase); TokenStream tsWithoutAccentsUpperCase = analyzer.tokenStream(null, withoutAccentsUpperCase); BytesRef b4 = bytesFromTokenStream(tsWithoutAccentsUpperCase); assertTrue(compare(b3.bytes, b4.bytes) == 0); // now check that case still matters: resume < Resume TokenStream tsLower = analyzer.tokenStream(null, withoutAccents); BytesRef b5 = bytesFromTokenStream(tsLower); TokenStream tsUpper = analyzer.tokenStream(null, withoutAccentsUpperCase); BytesRef b6 = bytesFromTokenStream(tsUpper); assertTrue(compare(b5.bytes, b6.bytes) < 0); }
Example 14
Source File: IcuNormalizeCharTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Verifies the ICU normalize char analyzer with an exception set: tokens are
 * lowercased but the configured exception characters (ö) are not folded.
 */
public void testFoldingAnalyzerWithExceptions() throws Exception {
    String resource = "icu_normalize.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("my_icu_analyzer_with_exceptions");
    TokenStream stream = analyzer.tokenStream("test", "Jörg Prante");
    String[] expectedTokens = { "jörg", "prante" };
    assertTokenStreamContents(stream, expectedTokens);
}
Example 15
Source File: GermanNormalizationTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Verifies the "umlaut" German normalization filter: umlauts lose their
 * diacritics and ß expands to "ss", while unrelated accents (é) are kept.
 */
public void testGerman1() throws IOException {
    String text = "Ein schöner Tag in Köln im Café an der Straßenecke";
    String[] normalized = {
            "Ein", "schoner", "Tag", "in", "Koln", "im",
            "Café", "an", "der", "Strassenecke"
    };
    String resource = "german_normalization_analysis.json";
    Settings settings = Settings.builder()
            .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
            .put("path.home", System.getProperty("path.home"))
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("umlaut");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(text));
    assertTokenStreamContents(filterFactory.create(tokenizer), normalized);
}
Example 16
Source File: BaseformTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Verifies the baseform filter emits each token twice — the surface form
 * followed by its lemma (e.g. "gekostet" then "kosten") — for a German
 * sentence tokenized with the standard tokenizer.
 */
public void testOne() throws Exception {
    String text = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";
    String[] expectedTokens = {
            "Die", "Die",
            "Jahresfeier", "Jahresfeier",
            "der", "der",
            "Rechtsanwaltskanzleien", "Rechtsanwaltskanzlei",
            "auf", "auf",
            "dem", "der",
            "Donaudampfschiff", "Donaudampfschiff",
            "hat", "haben",
            "viel", "viel",
            "Ökosteuer", "Ökosteuer",
            "gekostet", "kosten"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), Settings.EMPTY, new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(text));
    assertTokenStreamContents(filterFactory.create(tokenizer), expectedTokens);
}
Example 17
Source File: UnstemmedGermanNormalizationTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Verifies the "unstemmed" analyzer normalizes the German name variant
 * "Schroeder" to "schroder" (lowercased, oe collapsed).
 */
public void testFive() throws Exception {
    String text = "Schroeder";
    String[] expectedTokens = { "schroder" };
    String resource = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("unstemmed");
    assertTokenStreamContents(analyzer.tokenStream("test", new StringReader(text)), expectedTokens);
}
Example 18
Source File: FstDecompoundTokenFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Verifies FST-based decompounding in a custom analyzer chain
 * (standard tokenizer → fst_decompound → unique): each German compound is
 * kept and followed by its lowercase constituent parts.
 */
public void testDecompound() throws Exception {
    String text = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";
    String[] expectedTokens = {
            "Die",
            "Jahresfeier", "jahres", "feier",
            "der",
            "Rechtsanwaltskanzleien", "rechts", "anwalts", "kanzleien",
            "auf",
            "dem",
            "Donaudampfschiff", "donau", "dampf", "schiff",
            "hat",
            "viel",
            "Ökosteuer", "ökos", "teuer",
            "gekostet"
    };
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myanalyzer.type", "custom")
            .put("index.analysis.analyzer.myanalyzer.tokenizer", "standard")
            .put("index.analysis.analyzer.myanalyzer.filter.0", "fst_decompound")
            .put("index.analysis.analyzer.myanalyzer.filter.1", "unique")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer myanalyzer = analysis.indexAnalyzers.get("myanalyzer");
    assertAnalyzesTo(myanalyzer, text, expectedTokens);
}
Example 19
Source File: IcuCollationAnalyzerTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Loads a German collation analyzer from JSON config and checks the relative
 * order of names by their collation keys: Göbel &lt; {Goethe, Göthe} &lt;
 * Götz &lt; Goldmann. Names with equal primary keys group together.
 */
public void testPrimaryStrengthFromJson() throws Exception {
    String resource = "icu_collation.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("icu_german_collate");
    String[] names = new String[]{ "Göbel", "Goethe", "Goldmann", "Göthe", "Götz" };
    // Sorted map keyed by collation bytes; names with equal keys share a bucket.
    MultiMap<BytesRef,String> keyedNames = new TreeMultiMap<>();
    for (String name : names) {
        TokenStream stream = analyzer.tokenStream(null, name);
        keyedNames.put(bytesFromTokenStream(stream), name);
    }
    Iterator<Set<String>> buckets = keyedNames.values().iterator();
    assertEquals("[Göbel]", buckets.next().toString());
    assertEquals("[Goethe, Göthe]", buckets.next().toString());
    assertEquals("[Götz]", buckets.next().toString());
    assertEquals("[Goldmann]", buckets.next().toString());
}
Example 20
Source File: IcuFoldingFilterTests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Verifies the ICU folding analyzer with an exception set: tokens are
 * lowercased but the configured exception characters (ö) escape folding.
 */
public void testFoldingAnalyzerWithExceptions() throws Exception {
    String resource = "icu_folding.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
            new Index("test", "_na_"), settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("my_icu_analyzer_with_exceptions");
    TokenStream stream = analyzer.tokenStream("test", "Jörg Prante");
    String[] expectedTokens = { "jörg", "prante" };
    assertTokenStreamContents(stream, expectedTokens);
}