Java Code Examples for org.elasticsearch.test.ESTestCase#TestAnalysis

The following examples show how to use org.elasticsearch.test.ESTestCase#TestAnalysis. You can vote up the examples you find useful or vote down the ones you don't, and you can go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: LemmagenAnalysisTest.java    From elasticsearch-analysis-lemmagen with Apache License 2.0 6 votes vote down vote up
/**
 * Checks that every configured lemmagen filter variant (classpath lexicon,
 * lexicon with file extension, and filesystem path) is bound to
 * {@link LemmagenFilterFactory} and lemmatizes the same Czech sentence
 * to the same base forms.
 */
public void testLemmagenTokenFilter() throws IOException {
  ESTestCase.TestAnalysis analysis = createAnalysis();

  String input = "Děkuji, že jsi přišel.";
  String[] expectedLemmas = { "Děkovat", "že", "být", "přijít" };
  String[] filterNames = { "lemmagen_lexicon", "lemmagen_lexicon_with_ext", "lemmagen_lexicon_path" };

  for (String filterName : filterNames) {
    TokenFilterFactory factory = analysis.tokenFilter.get(filterName);
    assertThat(factory, instanceOf(LemmagenFilterFactory.class));

    // fresh tokenizer per filter; token streams are one-shot
    Tokenizer tokenizer = new UAX29URLEmailTokenizer();
    tokenizer.setReader(new StringReader(input));

    assertTokenStreamContents(factory.create(tokenizer), expectedLemmas);
  }
}
 
Example 2
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * ICU transform filter configured in icu_transform.json under the name
 * "my_icu_transformer_nfd": accented transliterated Greek input comes out
 * with the diacritics removed.
 */
public void testTransformNFD() throws Exception {
    String config = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_nfd").create();
    tokenizer.setReader(new StringReader("Alphabētikós Katálogos"));
    TokenStream stream = analysis.tokenFilter.get("my_icu_transformer_nfd").create(tokenizer);

    assertTokenStreamContents(stream, new String[] { "Alphabetikos", "Katalogos" });
}
 
Example 3
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * With alternate=shifted at primary strength, punctuation is ignorable:
 * "foo-bar" and "foo bar" must produce identical collation keys.
 */
public void testIgnorePunctuation() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .put("index.analysis.analyzer.myAnalyzer.alternate", "shifted")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");

    BytesRef hyphenated = bytesFromTokenStream(analyzer.tokenStream("content", "foo-bar"));
    BytesRef spaced = bytesFromTokenStream(analyzer.tokenStream("content", "foo bar"));
    assertTrue(compare(hyphenated.bytes, spaced.bytes) == 0);
}
 
Example 4
Source File: WordDelimiterFilter2Tests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * The "wd" word-delimiter filter splits "foo-bar" into parts plus the
 * catenated form, and each emitted token's start/end offsets must map back
 * into the original text: foo=[0,3], bar=[4,7], foobar=[0,7].
 */
public void testOffsets() throws Exception {
    String config = "worddelimiter.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Tokenizer keyword = analysis.tokenizer.get("keyword").create();
    keyword.setReader(new StringReader("foo-bar"));
    TokenStream stream = analysis.tokenFilter.get("wd").create(keyword);

    String[] terms = {"foo", "bar", "foobar"};
    int[] startOffsets = {0, 4, 0};
    int[] endOffsets = {3, 7, 7};
    assertTokenStreamContents(stream, terms, startOffsets, endOffsets,
            null, null, null, null, false);
}
 
Example 5
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * ICU transform "my_icu_transformer_ch": traditional Chinese input is
 * rewritten to simplified characters after tokenization.
 */
public void testTransformTraditionalSimplified() throws Exception {
    String traditional = "簡化字";
    String[] simplified = { "简化", "字" };
    String config = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(traditional));
    TokenStream stream = analysis.tokenFilter.get("my_icu_transformer_ch").create(tokenizer);
    assertTokenStreamContents(stream, simplified);
}
 
Example 6
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * ICU transform "my_icu_transformer_han": Han input "中国" is transliterated
 * to pinyin with tone marks as a single token.
 */
public void testTransformHanLatin() throws Exception {
    String han = "中国";
    String[] pinyin = { "zhōng guó" };
    String config = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_han").create();
    tokenizer.setReader(new StringReader(han));
    TokenStream stream = analysis.tokenFilter.get("my_icu_transformer_han").create(tokenizer);
    assertTokenStreamContents(stream, pinyin);
}
 
Example 7
Source File: UnstemmedGermanNormalizationTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * The "default" analyzer from unstemmed.json: stop words are dropped,
 * tokens are lowercased, and hyphenated compounds are indexed both as-is
 * and with the separator removed.
 */
public void testTwo() throws Exception {
    String text = "So wird's was: das Elasticsearch-Buch erscheint beim O'Reilly-Verlag.";
    String[] expectedTokens = {
            "wird's",
            "elasticsearch-buch",
            "elasticsearchbuch",
            "erscheint",
            "o'reilly-verlag",
            "o'reillyverlag"
    };
    String config = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("default");
    assertTokenStreamContents(analyzer.tokenStream(null, new StringReader(text)), expectedTokens);
}
 
Example 8
Source File: SymbolnameTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * The "symbolname" filter expands tokens containing symbols: "C++" is kept
 * verbatim and additionally emitted with each symbol spelled out as a
 * __PLUSSIGN__ placeholder, both joined and as separate tokens.
 */
public void testSimple() throws Exception {
        String text = "Programmieren mit C++";
        String[] expectedTokens = {
                "Programmieren",
                "mit",
                "C++",
                "C __PLUSSIGN__ __PLUSSIGN__",
                "C",
                "__PLUSSIGN__",
                "__PLUSSIGN__"
        };

        ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
                Settings.EMPTY, new BundlePlugin(Settings.EMPTY));

        Tokenizer whitespace = analysis.tokenizer.get("whitespace").create();
        whitespace.setReader(new StringReader(text));
        TokenFilterFactory symbolname = analysis.tokenFilter.get("symbolname");
        assertTokenStreamContents(symbolname.create(whitespace), expectedTokens);
    }
 
Example 9
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * Reverse Cyrillic-Latin ICU transform: Latin scientific transliteration
 * is mapped back to the Cyrillic original.
 */
public void testTransformCyrillicLatinReverse() throws Exception {
    String transliterated = "Rossijskaâ Federaciâ";
    String[] cyrillic = { "Российская", "Федерация" };
    String config = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(transliterated));
    TokenStream stream = analysis.tokenFilter.get("my_icu_transformer_cyr_reverse").create(tokenizer);
    assertTokenStreamContents(stream, cyrillic);
}
 
Example 10
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 6 votes vote down vote up
/**
 * With numeric collation enabled, digit runs compare by numeric value,
 * so "foobar-9" must sort before "foobar-10" (plain lexicographic order
 * would put "foobar-10" first).
 */
public void testNumerics() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.numeric", true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    TokenStream tsNine = analyzer.tokenStream(null, "foobar-9");
    BytesRef b1 = bytesFromTokenStream(tsNine);
    TokenStream tsTen = analyzer.tokenStream(null, "foobar-10");
    BytesRef b2 = bytesFromTokenStream(tsTen);
    // Comparators only guarantee the sign of the result, not the exact value;
    // asserting "== -1" over-specifies the contract. Sibling tests in this
    // class (e.g. testIgnoreAccentsButNotCase) already assert "< 0".
    assertTrue(compare(b1.bytes, b2.bytes) < 0);
}
 
Example 11
Source File: IcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * The hyphen-aware ICU tokenizer keeps "3-428-84350-9" (an ISBN) intact
 * as one token instead of splitting at the hyphens.
 */
public void testIdentifierNonBreak() throws Exception {
    String config = "icu_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_icu_tokenizer").create();
    tokenizer.setReader(new StringReader("ISBN 3-428-84350-9"));
    assertTokenStreamContents(tokenizer, new String[] { "ISBN", "3-428-84350-9" });
}
 
Example 12
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Builds a German collator tailored with DIN 5007-2 rules (umlauts sort as
 * their two-letter expansions: ä=ae, ö=oe, ü=ue), passes the full rule
 * string to the icu_collation analyzer, and verifies "Töne" and "Toene"
 * produce identical collation keys at primary strength.
 */
public void testCustomRules() throws Exception {
    RuleBasedCollator baseCollator = (RuleBasedCollator) Collator.getInstance(new ULocale("de_DE"));
    // \u0308 is the combining diaeresis, so "a\u0308" is ä, etc.
    // NOTE(review): the final tailoring reads "& UE , u\u0308" (lowercase u) —
    // presumably "U\u0308" was intended; harmless here because the test only
    // compares at primary strength, where case is ignored. TODO confirm.
    String DIN5007_2_tailorings =
            "& ae , a\u0308 & AE , A\u0308& oe , o\u0308 & OE , O\u0308& ue , u\u0308 & UE , u\u0308";

    // Append the tailorings to the base German rules and extract the full
    // rule text to feed into the analyzer settings.
    RuleBasedCollator tailoredCollator = new RuleBasedCollator(baseCollator.getRules() + DIN5007_2_tailorings);
    String tailoredRules = tailoredCollator.getRules();

    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.rules", tailoredRules)
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");

    String germanUmlaut = "Töne";
    TokenStream tsUmlaut = analyzer.tokenStream(null, germanUmlaut);
    BytesRef b1 = bytesFromTokenStream(tsUmlaut);

    String germanExpandedUmlaut = "Toene";
    TokenStream tsExpanded = analyzer.tokenStream(null, germanExpandedUmlaut);
    BytesRef b2 = bytesFromTokenStream(tsExpanded);

    // umlaut form and expanded form must collate identically
    assertTrue(compare(b1.bytes, b2.bytes) == 0);
}
 
Example 13
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * caseLevel=true at primary strength: accents are ignored (résumé == resume,
 * Résumé == Resume) but case still distinguishes (resume sorts before Resume).
 */
public void testIgnoreAccentsButNotCase() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .put("index.analysis.analyzer.myAnalyzer.caseLevel", "true")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");

    // accented and unaccented lower case collate equal
    BytesRef accentedLower = bytesFromTokenStream(analyzer.tokenStream(null, "résumé"));
    BytesRef plainLower = bytesFromTokenStream(analyzer.tokenStream(null, "resume"));
    assertTrue(compare(accentedLower.bytes, plainLower.bytes) == 0);

    // accented and unaccented upper case collate equal
    BytesRef accentedUpper = bytesFromTokenStream(analyzer.tokenStream(null, "Résumé"));
    BytesRef plainUpper = bytesFromTokenStream(analyzer.tokenStream(null, "Resume"));
    assertTrue(compare(accentedUpper.bytes, plainUpper.bytes) == 0);

    // but case still matters: resume < Resume
    BytesRef lower = bytesFromTokenStream(analyzer.tokenStream(null, "resume"));
    BytesRef upper = bytesFromTokenStream(analyzer.tokenStream(null, "Resume"));
    assertTrue(compare(lower.bytes, upper.bytes) < 0);
}
 
Example 14
Source File: IcuNormalizeCharTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Analyzer "my_icu_analyzer_with_exceptions" lowercases but leaves "ö"
 * intact — presumably listed as a folding exception in icu_normalize.json;
 * see that resource for the exact configuration.
 */
public void testFoldingAnalyzerWithExceptions() throws Exception {
    String config = "icu_normalize.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Analyzer analyzer = analysis.indexAnalyzers.get("my_icu_analyzer_with_exceptions");
    TokenStream stream = analyzer.tokenStream("test", "Jörg Prante");
    assertTokenStreamContents(stream, new String[] { "jörg", "prante" });
}
 
Example 15
Source File: GermanNormalizationTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * The "umlaut" filter applies German normalization: ö→o, ß→ss, while
 * non-German accents such as the é in "Café" are left untouched.
 */
public void testGerman1() throws IOException {
        String text = "Ein schöner Tag in Köln im Café an der Straßenecke";
        String[] normalized = {
            "Ein",
            "schoner",
            "Tag",
            "in",
            "Koln",
            "im",
            "Café",
            "an",
            "der",
            "Strassenecke"
        };

        String config = "german_normalization_analysis.json";
        Settings settings = Settings.builder()
                .put(IndexMetaData.SETTING_VERSION_CREATED, Version.CURRENT)
                .put("path.home", System.getProperty("path.home"))
                .loadFromStream(config, getClass().getResourceAsStream(config), true)
                .build();
        ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
                settings, new BundlePlugin(Settings.EMPTY));

        Tokenizer standard = analysis.tokenizer.get("standard").create();
        standard.setReader(new StringReader(text));
        TokenFilterFactory umlaut = analysis.tokenFilter.get("umlaut");
        assertTokenStreamContents(umlaut.create(standard), normalized);
    }
 
Example 16
Source File: BaseformTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * The "baseform" filter emits each surface token followed by its lemma at
 * the same position (e.g. "hat"/"haben", "gekostet"/"kosten"); tokens whose
 * base form equals the surface form appear twice.
 */
public void testOne() throws Exception {
        String text = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";
        // pairs of (surface token, base form)
        String[] tokensWithBaseforms = {
            "Die",
            "Die",
            "Jahresfeier",
            "Jahresfeier",
            "der",
            "der",
            "Rechtsanwaltskanzleien",
            "Rechtsanwaltskanzlei",
            "auf",
            "auf",
            "dem",
            "der",
            "Donaudampfschiff",
            "Donaudampfschiff",
            "hat",
            "haben",
            "viel",
            "viel",
            "Ökosteuer",
            "Ökosteuer",
            "gekostet",
            "kosten"
        };

        ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
                Settings.EMPTY, new BundlePlugin(Settings.EMPTY));

        Tokenizer standard = analysis.tokenizer.get("standard").create();
        standard.setReader(new StringReader(text));
        TokenFilterFactory baseform = analysis.tokenFilter.get("baseform");
        assertTokenStreamContents(baseform.create(standard), tokensWithBaseforms);
    }
 
Example 17
Source File: UnstemmedGermanNormalizationTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * The "unstemmed" analyzer normalizes German spellings: "Schroeder"
 * (oe spelling) is reduced to "schroder".
 */
public void testFive() throws Exception {
    String config = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());

    Analyzer analyzer = analysis.indexAnalyzers.get("unstemmed");
    assertTokenStreamContents(analyzer.tokenStream("test", new StringReader("Schroeder")),
            new String[] { "schroder" });
}
 
Example 18
Source File: FstDecompoundTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Custom analyzer chaining standard tokenizer → fst_decompound → unique:
 * each German compound is kept and followed by its lowercase decompounded
 * parts (e.g. "Donaudampfschiff" → donau, dampf, schiff); "unique" removes
 * duplicate emissions.
 */
public void testDecompound() throws Exception {
        String text = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";
        String[] decompounded = {
                "Die",
                "Jahresfeier",
                "jahres",
                "feier",
                "der",
                "Rechtsanwaltskanzleien",
                "rechts",
                "anwalts",
                "kanzleien",
                "auf",
                "dem",
                "Donaudampfschiff",
                "donau",
                "dampf",
                "schiff",
                "hat",
                "viel",
                "Ökosteuer",
                "ökos",
                "teuer",
                "gekostet"
        };

        Settings settings = Settings.builder()
                .put("index.analysis.analyzer.myanalyzer.type", "custom")
                .put("index.analysis.analyzer.myanalyzer.tokenizer", "standard")
                .put("index.analysis.analyzer.myanalyzer.filter.0", "fst_decompound")
                .put("index.analysis.analyzer.myanalyzer.filter.1", "unique")
                .build();
        ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
                settings, new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());

        assertAnalyzesTo(analysis.indexAnalyzers.get("myanalyzer"), text, decompounded);
    }
 
Example 19
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Loads the "icu_german_collate" analyzer from icu_collation.json and checks
 * the relative sort order of German names at primary strength by grouping
 * words under their collation keys and iterating in key order.
 */
public void testPrimaryStrengthFromJson() throws Exception {
    String resource = "icu_collation.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));

    Analyzer analyzer = analysis.indexAnalyzers.get("icu_german_collate");

    String[] words = new String[]{
            "Göbel",
            "Goethe",
            "Goldmann",
            "Göthe",
            "Götz"
    };
    // TreeMultiMap groups words that share a collation key; iteration is in
    // ascending key order. "Goethe" and "Göthe" land in the same bucket, so
    // the configured collation treats oe and ö as equivalent at this strength.
    MultiMap<BytesRef,String> bytesRefMap = new TreeMultiMap<>();
    for (String s : words) {
        TokenStream ts = analyzer.tokenStream(null, s);
        bytesRefMap.put(bytesFromTokenStream(ts), s);
    }
    // Expected bucket order: Göbel < {Goethe, Göthe} < Götz < Goldmann.
    // NOTE(review): relies on Set.toString() producing "[a, b]" — presumably
    // the value sets are ordered; verify against TreeMultiMap's contract.
    Iterator<Set<String>> it = bytesRefMap.values().iterator();
    assertEquals("[Göbel]",it.next().toString());
    assertEquals("[Goethe, Göthe]",it.next().toString());
    assertEquals("[Götz]",it.next().toString());
    assertEquals("[Goldmann]",it.next().toString());
}
 
Example 20
Source File: IcuFoldingFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 5 votes vote down vote up
/**
 * Analyzer "my_icu_analyzer_with_exceptions" from icu_folding.json
 * lowercases input but keeps "ö" unfolded — presumably excluded from
 * folding in that configuration; see the resource for details.
 */
public void testFoldingAnalyzerWithExceptions() throws Exception {
    String config = "icu_folding.json";
    Settings settings = Settings.builder()
            .loadFromStream(config, getClass().getResourceAsStream(config), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings, new BundlePlugin(Settings.EMPTY));

    Analyzer analyzer = analysis.indexAnalyzers.get("my_icu_analyzer_with_exceptions");
    TokenStream stream = analyzer.tokenStream("test", "Jörg Prante");
    assertTokenStreamContents(stream, new String[] { "jörg", "prante" });
}