Java Code Examples for org.elasticsearch.test.ESTestCase#createTestAnalysis()

The following examples show how to use org.elasticsearch.test.ESTestCase#createTestAnalysis(). All of them are taken from the test suite of the elasticsearch-plugin-bundle project.
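Each example follows the same basic pattern: build index Settings (either programmatically or by loading a JSON resource from the classpath), call ESTestCase.createTestAnalysis() with a throwaway Index and the analysis plugin(s) under test, and then look up tokenizers, token filters, char filters, or analyzers by name on the returned TestAnalysis. A minimal sketch of that shared pattern, distilled from the examples below (the names my_analysis.json, my_analyzer, my_tokenizer, and my_filter are placeholders, not names defined by the plugin):

// Sketch of the shared pattern; component names are placeholders.
Settings settings = Settings.builder()
        .loadFromStream("my_analysis.json", getClass().getResourceAsStream("my_analysis.json"), true)
        .build();
ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(
        new Index("test", "_na_"),              // dummy index; nothing is created on disk
        settings,
        new BundlePlugin(Settings.EMPTY));      // one or more AnalysisPlugin instances
Analyzer analyzer = analysis.indexAnalyzers.get("my_analyzer");            // named analyzers
Tokenizer tokenizer = analysis.tokenizer.get("my_tokenizer").create();     // tokenizer factories
TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_filter");    // token filter factories
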
Example 1
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSix() throws Exception {
    String source = "E-Book";
    String[] expected = {
            "E-Book",
            "EBook",
            "Book"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 2
Source File: IcuFoldingFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testFoldingCharFilter() throws Exception {
    String source = "Jörg Prante";
    String resource = "icu_folding.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Reader charFilter = analysis.charFilter.get("my_icu_folder").create(new StringReader(source));
    StringBuilder sb = new StringBuilder();
    int ch;
    while ((ch = charFilter.read()) != -1) {
        sb.append((char)ch);
    }
    assertEquals("jorg prante", sb.toString());
}
 
Example 3
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testNumerics() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.numeric", true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    TokenStream tsNine = analyzer.tokenStream(null, "foobar-9");
    BytesRef b1 = bytesFromTokenStream(tsNine);
    TokenStream tsTen = analyzer.tokenStream(null, "foobar-10");
    BytesRef b2 = bytesFromTokenStream(tsTen);
    assertTrue(compare(b1.bytes, b2.bytes) == -1);
}
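
The collation examples compare analyzers by the raw collation key bytes they emit. The helpers bytesFromTokenStream() and compare() appear to be defined in IcuCollationAnalyzerTests itself rather than in ESTestCase; a rough sketch of what they might look like, inferred from how the assertions use them (an assumption, not the project's exact code):

// Hypothetical helpers approximating what the collation tests rely on.
// Uses org.apache.lucene.analysis.tokenattributes.TermToBytesRefAttribute.
private BytesRef bytesFromTokenStream(TokenStream stream) throws IOException {
    TermToBytesRefAttribute termAttr = stream.addAttribute(TermToBytesRefAttribute.class);
    stream.reset();
    BytesRef result = null;
    while (stream.incrementToken()) {
        result = BytesRef.deepCopyOf(termAttr.getBytesRef());  // keep the (single) token's bytes
    }
    stream.end();
    stream.close();
    return result;
}

private int compare(byte[] left, byte[] right) {
    // lexicographic, unsigned byte-by-byte comparison returning -1, 0, or 1
    for (int i = 0; i < Math.min(left.length, right.length); i++) {
        int diff = (left[i] & 0xff) - (right[i] & 0xff);
        if (diff != 0) {
            return diff < 0 ? -1 : 1;
        }
    }
    return Integer.compare(left.length, right.length);
}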
 
Example 4
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testBasicUsage() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "tr")
            .put("index.analysis.analyzer.myAnalyzer.strength", "primary")
            .put("index.analysis.analyzer.myAnalyzer.decomposition", "canonical")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    TokenStream tsUpper = analyzer.tokenStream(null, "I WİLL USE TURKİSH CASING");
    BytesRef b1 = bytesFromTokenStream(tsUpper);
    TokenStream tsLower = analyzer.tokenStream(null, "ı will use turkish casıng");
    BytesRef b2 = bytesFromTokenStream(tsLower);
    assertTrue(compare(b1.bytes, b2.bytes) == 0);
}
 
Example 5
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformNFD() throws Exception {
    String source = "Alphabētikós Katálogos";
    String[] expected = new String[] { "Alphabetikos", "Katalogos" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_nfd").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_nfd");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 6
Source File: UnstemmedGermanNormalizationTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSix() throws Exception {
    String source = "Programmieren in C++ für Einsteiger";
    String[] expected = {
            "programmieren",
            "programmi",
            "c++",
            "einsteiger",
            "einsteig"
    };
    String resource = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("default");
    assertTokenStreamContents(analyzer.tokenStream(null, new StringReader(source)), expected);
}
 
Example 7
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformCyrillicLatin() throws Exception {
    String source = "Российская Федерация";
    String[] expected = new String[] { "Rossijskaâ", "Federaciâ" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_cyr").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_cyr");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 8
Source File: UnstemmedGermanNormalizationTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testFour() throws Exception {
    String source = "Prante, Jörg";
    String[] expected = {
            "prante",
            "jorg"
    };
    String resource = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("unstemmed");
    assertTokenStreamContents(analyzer.tokenStream("test", new StringReader(source)), expected);
}
 
Example 9
Source File: UnstemmedGermanNormalizationTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testThree() throws Exception {
    String source = "978-1-4493-5854-9";
    String[] expected = {
         "978-1-4493-5854-9",
         "9781449358549"
    };
    String resource = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("default");
    assertTokenStreamContents(analyzer.tokenStream(null, new StringReader(source)), expected);
}
 
Example 10
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testUpperCaseFirst() throws Exception {
    Settings settings = Settings.builder()
            .put("index.analysis.analyzer.myAnalyzer.type", "icu_collation")
            .put("index.analysis.analyzer.myAnalyzer.language", "en")
            .put("index.analysis.analyzer.myAnalyzer.strength", "tertiary")
            .put("index.analysis.analyzer.myAnalyzer.caseFirst", "upper")
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("myAnalyzer");
    String lower = "resume";
    String upper = "Resume";
    TokenStream tsLower = analyzer.tokenStream(null, lower);
    BytesRef b1 = bytesFromTokenStream(tsLower);
    TokenStream tsUpper = analyzer.tokenStream(null, upper);
    BytesRef b2 = bytesFromTokenStream(tsUpper);
    assertTrue(compare(b2.bytes, b1.bytes) < 0);
}
 
Example 11
Source File: DecompoundTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testWithSubwordsOnly() throws Exception {
    String source = "Das ist ein Schlüsselwort, ein Bindestrichwort";
    String[] expected = {
            "Da",
            "ist",
            "ein",
            "Schlüssel",
            "wort",
            "ein",
            "Bindestrich",
            "wort"
    };
    String resource = "keywords_analysis.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("with_subwords_only");
    assertNotNull(analyzer);
    assertTokenStreamContents(analyzer.tokenStream("test-field", source), expected);
}
 
Example 12
Source File: IcuTransformFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testTransformTraditionalSimplified() throws Exception {
    String source = "簡化字";
    String[] expected =  new String[] { "简化", "字" };
    String resource = "icu_transform.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_icu_tokenizer_ch").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("my_icu_transformer_ch");
    TokenStream tokenStream = tokenFilter.create(tokenizer);
    assertTokenStreamContents(tokenStream, expected);
}
 
Example 13
Source File: UnstemmedGermanNormalizationTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOne() throws Exception {
    String source = "Ein Tag in Köln im Café an der Straßenecke mit einer Standard-Nummer ISBN 1-4493-5854-3";
    String[] expected = {
            "tag",
            "koln",
            "cafe",
            "caf",
            "strassenecke",
            "strasseneck",
            "standard-nummer",
            "standardnummer",
            "standard-numm",
            "standardnumm",
            "isbn",
            "1-4493-5854-3",
            "1449358543",
            "978-1-4493-5854-9",
            "9781449358549"
    };
    String resource = "unstemmed.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY), new CommonAnalysisPlugin());
    Analyzer analyzer = analysis.indexAnalyzers.get("default");
    assertTokenStreamContents(analyzer.tokenStream(null, new StringReader(source)), expected);
}
 
Example 14
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testPrimaryStrengthFromJson() throws Exception {
    String resource = "icu_collation.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));

    Analyzer analyzer = analysis.indexAnalyzers.get("icu_german_collate");

    String[] words = new String[]{
            "Göbel",
            "Goethe",
            "Goldmann",
            "Göthe",
            "Götz"
    };
    MultiMap<BytesRef,String> bytesRefMap = new TreeMultiMap<>();
    for (String s : words) {
        TokenStream ts = analyzer.tokenStream(null, s);
        bytesRefMap.put(bytesFromTokenStream(ts), s);
    }
    Iterator<Set<String>> it = bytesRefMap.values().iterator();
    assertEquals("[Göbel]",it.next().toString());
    assertEquals("[Goethe, Göthe]",it.next().toString());
    assertEquals("[Götz]",it.next().toString());
    assertEquals("[Goldmann]",it.next().toString());
}
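
MultiMap and TreeMultiMap are not JDK or Lucene types; they appear to be utility classes from the elasticsearch-plugin-bundle project itself, used here to group words that collate to the same key and to walk those groups in ascending key order. If those helpers are unavailable, the same grouping can be sketched with plain JDK collections (an illustrative alternative, not the plugin's code):

// Illustrative JDK-only alternative: group words by collation key and
// iterate groups in key order (BytesRef implements Comparable, so it can key a TreeMap).
TreeMap<BytesRef, Set<String>> byKey = new TreeMap<>();
for (String s : words) {
    TokenStream ts = analyzer.tokenStream(null, s);
    byKey.computeIfAbsent(bytesFromTokenStream(ts), k -> new TreeSet<>()).add(s);
}
for (Map.Entry<BytesRef, Set<String>> entry : byKey.entrySet()) {
    System.out.println(entry.getValue());   // e.g. [Goethe, Göthe] under a primary-strength collator
}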
 
Example 15
Source File: IcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testIdentifierNonBreakSpaceTwoTokens() throws Exception {
    String source = "Binde-strich-wort 3-428-84350-9";
    String[] expected = {"Binde-strich-wort", "3-428-84350-9"};
    String resource = "icu_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenizer, expected);
}
 
Example 16
Source File: IcuNormalizeCharTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testFoldingAnalyzerWithExceptions() throws Exception {
    String resource = "icu_normalize.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("my_icu_analyzer_with_exceptions");
    TokenStream ts = analyzer.tokenStream("test", "Jörg Prante");
    String[] expected = { "jörg", "prante" };
    assertTokenStreamContents(ts, expected);
}
 
Example 17
Source File: IcuCollationAnalyzerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testGermanPhoneBook() throws Exception {
    String resource = "icu_collation.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Analyzer analyzer = analysis.indexAnalyzers.get("german_phonebook");

    String[] words = new String[]{
            "Göbel",
            "Goethe",
            "Goldmann",
            "Göthe",
            "Götz"
    };
    MultiMap<BytesRef,String> bytesRefMap = new TreeMultiMap<>();
    for (String s : words) {
        TokenStream ts = analyzer.tokenStream(null, s);
        bytesRefMap.put(bytesFromTokenStream(ts), s);
    }
    Iterator<Set<String>> it = bytesRefMap.values().iterator();
    assertEquals("[Göbel]",it.next().toString());
    assertEquals("[Goethe, Göthe]",it.next().toString());
    assertEquals("[Götz]",it.next().toString());
    assertEquals("[Goldmann]",it.next().toString());
}
 
Example 18
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testThree() throws Exception {
    String source = "Ich will nicht als Service-Center-Mitarbeiterin, sondern 100-prozentig als Dipl.-Ing. arbeiten!";
    String[] expected = {
            "Ich",
            "will",
            "nicht",
            "als",
            "Service-Center-Mitarbeiterin",
            "ServiceCenterMitarbeiterin",
            "Mitarbeiterin",
            "ServiceCenter",
            "ServiceCenter-Mitarbeiterin",
            "Center-Mitarbeiterin",
            "Service",
            "sondern",
            "100-prozentig",
            "als",
            "Dipl",
            "Ing",
            "arbeiten"
    };
    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 19
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSeven() throws Exception {
    String source = "Procter & Gamble ist Procter&Gamble. Schwarz - weiss ist schwarz-weiss";

    String[] expected = {
            "Procter",
            "Gamble",
            "ist",
            "Procter&Gamble",
            "Schwarz",
            "weiss",
            "ist",
            "schwarz-weiss",
            "schwarzweiss",
            "weiss",
            "schwarz"
    };

    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 20
Source File: BaseformTokenFilterTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testOne() throws Exception {
    String source = "Die Jahresfeier der Rechtsanwaltskanzleien auf dem Donaudampfschiff hat viel Ökosteuer gekostet";
    String[] expected = {
            "Die",
            "Die",
            "Jahresfeier",
            "Jahresfeier",
            "der",
            "der",
            "Rechtsanwaltskanzleien",
            "Rechtsanwaltskanzlei",
            "auf",
            "auf",
            "dem",
            "der",
            "Donaudampfschiff",
            "Donaudampfschiff",
            "hat",
            "haben",
            "viel",
            "viel",
            "Ökosteuer",
            "Ökosteuer",
            "gekostet",
            "kosten"
    };
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            Settings.EMPTY,
            new BundlePlugin(Settings.EMPTY));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("baseform");
    Tokenizer tokenizer = analysis.tokenizer.get("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}