Java Code Examples for org.apache.lucene.analysis.Tokenizer#setReader()

The following examples show how to use org.apache.lucene.analysis.Tokenizer#setReader(). Each example is taken from an open-source project; the source file, project, and license are noted above it.
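Before the project-specific examples, a minimal sketch of the general contract may help (a hypothetical snippet using Lucene's stock WhitespaceTokenizer, not taken from any project below): setReader() supplies the input, and the stream is then consumed through the usual reset()/incrementToken()/end()/close() sequence.

import java.io.IOException;
import java.io.StringReader;
import org.apache.lucene.analysis.Tokenizer;
import org.apache.lucene.analysis.core.WhitespaceTokenizer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

// Hypothetical helper; the name and structure are illustrative only.
public static void printTokens(String text) throws IOException {
    try (Tokenizer tokenizer = new WhitespaceTokenizer()) {
        tokenizer.setReader(new StringReader(text)); // must be called before reset()
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
            System.out.println(term.toString());
        }
        tokenizer.end(); // finalizes offsets; close() is handled by try-with-resources
    }
}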
Example 1
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testSix() throws Exception {
    String source = "E-Book";

    // The "hyphen" token filter keeps the original hyphenated form and adds
    // the joined variant "EBook" and the subterm "Book".
    String[] expected = {
            "E-Book",
            "EBook",
            "Book"
    };

    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 2
Source File: BaseformTokenFilterTests.java    From elasticsearch-analysis-baseform with Apache License 2.0
@Test
public void testThree() throws IOException {
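    // The baseform filter emits each surface token followed by its base form at the
    // same position ("wurde" -> "werden", "gemacht" -> "machen"); tokens that are
    // already base forms ("zum", "tollen") simply repeat.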

    String source = "wurde zum tollen gemacht";

    String[] expected = {
            "wurde",
            "werden",
            "zum",
            "zum",
            "tollen",
            "tollen",
            "gemacht",
            "machen"
    };
    AnalysisService analysisService = MapperTestUtils.analysisService();
    TokenFilterFactory tokenFilter = analysisService.tokenFilter("baseform");
    Tokenizer tokenizer = analysisService.tokenizer("standard").create();
    tokenizer.setReader(new StringReader(source));
    assertSimpleTSOutput(tokenFilter.create(tokenizer), expected);
}
 
Example 3
Source File: TestICUNormalizer2CharFilter.java    From lucene-solr with Apache License 2.0
public void testMassiveLigature() throws IOException {
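  // U+FDFA is the single-codepoint Arabic ligature ﷺ; NFKC case-folding expands it
  // into the four-word phrase asserted below, which is why every token's offsets
  // map back to the one-character input.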
  String input = "\uFDFA";

  CharFilter reader = new ICUNormalizer2CharFilter(new StringReader(input),
    Normalizer2.getInstance(null, "nfkc_cf", Normalizer2.Mode.COMPOSE));

  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  tokenStream.setReader(reader);

  assertTokenStreamContents(tokenStream,
    new String[] {"صلى", "الله", "عليه", "وسلم"},
    new int[]{0, 0, 0, 0},
    new int[]{0, 0, 0, 1},
    input.length()
  );
}
 
Example 4
Source File: TestSuggestStopFilter.java    From lucene-solr with Apache License 2.0
public void testEndIsStopWord() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to "));
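  // The trailing space marks the final stop word "to" as fully typed, so
  // SuggestStopFilter removes it rather than preserving it as a partial query term.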
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] {"go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            6,
                            new boolean[] {false},
                            true);
}
 
Example 5
Source File: HyphenTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testFive() throws Exception {
    // An ISBN-style hyphenated number is expected to survive as a single token.
    String source = "978-1-4493-5854-9";

    String[] expected = {
            "978-1-4493-5854-9"
    };

    String resource = "hyphen_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    TokenFilterFactory tokenFilter = analysis.tokenFilter.get("hyphen");
    assertTokenStreamContents(tokenFilter.create(tokenizer), expected);
}
 
Example 6
Source File: TestSuggestStopFilter.java    From lucene-solr with Apache License 2.0
public void testMultipleStopWordsEnd2() throws Exception {
                            
  CharArraySet stopWords = StopFilter.makeStopSet("to", "the", "a");
  Tokenizer stream = new MockTokenizer();
  stream.setReader(new StringReader("go to a the "));
  TokenStream filter = new SuggestStopFilter(stream, stopWords);
  assertTokenStreamContents(filter,
                            new String[] { "go"},
                            new int[] {0},
                            new int[] {2},
                            null,
                            new int[] {1},
                            null,
                            12,
                            new boolean[] {false},
                            true);
}
 
Example 7
Source File: IcuTokenizerTests.java    From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0
public void testIdentifierNonBreak() throws Exception {
    String source = "ISBN 3-428-84350-9";
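    // The ICU tokenizer configured in icu_tokenizer.json keeps the hyphenated
    // ISBN together as a single token instead of breaking it at the hyphens.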
    String[] expected = {"ISBN", "3-428-84350-9"};
    String resource = "icu_tokenizer.json";
    Settings settings = Settings.builder()
            .loadFromStream(resource, getClass().getResourceAsStream(resource), true)
            .build();
    ESTestCase.TestAnalysis analysis = ESTestCase.createTestAnalysis(new Index("test", "_na_"),
            settings,
            new BundlePlugin(Settings.EMPTY));
    Tokenizer tokenizer = analysis.tokenizer.get("my_hyphen_icu_tokenizer").create();
    tokenizer.setReader(new StringReader(source));
    assertTokenStreamContents(tokenizer, expected);
}
 
Example 8
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testSeparatorWithStopWords() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  String input = "A B C D E F J H";
  tokenStream.setReader(new StringReader(input));
  TokenStream tokenFilter = new StopFilter(tokenStream, StopFilter.makeStopSet("A", "D", "E", "J"));
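  // The stop filter removes "A", "D", "E" and "J"; ConcatenateGraphFilter then
  // joins the surviving tokens with '-' into the single token asserted below.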
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenFilter, '-', false, 100);

  assertTokenStreamContents(stream, new String[] {"B-C-F-H"}, null, null, new int[] { 1 });
}
 
Example 9
Source File: TestWikipediaTokenizerFactory.java    From lucene-solr with Apache License 2.0
public void testTokenizerBoth() throws Exception {
    String test = "[[Category:a b c d]] [[Category:e f g]] [[link here]] [[link there]] ''italics here'' something ''more italics'' [[Category:h   i   j]]";
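    // BOTH mode emits each untokenized CATEGORY/ITALICS span as well as its individual
    // tokens, hence "a b c d" followed by "a", "b", "c", "d" at position increment 0.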
    Tokenizer tf = tokenizerFactory(WIKIPEDIA, TOKEN_OUTPUT, Integer.toString(WikipediaTokenizer.BOTH), UNTOKENIZED_TYPES, WikipediaTokenizer.CATEGORY + ", " + WikipediaTokenizer.ITALICS).create(newAttributeFactory());
    tf.setReader(new StringReader(test));
    assertTokenStreamContents(tf,
                              new String[] { "a b c d", "a", "b", "c", "d", "e f g", "e", "f", "g",
                                             "link", "here", "link", "there", "italics here", "italics", "here",
                                             "something", "more italics", "more", "italics", "h   i   j", "h", "i", "j" },
                              new int[] { 11, 11, 13, 15, 17, 32, 32, 34, 36, 42, 47, 56, 61, 71, 71, 79, 86, 98,  98,  103, 124, 124, 128, 132 },
                              new int[] { 18, 12, 14, 16, 18, 37, 33, 35, 37, 46, 51, 60, 66, 83, 78, 83, 95, 110, 102, 110, 133, 125, 129, 133 },
                              new int[] { 1,  0,  1,  1,  1,  1,  0,  1,  1,  1,  1,  1,  1,  1,  0,  1,  1,  1,   0,   1,   1,   0,   1,   1 }
    );
}
 
Example 10
Source File: TestKoreanTokenizerFactory.java    From lucene-solr with Apache License 2.0
public void testSimple() throws IOException {
  KoreanTokenizerFactory factory = new KoreanTokenizerFactory(Collections.emptyMap());
  factory.inform(new StringMockResourceLoader(""));
  Tokenizer ts = factory.create(newAttributeFactory());
  ts.setReader(new StringReader("안녕하세요"));
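  // The Korean (Nori) tokenizer decomposes 안녕하세요 into the morphemes
  // 안녕 + 하 + 시 + 어요; 시 and 어요 both map back to the surface span at offsets 3..5.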
  assertTokenStreamContents(ts,
      new String[] { "안녕", "하", "시", "어요" },
      new int[] { 0, 2, 3, 3 },
      new int[] { 2, 3, 5, 5 }
  );
}
 
Example 11
Source File: SimplePhoneticAnalysisTests.java    From crate with Apache License 2.0
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
    TokenFilterFactory filterFactory = analysis.tokenFilter.get("beidermorsefilterfrench");
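    // Beider-Morse encoding restricted to French produces multiple candidate
    // phonetic forms for the single input token, all emitted as separate tokens.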
    Tokenizer tokenizer = new WhitespaceTokenizer();
    tokenizer.setReader(new StringReader("Rimbault"));
    String[] expected = new String[] { "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult", "rmbD", "rmbDlt",
            "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
    BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
 
Example 12
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testFixedToken() throws Exception {
  Tokenizer t = new SimplePatternTokenizer("aaaa");

  t.setReader(new StringReader("aaaaaaaaaaaaaaa"));
  assertTokenStreamContents(t,
                            new String[] {"aaaa", "aaaa", "aaaa"},
                            new int[] {0, 4, 8},
                            new int[] {4, 8, 12});
}
 
Example 13
Source File: TestSimplePatternTokenizer.java    From lucene-solr with Apache License 2.0
public void testEndLookahead() throws Exception {
  Tokenizer t = new SimplePatternTokenizer("(ab)+");
  t.setReader(new StringReader("aba"));
  assertTokenStreamContents(t,
      new String[] { "ab" },
      new int[] { 0 },
      new int[] { 2 },
      3);
}
 
Example 14
Source File: TestUAX29URLEmailTokenizerFactory.java    From lucene-solr with Apache License 2.0
public void testChinese() throws Exception {
  Reader reader = new StringReader("我是中国人。 1234 Tests ");
  Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
  stream.setReader(reader);
  assertTokenStreamContents(stream, 
      new String[] { "我", "是", "中", "国", "人", "1234", "Tests" });
}
 
Example 15
Source File: TestConcatenateGraphFilter.java    From lucene-solr with Apache License 2.0
@Test
public void testSeparator() throws IOException {
  Tokenizer tokenStream = new MockTokenizer(MockTokenizer.SIMPLE, true);
  String input = "...mykeyword.another.keyword.";
  tokenStream.setReader(new StringReader(input));
  ConcatenateGraphFilter stream = new ConcatenateGraphFilter(tokenStream, ' ', false, 100); //not \u001F
  assertTokenStreamContents(stream, new String[] {"mykeyword another keyword"}, null, null, new int[] { 1 });
}
 
Example 16
Source File: TestCharTokenizers.java    From lucene-solr with Apache License 2.0
public void testMaxWordLength() throws IOException {
  StringBuilder builder = new StringBuilder();

  for (int i = 0; i < 255; i++) {
    builder.append("A");
  }
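  // builder holds 255 chars; the doubled input splits into exactly two tokens
  // because LetterTokenizer's default maximum token length is 255.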
  Tokenizer tokenizer = new LetterTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(builder.toString() + builder.toString()));
  assertTokenStreamContents(new LowerCaseFilter(tokenizer), new String[] {builder.toString().toLowerCase(Locale.ROOT), builder.toString().toLowerCase(Locale.ROOT)});
}
 
Example 17
Source File: TestUAX29URLEmailTokenizerFactory.java    From lucene-solr with Apache License 2.0
public void testURLs() throws Exception {
  String textWithURLs 
    = "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on\n"
      + " some extra\nWords thrown in here. "
      + "http://c5-3486.bisynxu.FR/aI.YnNms/"
      + " samba Halta gamba "
      + "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R\n"
      + "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb\n"
      + "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m"
      + " inter Locutio "
      + "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/\n"
      + "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7"
      + " blah Sirrah woof "
      + "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4\n";
  Reader reader = new StringReader(textWithURLs);
  Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
  stream.setReader(reader);
  assertTokenStreamContents(stream, 
      new String[] { 
        "http://johno.jsmf.net/knowhow/ngrams/index.php?table=en-dickens-word-2gram&paragraphs=50&length=200&no-ads=on",
        "some", "extra", "Words", "thrown", "in", "here",
        "http://c5-3486.bisynxu.FR/aI.YnNms/",
        "samba", "Halta", "gamba",
        "ftp://119.220.152.185/JgJgdZ/31aW5c/viWlfQSTs5/1c8U5T/ih5rXx/YfUJ/xBW1uHrQo6.R",
        "M19nq.0URV4A.Me.CC/mj0kgt6hue/dRXv8YVLOw9v/CIOqb",
        "Https://yu7v33rbt.vC6U3.XN--KPRW13D/y%4fMSzkGFlm/wbDF4m",
        "inter", "Locutio",
        "[c2d4::]/%471j5l/j3KFN%AAAn/Fip-NisKH/",
        "file:///aXvSZS34is/eIgM8s~U5dU4Ifd%c7",
        "blah", "Sirrah", "woof",
        "http://[a42:a7b6::]/qSmxSUU4z/%52qVl4"
      }
  );
}
 
Example 18
Source File: TestStandardFactories.java    From lucene-solr with Apache License 2.0
public void testClassicTokenizerMaxTokenLength() throws Exception {
  StringBuilder builder = new StringBuilder();
  for (int i = 0 ; i < 100 ; ++i) {
    builder.append("abcdefg"); // 7 * 100 = 700 char "word"
  }
  String longWord = builder.toString();
  String content = "one two three " + longWord + " four five six";
  Reader reader = new StringReader(content);
  Tokenizer stream = tokenizerFactory("Classic",
      "maxTokenLength", "1000").create(newAttributeFactory());
  stream.setReader(reader);
  assertTokenStreamContents(stream,
      new String[]{"one", "two", "three", longWord, "four", "five", "six"});
}
 
Example 19
Source File: TestUAX29URLEmailTokenizerFactory.java    From lucene-solr with Apache License 2.0
public void testKorean() throws Exception {
  Reader reader = new StringReader("안녕하세요 한글입니다");
  Tokenizer stream = tokenizerFactory("UAX29URLEmail").create(newAttributeFactory());
  stream.setReader(reader);
  assertTokenStreamContents(stream, 
      new String[] { "안녕하세요", "한글입니다" });
}
 
Example 20
Source File: TestDaitchMokotoffSoundexFilterFactory.java    From lucene-solr with Apache License 2.0
public void testSettingInject() throws Exception {
  Map<String,String> parameters = new HashMap<>();
  parameters.put("inject", "false");
  DaitchMokotoffSoundexFilterFactory factory = new DaitchMokotoffSoundexFilterFactory(parameters);

  Tokenizer inputStream = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  inputStream.setReader(new StringReader("international"));

  TokenStream filteredStream = factory.create(inputStream);
  assertEquals(DaitchMokotoffSoundexFilter.class, filteredStream.getClass());
  assertTokenStreamContents(filteredStream, new String[] { "063963" });
}