org.apache.lucene.analysis.BaseTokenStreamTestCase Java Examples
The following examples show how to use
org.apache.lucene.analysis.BaseTokenStreamTestCase.
You can vote up the ones you like or vote down the ones you don't like,
and navigate to the original project or source file by following the links above each example. Related API usage can be found in the sidebar.
Example #1
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 6 votes |
/** Variation sequences: VS16 (emoji presentation) vs. VS15 (text presentation). */
public void testEmojiVariationSequence() throws Exception {
  // Keycap-style sequences (base char + variation selector + enclosing keycap) stay one <EMOJI> token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣", new String[] { "#️⃣" }, new String[] { "<EMOJI>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣", new String[] { "3️⃣",}, new String[] { "<EMOJI>" });

  // text presentation sequences
  // "#" + U+FE0E (VS15, text presentation) produces no token at all:
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E", new String[] { }, new String[] { });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
      new String[] { "3\uFE0E",}, new String[] { "<NUM>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
      new String[] { "\u2B55",}, new String[] { "<EMOJI>" });
  // ZWJ-joined circles: each half tokenizes as <EMOJI>; the VS15s are dropped while
  // the ZWJ stays attached to the second token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
      new String[] { "\u2B55", "\u200D\u2B55"}, new String[] { "<EMOJI>", "<EMOJI>" });
}
Example #2
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * mailto: scheme handling — the scheme is split off as <ALPHANUM> and the address kept as <EMAIL>.
 * NOTE(review): the addresses read "[email protected]" because the page this example was scraped
 * from redacts email literals; the upstream test uses concrete example addresses.
 */
public void testMailtoSchemeEmails () throws Exception {
  // See LUCENE-3880
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "MAILTO:[email protected]",
      new String[] {"mailto", "[email protected]"},
      new String[] { "<ALPHANUM>", "<EMAIL>" });
  // TODO: Support full mailto: scheme URIs. See RFC 6068: http://tools.ietf.org/html/rfc6068
  BaseTokenStreamTestCase.assertAnalyzesTo
      (a, "mailto:[email protected],[email protected][email protected]"
          + "&subject=Subjectivity&body=Corpusivity%20or%20something%20like%20that",
       new String[] { "mailto",
                      "[email protected]",
                      // TODO: recognize ',' address delimiter. Also, see examples of ';' delimiter use at: http://www.mailto.co.uk/
                      ",[email protected]",
                      "[email protected]",
                      // TODO: split field keys/values
                      "subject", "subjectivity",
                      "body", "corpusivity", "20or", "20something","20like", "20that" },
       // TODO: Hex decoding + re-tokenization
       new String[] { "<ALPHANUM>", "<EMAIL>", "<EMAIL>", "<EMAIL>",
                      "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
                      "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
Example #3
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 6 votes |
/** Variation sequences: VS16 (emoji presentation) vs. VS15 (text presentation). */
public void testEmojiVariationSequence() throws Exception {
  // Keycap-style sequences tokenize as a single <EMOJI> token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣", new String[] { "#️⃣" }, new String[] { "<EMOJI>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣", new String[] { "3️⃣",}, new String[] { "<EMOJI>" });

  // text presentation sequences
  // "#" + U+FE0E (VS15) yields no token; "3" + VS15 stays a <NUM> token because
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E", new String[] { }, new String[] { });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
      new String[] { "3\uFE0E",}, new String[] { "<NUM>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
      new String[] { "\u2B55",}, new String[] { "<EMOJI>" });
  // ZWJ-joined circles: two <EMOJI> tokens; the VS15s are dropped while the ZWJ
  // stays attached to the second token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
      new String[] { "\u2B55", "\u200D\u2B55"}, new String[] { "<EMOJI>", "<EMOJI>" });
}
Example #4
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testNumericSA() throws Exception { // floating point, serial, model numbers, ip addresses, etc. BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); }
Example #5
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testApostrophesSA() throws Exception { // internal apostrophes: O'Reilly, you're, O'Reilly's BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"}); }
Example #6
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Regression test for LUCENE-1545: combining characters must not be dropped. */
public void testLUCENE1545() throws Exception {
  /*
   * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
   * Expected result is only one token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
}
Example #7
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testNumericSA() throws Exception { // floating point, serial, model numbers, ip addresses, etc. BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); }
Example #8
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testVariousTextSA() throws Exception { // various BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"}); }
Example #9
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0 | 5 votes |
/**
 * Beider-Morse filter with no language hint: "ABADIAS" expands to every
 * candidate phonetic form across the possible source languages.
 */
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
  TokenFilterFactory beiderMorse = analysis.tokenFilter.get("beidermorsefilter");
  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new StringReader("ABADIAS"));
  String[] phoneticForms = new String[] {
      "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS",
      "abadios", "abodia", "abodiaS", "abodias", "abodio", "abodioS", "abodios",
      "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS", "obadias",
      "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
  BaseTokenStreamTestCase.assertTokenStreamContents(beiderMorse.create(source), phoneticForms);
}
Example #10
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * A document padded with 4094 leading spaces (presumably near an internal buffer
 * size — TODO confirm) must still yield only the trailing word tokens.
 */
public void testHugeDoc() throws IOException {
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  StandardTokenizer tokenizer = new StandardTokenizer();
  tokenizer.setReader(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
Example #11
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0 | 5 votes |
/**
 * Beider-Morse filter configured for French: "Rimbault" expands to the
 * French-specific candidate phonetic forms only.
 */
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
  TokenFilterFactory frenchBeiderMorse = analysis.tokenFilter.get("beidermorsefilterfrench");
  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new StringReader("Rimbault"));
  String[] phoneticForms = new String[] {
      "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult",
      "rmbD", "rmbDlt", "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
  BaseTokenStreamTestCase.assertTokenStreamContents(frenchBeiderMorse.create(source), phoneticForms);
}
Example #12
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testApostrophesSA() throws Exception { // internal apostrophes: O'Reilly, you're, O'Reilly's BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"}); }
Example #13
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testApostrophesSA() throws Exception { // internal apostrophes: O'Reilly, you're, O'Reilly's BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"jim's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"}); }
Example #14
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0 | 5 votes |
// Daitch-Mokotoff soundex: "chauptman" encodes to two candidate codes (the leading
// digit differs between the two expected encodings).
// NOTE(review): "Motokoff" in the method name is a typo for "Mokotoff"; left as-is
// because renaming a test method can break name-based test selection/filters.
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
  TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("chauptman"));
  String[] expected = new String[] { "473660", "573660" };
  // Verify the factory produces the expected filter type, then check the token output.
  assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
  BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
Example #15
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * A document padded with 4094 leading spaces (presumably near an internal buffer
 * size — TODO confirm) must still yield only the trailing word tokens.
 */
public void testHugeDoc() throws IOException {
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
Example #16
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testVariousTextSA() throws Exception { // various BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"}); }
Example #17
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testVariousTextSA() throws Exception { // various BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"}); }
Example #18
Source File: ProtectedTermFilterFactoryTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * ProtectedTermFilterFactory: terms listed in the protected files bypass the wrapped
 * filters. With ignoreCase=true and a wrapped lowercase filter, protected terms
 * (FooBar, goldeN, compote — matched case-insensitively) keep their original case
 * while all other tokens are lowercased.
 */
public void testBasic() throws Exception {
  String text = "Wuthering FooBar distant goldeN ABC compote";
  Map<String,String> args = new HashMap<>();
  args.put("ignoreCase", "true");
  args.put("protected", "protected-1.txt,protected-2.txt");  // Protected: foobar, jaxfopbuz, golden, compote
  args.put("wrappedFilters", "lowercase");
  // The factory must be inform()-ed with a resource loader before create() so it
  // can read the protected-term files from the test collection.
  ResourceLoader loader = new SolrResourceLoader(TEST_PATH().resolve("collection1"));
  ProtectedTermFilterFactory factory = new ProtectedTermFilterFactory(args);
  factory.inform(loader);
  TokenStream ts = factory.create(whitespaceMockTokenizer(text));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts,
      new String[] { "wuthering", "FooBar", "distant", "goldeN", "abc", "compote" });
}
Example #19
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Regression test for LUCENE-1545: U+0364 COMBINING LATIN SMALL LETTER E used to be
 * dropped, incorrectly splitting "moͤchte" into "mo" + "chte". The whole word must
 * survive as a single token with the combining character intact.
 */
public void testLUCENE1545() throws Exception {
  final String word = "moͤchte";
  BaseTokenStreamTestCase.assertAnalyzesTo(a, word, new String[] { word });
}
Example #20
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Thai text has no inter-word spaces; each run between sentence breaks comes out
 * as one token, and the Thai-digit run is kept whole.
 */
public void testThai() throws Exception {
  final String input = "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔";
  String[] expected = { "การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔" };
  BaseTokenStreamTestCase.assertAnalyzesTo(a, input, expected);
}
Example #21
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Token type attributes: word tokens are <ALPHANUM>, digit runs are <NUM>. */
public void testTypes() throws Exception {
  String[] tokens = { "david", "has", "5000", "bones" };
  String[] types = { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" };
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "david has 5000 bones", tokens, types);
}
Example #22
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Start/end character offsets of each token within the original input string. */
public void testOffsets() throws Exception {
  String[] tokens = { "david", "has", "5000", "bones" };
  int[] startOffsets = { 0, 6, 10, 15 };
  int[] endOffsets = { 5, 9, 14, 20 };
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", tokens, startOffsets, endOffsets);
}
Example #23
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Persian (right-to-left) text with Extended Arabic-Indic digits: the analyzer
 * splits on whitespace, keeps the digit tokens (۲۵, ۱۳۷۹) intact, and drops the
 * trailing sentence period.
 */
public void testFarsi() throws Exception {
  BaseTokenStreamTestCase.assertAnalyzesTo(a,
      "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
      new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت",
          "مکملی", "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
}
Example #24
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Email addresses tokenize whole as <EMAIL>; surrounding brackets/quotes are stripped
 * and plain words become lowercased <ALPHANUM> tokens.
 * NOTE(review): the literal addresses read "[email protected]" because the page this
 * example was scraped from redacts emails; the upstream test uses real example addresses.
 */
public void testBasicEmails() throws Exception {
  BaseTokenStreamTestCase.assertAnalyzesTo(a,
      "one [email protected] two three [[email protected]] \"ArakaBanassaMassanaBakarA\" <[email protected]>",
      new String[] {"one", "[email protected]", "two", "three", "[email protected]",
          "arakabanassamassanabakara", "[email protected]",},
      new String[] { "<ALPHANUM>", "<EMAIL>", "<ALPHANUM>", "<ALPHANUM>", "<EMAIL>",
          "<ALPHANUM>", "<EMAIL>" });
}
Example #25
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testKoreanSA() throws Exception { // Korean words BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"}); }
Example #26
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testTextWithNumbersSA() throws Exception { // numbers BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"}); }
Example #27
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testDelimitersSA() throws Exception { // other delimiters: "-", "/", "," BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"}); }
Example #28
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testAlphanumericSA() throws Exception { // alphanumeric tokens BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2b"}); }
Example #29
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Empty input, bare punctuation, and whitespace-only input all produce zero tokens. */
public void testEmpty() throws Exception {
  for (String input : new String[] { "", ".", " " }) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, input, new String[] {});
  }
}
Example #30
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Chinese (CJK) input: each ideograph becomes its own token, the ideographic
 * full stop is dropped, the digit run stays whole, and the Latin word is lowercased.
 */
public void testChinese() throws Exception {
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
      new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
}