org.apache.lucene.analysis.BaseTokenStreamTestCase Java Examples
The following examples show how to use
org.apache.lucene.analysis.BaseTokenStreamTestCase.
You can vote up the ones you like or vote down the ones you don't like,
and navigate to the original project or source file by following the links above each example. Related API usage can be found in the sidebar.
Example #1
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 6 votes |
/** Variation sequences: VS16 (emoji presentation) vs. VS15 (text presentation). */
public void testEmojiVariationSequence() throws Exception {
  // Keycap-style sequences (base char + variation selector + enclosing keycap) stay one <EMOJI> token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣", new String[] { "#️⃣" }, new String[] { "<EMOJI>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣", new String[] { "3️⃣",}, new String[] { "<EMOJI>" });

  // text presentation sequences
  // "#" + U+FE0E (VS15, text presentation) produces no token at all:
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E", new String[] { }, new String[] { });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
      new String[] { "3\uFE0E",}, new String[] { "<NUM>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
      new String[] { "\u2B55",}, new String[] { "<EMOJI>" });
  // ZWJ-joined circles: each half tokenizes as <EMOJI>; the VS15s are dropped while
  // the ZWJ stays attached to the second token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
      new String[] { "\u2B55", "\u200D\u2B55"}, new String[] { "<EMOJI>", "<EMOJI>" });
}
Example #2
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * mailto: scheme handling — the scheme is split off as <ALPHANUM> and the address kept as <EMAIL>.
 * NOTE(review): the addresses read "[email protected]" because the page this example was scraped
 * from redacts email literals; the upstream test uses concrete example addresses.
 */
public void testMailtoSchemeEmails () throws Exception {
  // See LUCENE-3880
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "MAILTO:[email protected]",
      new String[] {"mailto", "[email protected]"},
      new String[] { "<ALPHANUM>", "<EMAIL>" });
  // TODO: Support full mailto: scheme URIs. See RFC 6068: http://tools.ietf.org/html/rfc6068
  BaseTokenStreamTestCase.assertAnalyzesTo
      (a, "mailto:[email protected],[email protected][email protected]"
          + "&subject=Subjectivity&body=Corpusivity%20or%20something%20like%20that",
       new String[] { "mailto",
                      "[email protected]",
                      // TODO: recognize ',' address delimiter. Also, see examples of ';' delimiter use at: http://www.mailto.co.uk/
                      ",[email protected]",
                      "[email protected]",
                      // TODO: split field keys/values
                      "subject", "subjectivity",
                      "body", "corpusivity", "20or", "20something","20like", "20that" },
       // TODO: Hex decoding + re-tokenization
       new String[] { "<ALPHANUM>", "<EMAIL>", "<EMAIL>", "<EMAIL>",
                      "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>",
                      "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>", "<ALPHANUM>" });
}
Example #3
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 6 votes |
/** Variation sequences: VS16 (emoji presentation) vs. VS15 (text presentation). */
public void testEmojiVariationSequence() throws Exception {
  // Keycap-style sequences tokenize as a single <EMOJI> token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#️⃣", new String[] { "#️⃣" }, new String[] { "<EMOJI>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3️⃣", new String[] { "3️⃣",}, new String[] { "<EMOJI>" });

  // text presentation sequences
  // "#" + U+FE0E (VS15) yields no token; "3" + VS15 stays a <NUM> token because
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "#\uFE0E", new String[] { }, new String[] { });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "3\uFE0E", // \uFE0E is included in \p{WB:Extend}
      new String[] { "3\uFE0E",}, new String[] { "<NUM>" });
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E", // \u2B55 = HEAVY BLACK CIRCLE
      new String[] { "\u2B55",}, new String[] { "<EMOJI>" });
  // ZWJ-joined circles: two <EMOJI> tokens; the VS15s are dropped while the ZWJ
  // stays attached to the second token.
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "\u2B55\uFE0E\u200D\u2B55\uFE0E",
      new String[] { "\u2B55", "\u200D\u2B55"}, new String[] { "<EMOJI>", "<EMOJI>" });
}
Example #4
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testNumericSA() throws Exception { // floating point, serial, model numbers, ip addresses, etc. BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"r2d2", "c3po"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); }
Example #5
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testApostrophesSA() throws Exception { // internal apostrophes: O'Reilly, you're, O'Reilly's BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"}); }
Example #6
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Regression test for LUCENE-1545: combining characters must not be dropped. */
public void testLUCENE1545() throws Exception {
  /*
   * Standard analyzer does not correctly tokenize combining character U+0364 COMBINING LATIN SMALL LETTER E.
   * The word "moͤchte" is incorrectly tokenized into "mo" "chte", the combining character is lost.
   * Expected result is only one token "moͤchte".
   */
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "moͤchte", new String[] { "moͤchte" });
}
Example #7
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testNumericSA() throws Exception { // floating point, serial, model numbers, ip addresses, etc. BaseTokenStreamTestCase.assertAnalyzesTo(a, "21.35", new String[]{"21.35"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "R2D2 C3PO", new String[]{"R2D2", "C3PO"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "216.239.63.104", new String[]{"216.239.63.104"}); }
Example #8
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testVariousTextSA() throws Exception { // various BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"c", "embedded", "developers", "wanted"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "foo", "bar"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "foo", "bar"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"quoted", "word"}); }
Example #9
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0 | 5 votes |
/**
 * Beider-Morse filter with no language hint: "ABADIAS" expands to every
 * candidate phonetic form across the possible source languages.
 */
public void testPhoneticTokenFilterBeiderMorseNoLanguage() throws IOException {
  TokenFilterFactory beiderMorse = analysis.tokenFilter.get("beidermorsefilter");
  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new StringReader("ABADIAS"));
  String[] phoneticForms = new String[] {
      "abYdias", "abYdios", "abadia", "abadiaS", "abadias", "abadio", "abadioS",
      "abadios", "abodia", "abodiaS", "abodias", "abodio", "abodioS", "abodios",
      "avadias", "avadios", "avodias", "avodios", "obadia", "obadiaS", "obadias",
      "obadio", "obadioS", "obadios", "obodia", "obodiaS", "obodias", "obodioS" };
  BaseTokenStreamTestCase.assertTokenStreamContents(beiderMorse.create(source), phoneticForms);
}
Example #10
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * A document padded with 4094 leading spaces (presumably near an internal buffer
 * size — TODO confirm) must still yield only the trailing word tokens.
 */
public void testHugeDoc() throws IOException {
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  StandardTokenizer tokenizer = new StandardTokenizer();
  tokenizer.setReader(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
Example #11
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0 | 5 votes |
/**
 * Beider-Morse filter configured for French: "Rimbault" expands to the
 * French-specific candidate phonetic forms only.
 */
public void testPhoneticTokenFilterBeiderMorseWithLanguage() throws IOException {
  TokenFilterFactory frenchBeiderMorse = analysis.tokenFilter.get("beidermorsefilterfrench");
  Tokenizer source = new WhitespaceTokenizer();
  source.setReader(new StringReader("Rimbault"));
  String[] phoneticForms = new String[] {
      "rimbD", "rimbDlt", "rimba", "rimbalt", "rimbo", "rimbolt", "rimbu", "rimbult",
      "rmbD", "rmbDlt", "rmba", "rmbalt", "rmbo", "rmbolt", "rmbu", "rmbult" };
  BaseTokenStreamTestCase.assertTokenStreamContents(frenchBeiderMorse.create(source), phoneticForms);
}
Example #12
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testApostrophesSA() throws Exception { // internal apostrophes: O'Reilly, you're, O'Reilly's BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"O'Reilly"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"Jim's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"O'Reilly's"}); }
Example #13
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testApostrophesSA() throws Exception { // internal apostrophes: O'Reilly, you're, O'Reilly's BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly", new String[]{"o'reilly"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "you're", new String[]{"you're"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "she's", new String[]{"she's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "Jim's", new String[]{"jim's"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "don't", new String[]{"don't"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "O'Reilly's", new String[]{"o'reilly's"}); }
Example #14
Source File: SimplePhoneticAnalysisTests.java From crate with Apache License 2.0 | 5 votes |
// Daitch-Mokotoff soundex: "chauptman" encodes to two candidate codes (the leading
// digit differs between the two expected encodings).
// NOTE(review): "Motokoff" in the method name is a typo for "Mokotoff"; left as-is
// because renaming a test method can break name-based test selection/filters.
public void testPhoneticTokenFilterDaitchMotokoff() throws IOException {
  TokenFilterFactory filterFactory = analysis.tokenFilter.get("daitch_mokotoff");
  Tokenizer tokenizer = new WhitespaceTokenizer();
  tokenizer.setReader(new StringReader("chauptman"));
  String[] expected = new String[] { "473660", "573660" };
  // Verify the factory produces the expected filter type, then check the token output.
  assertThat(filterFactory.create(tokenizer), instanceOf(DaitchMokotoffSoundexFilter.class));
  BaseTokenStreamTestCase.assertTokenStreamContents(filterFactory.create(tokenizer), expected);
}
Example #15
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * A document padded with 4094 leading spaces (presumably near an internal buffer
 * size — TODO confirm) must still yield only the trailing word tokens.
 */
public void testHugeDoc() throws IOException {
  char[] padding = new char[4094];
  Arrays.fill(padding, ' ');
  String input = new String(padding) + "testing 1234";
  UAX29URLEmailTokenizer tokenizer = new UAX29URLEmailTokenizer(newAttributeFactory());
  tokenizer.setReader(new StringReader(input));
  BaseTokenStreamTestCase.assertTokenStreamContents(tokenizer, new String[] { "testing", "1234" });
}
Example #16
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testVariousTextSA() throws Exception { // various BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"}); }
Example #17
Source File: TestUAX29URLEmailTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
public void testVariousTextSA() throws Exception { // various BaseTokenStreamTestCase.assertAnalyzesTo(a, "C embedded developers wanted", new String[]{"C", "embedded", "developers", "wanted"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar FOO BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "foo bar . FOO <> BAR", new String[]{"foo", "bar", "FOO", "BAR"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "\"QUOTED\" word", new String[]{"QUOTED", "word"}); }
Example #18
Source File: ProtectedTermFilterFactoryTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * ProtectedTermFilterFactory: terms listed in the protected files bypass the wrapped
 * filters. With ignoreCase=true and a wrapped lowercase filter, protected terms
 * (FooBar, goldeN, compote — matched case-insensitively) keep their original case
 * while all other tokens are lowercased.
 */
public void testBasic() throws Exception {
  String text = "Wuthering FooBar distant goldeN ABC compote";
  Map<String,String> args = new HashMap<>();
  args.put("ignoreCase", "true");
  args.put("protected", "protected-1.txt,protected-2.txt");  // Protected: foobar, jaxfopbuz, golden, compote
  args.put("wrappedFilters", "lowercase");
  // The factory must be inform()-ed with a resource loader before create() so it
  // can read the protected-term files from the test collection.
  ResourceLoader loader = new SolrResourceLoader(TEST_PATH().resolve("collection1"));
  ProtectedTermFilterFactory factory = new ProtectedTermFilterFactory(args);
  factory.inform(loader);
  TokenStream ts = factory.create(whitespaceMockTokenizer(text));
  BaseTokenStreamTestCase.assertTokenStreamContents(ts,
      new String[] { "wuthering", "FooBar", "distant", "goldeN", "abc", "compote" });
}
Example #19
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Regression test for LUCENE-1545: U+0364 COMBINING LATIN SMALL LETTER E used to be
 * dropped, incorrectly splitting "moͤchte" into "mo" + "chte". The whole word must
 * survive as a single token with the combining character intact.
 */
public void testLUCENE1545() throws Exception {
  final String word = "moͤchte";
  BaseTokenStreamTestCase.assertAnalyzesTo(a, word, new String[] { word });
}
Example #20
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Thai text has no inter-word spaces; each run between sentence breaks comes out
 * as one token, and the Thai-digit run is kept whole.
 */
public void testThai() throws Exception {
  final String input = "การที่ได้ต้องแสดงว่างานดี. แล้วเธอจะไปไหน? ๑๒๓๔";
  String[] expected = { "การที่ได้ต้องแสดงว่างานดี", "แล้วเธอจะไปไหน", "๑๒๓๔" };
  BaseTokenStreamTestCase.assertAnalyzesTo(a, input, expected);
}
Example #21
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Token type attributes: word tokens are <ALPHANUM>, digit runs are <NUM>. */
public void testTypes() throws Exception {
  String[] tokens = { "david", "has", "5000", "bones" };
  String[] types = { "<ALPHANUM>", "<ALPHANUM>", "<NUM>", "<ALPHANUM>" };
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "david has 5000 bones", tokens, types);
}
Example #22
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Start/end character offsets of each token within the original input string. */
public void testOffsets() throws Exception {
  String[] tokens = { "david", "has", "5000", "bones" };
  int[] startOffsets = { 0, 6, 10, 15 };
  int[] endOffsets = { 5, 9, 14, 20 };
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", tokens, startOffsets, endOffsets);
}
Example #23
Source File: TestStandardAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Persian (right-to-left) text with Extended Arabic-Indic digits: the analyzer
 * splits on whitespace, keeps the digit tokens (۲۵, ۱۳۷۹) intact, and drops the
 * trailing sentence period.
 */
public void testFarsi() throws Exception {
  BaseTokenStreamTestCase.assertAnalyzesTo(a,
      "ویکی پدیای انگلیسی در تاریخ ۲۵ دی ۱۳۷۹ به صورت مکملی برای دانشنامهٔ تخصصی نوپدیا نوشته شد.",
      new String[] { "ویکی", "پدیای", "انگلیسی", "در", "تاریخ", "۲۵", "دی", "۱۳۷۹", "به", "صورت",
          "مکملی", "برای", "دانشنامهٔ", "تخصصی", "نوپدیا", "نوشته", "شد" });
}
Example #24
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Email addresses tokenize whole as <EMAIL>; surrounding brackets/quotes are stripped
 * and plain words become lowercased <ALPHANUM> tokens.
 * NOTE(review): the literal addresses read "[email protected]" because the page this
 * example was scraped from redacts emails; the upstream test uses real example addresses.
 */
public void testBasicEmails() throws Exception {
  BaseTokenStreamTestCase.assertAnalyzesTo(a,
      "one [email protected] two three [[email protected]] \"ArakaBanassaMassanaBakarA\" <[email protected]>",
      new String[] {"one", "[email protected]", "two", "three", "[email protected]",
          "arakabanassamassanabakara", "[email protected]",},
      new String[] { "<ALPHANUM>", "<EMAIL>", "<ALPHANUM>", "<ALPHANUM>", "<EMAIL>",
          "<ALPHANUM>", "<EMAIL>" });
}
Example #25
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testKoreanSA() throws Exception { // Korean words BaseTokenStreamTestCase.assertAnalyzesTo(a, "안녕하세요 한글입니다", new String[]{"안녕하세요", "한글입니다"}); }
Example #26
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testTextWithNumbersSA() throws Exception { // numbers BaseTokenStreamTestCase.assertAnalyzesTo(a, "David has 5000 bones", new String[]{"david", "has", "5000", "bones"}); }
Example #27
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testDelimitersSA() throws Exception { // other delimiters: "-", "/", "," BaseTokenStreamTestCase.assertAnalyzesTo(a, "some-dashed-phrase", new String[]{"some", "dashed", "phrase"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "dogs,chase,cats", new String[]{"dogs", "chase", "cats"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "ac/dc", new String[]{"ac", "dc"}); }
Example #28
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
public void testAlphanumericSA() throws Exception { // alphanumeric tokens BaseTokenStreamTestCase.assertAnalyzesTo(a, "B2B", new String[]{"b2b"}); BaseTokenStreamTestCase.assertAnalyzesTo(a, "2B", new String[]{"2b"}); }
Example #29
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/** Empty input, bare punctuation, and whitespace-only input all produce zero tokens. */
public void testEmpty() throws Exception {
  for (String input : new String[] { "", ".", " " }) {
    BaseTokenStreamTestCase.assertAnalyzesTo(a, input, new String[] {});
  }
}
Example #30
Source File: TestUAX29URLEmailAnalyzer.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Chinese (CJK) input: each ideograph becomes its own token, the ideographic
 * full stop is dropped, the digit run stays whole, and the Latin word is lowercased.
 */
public void testChinese() throws Exception {
  BaseTokenStreamTestCase.assertAnalyzesTo(a, "我是中国人。 1234 Tests ",
      new String[] { "我", "是", "中", "国", "人", "1234", "tests"});
}