Java Code Examples for org.apache.lucene.analysis.MockTokenizer#KEYWORD
The following examples show how to use
org.apache.lucene.analysis.MockTokenizer#KEYWORD.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may also check out the related API usage in the sidebar.
Example 1
Source File: FuzzySuggesterTest.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Builds a fuzzy suggester over ~100 random "boo"-prefixed keys plus one fixed key,
 * then verifies that randomly edited variants of "foo bar boo" still return
 * "foo bar boo far" as the single suggestion.
 * Uses KEYWORD tokenization, so each input string is analyzed as one token.
 */
public void testRandomEdits() throws IOException {
  List<Input> keys = new ArrayList<>();
  int numTerms = atLeast(100);
  // Noise entries: random suffixes on a shared "boo" prefix, random weights 1..100.
  for (int i = 0; i < numTerms; i++) {
    keys.add(new Input("boo" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
  }
  // The one entry every lookup below is expected to return.
  keys.add(new Input("foo bar boo far", 12));
  MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory tempDir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy", analyzer, analyzer,
      FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true,
      FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
      0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, FuzzySuggester.DEFAULT_UNICODE_AWARE);
  suggester.build(new InputArrayIterator(keys));
  int numIters = atLeast(10);
  for (int i = 0; i < numIters; i++) {
    // Apply one random edit beyond the non-fuzzy prefix; the fixed key must still win.
    String addRandomEdit = addRandomEdit("foo bar boo", FuzzySuggester.DEFAULT_NON_FUZZY_PREFIX);
    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
    // The edited string is passed as the assertion message for easier failure diagnosis.
    assertEquals(addRandomEdit, 1, results.size());
    assertEquals("foo bar boo far", results.get(0).key.toString());
    assertEquals(12, results.get(0).value, 0.01F);
  }
  IOUtils.close(analyzer, tempDir);
}
Example 2
Source File: FuzzySuggesterTest.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Non-Latin (Cyrillic) variant of the random-edits test, built with the last
 * constructor argument (unicodeAware) set to true and nonFuzzyPrefix set to 0.
 */
public void testNonLatinRandomEdits() throws IOException {
  List<Input> keys = new ArrayList<>();
  int numTerms = atLeast(100);
  // Noise entries: random suffixes on a shared Cyrillic prefix.
  for (int i = 0; i < numTerms; i++) {
    keys.add(new Input("буу" + TestUtil.randomSimpleString(random()), 1 + random().nextInt(100)));
  }
  // The one entry every lookup below is expected to return.
  keys.add(new Input("фуу бар буу фар", 12));
  MockAnalyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory tempDir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy",analyzer, analyzer,
      FuzzySuggester.EXACT_FIRST | FuzzySuggester.PRESERVE_SEP, 256, -1, true,
      FuzzySuggester.DEFAULT_MAX_EDITS, FuzzySuggester.DEFAULT_TRANSPOSITIONS,
      0, FuzzySuggester.DEFAULT_MIN_FUZZY_LENGTH, true); // last arg: unicodeAware = true
  suggester.build(new InputArrayIterator(keys));
  int numIters = atLeast(10);
  for (int i = 0; i < numIters; i++) {
    // nonFuzzyPrefix = 0: the random edit may land anywhere in the string.
    String addRandomEdit = addRandomEdit("фуу бар буу", 0);
    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence(addRandomEdit, random()), false, 2);
    assertEquals(addRandomEdit, 1, results.size());
    assertEquals("фуу бар буу фар", results.get(0).key.toString());
    assertEquals(12, results.get(0).value, 0.01F);
  }
  IOUtils.close(analyzer, tempDir);
}
Example 3
Source File: MockTokenizerFactory.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Creates a new MockTokenizerFactory.
 * Recognized args: "pattern" (one of keyword/simple/whitespace, case-insensitive,
 * anything else falls back to whitespace) and "enableChecks" (default true).
 * Any leftover argument is rejected.
 */
public MockTokenizerFactory(Map<String,String> args) {
  super(args);
  final String tokenizerType = get(args, "pattern", Arrays.asList("keyword", "simple", "whitespace"));
  if ("keyword".equalsIgnoreCase(tokenizerType)) {
    pattern = MockTokenizer.KEYWORD;
  } else if ("simple".equalsIgnoreCase(tokenizerType)) {
    pattern = MockTokenizer.SIMPLE;
  } else {
    // Default: whitespace tokenization.
    pattern = MockTokenizer.WHITESPACE;
  }
  enableChecks = getBoolean(args, "enableChecks", true);
  // Anything not consumed above is an error in the factory configuration.
  if (!args.isEmpty()) {
    throw new IllegalArgumentException("Unknown parameters: " + args);
  }
}
Example 4
Source File: TestBeiderMorseFilter.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Feeds a single keyword token through PatternKeywordMarkerFilter and
 * BeiderMorseFilter, then checks every emitted token still carries the
 * keyword marker and that exactly 12 tokens come out.
 */
public void testCustomAttribute() throws IOException {
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
  tokenizer.setReader(new StringReader("D'Angelo"));
  // Mark everything as keyword, then run the phonetic expansion.
  TokenStream chain = new PatternKeywordMarkerFilter(tokenizer, Pattern.compile(".*"));
  chain = new BeiderMorseFilter(chain, new PhoneticEngine(NameType.GENERIC, RuleType.EXACT, true));
  KeywordAttribute keywordAtt = chain.addAttribute(KeywordAttribute.class);
  chain.reset();
  int emitted = 0;
  while (chain.incrementToken()) {
    assertTrue(keywordAtt.isKeyword());
    emitted++;
  }
  assertEquals(12, emitted);
  chain.end();
  chain.close();
}
Example 5
Source File: WordDelimiterFilter2Tests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Splits {@code input} with the word-delimiter filter and asserts the token
 * output; {@code stemPossessive == 1} additionally strips English possessives.
 */
public void doSplitPossessive(int stemPossessive, final String input, final String... output) throws Exception {
  int filterFlags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE | SPLIT_ON_NUMERICS;
  if (stemPossessive == 1) {
    filterFlags |= STEM_ENGLISH_POSSESSIVE;
  }
  // KEYWORD tokenizer: the whole input reaches the filter as one token.
  Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
  source.setReader(new StringReader(input));
  WordDelimiterFilter2 filter = new WordDelimiterFilter2(source, filterFlags, null);
  assertTokenStreamContents(filter, output);
}
Example 6
Source File: TestMoreLikeThis.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * MoreLikeThis over multiple values of the same field, analyzed with a keyword
 * analyzer: the resulting BooleanQuery must contain exactly two term clauses,
 * each matching one of the whole-string terms "lucene" or "apache".
 */
public void testMultiValues() throws Exception {
  Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  mlt.setAnalyzer(keywordAnalyzer);
  mlt.setFieldNames(new String[] {"text"});
  BooleanQuery likeQuery = (BooleanQuery) mlt.like("text",
      new StringReader("lucene"), new StringReader("lucene release"),
      new StringReader("apache"), new StringReader("apache lucene"));
  Collection<BooleanClause> likeClauses = likeQuery.clauses();
  assertEquals("Expected 2 clauses only!", 2, likeClauses.size());
  // Build the allowed-term set once, outside the loop.
  Collection<Term> expectedTerms = Arrays.asList(new Term("text", "lucene"), new Term("text", "apache"));
  for (BooleanClause booleanClause : likeClauses) {
    Term actual = ((TermQuery) booleanClause.getQuery()).getTerm();
    assertTrue(expectedTerms.contains(actual));
  }
}
Example 7
Source File: WordDelimiterFilter2Tests.java From elasticsearch-plugin-bundle with GNU Affero General Public License v3.0 | 5 votes |
/**
 * Splits {@code input} with the word-delimiter filter (default delimiter table,
 * possessive stemming always on) and asserts the expected token output.
 */
public void doSplit(final String input, String... output) throws Exception {
  int filterFlags = GENERATE_WORD_PARTS | GENERATE_NUMBER_PARTS | SPLIT_ON_CASE_CHANGE
      | SPLIT_ON_NUMERICS | STEM_ENGLISH_POSSESSIVE;
  // KEYWORD tokenizer: the whole input reaches the filter as one token.
  Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
  source.setReader(new StringReader(input));
  WordDelimiterFilter2 filter = new WordDelimiterFilter2(source,
      WordDelimiterIterator.DEFAULT_WORD_DELIM_TABLE, filterFlags, null);
  assertTokenStreamContents(filter, output);
}
Example 8
Source File: TestFuzzyQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
public void test2() throws Exception { Directory directory = newDirectory(); RandomIndexWriter writer = new RandomIndexWriter(random(), directory, new MockAnalyzer(random(), MockTokenizer.KEYWORD, false)); addDoc("LANGE", writer); addDoc("LUETH", writer); addDoc("PIRSING", writer); addDoc("RIEGEL", writer); addDoc("TRZECZIAK", writer); addDoc("WALKER", writer); addDoc("WBR", writer); addDoc("WE", writer); addDoc("WEB", writer); addDoc("WEBE", writer); addDoc("WEBER", writer); addDoc("WEBERE", writer); addDoc("WEBREE", writer); addDoc("WEBEREI", writer); addDoc("WBRE", writer); addDoc("WITTKOPF", writer); addDoc("WOJNAROWSKI", writer); addDoc("WRICKE", writer); IndexReader reader = writer.getReader(); IndexSearcher searcher = newSearcher(reader); writer.close(); FuzzyQuery query = new FuzzyQuery(new Term("field", "WEBER"), 2, 1); //query.setRewriteMethod(FuzzyQuery.SCORING_BOOLEAN_QUERY_REWRITE); ScoreDoc[] hits = searcher.search(query, 1000).scoreDocs; assertEquals(8, hits.length); reader.close(); directory.close(); }
Example 9
Source File: TestSimpleQueryParser.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Helper: parses {@code text} against the single field "field" using a keyword analyzer. */
private Query parseKeyword(String text, int flags) {
  Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  return new SimpleQueryParser(keywordAnalyzer, Collections.singletonMap("field", 1f), flags)
      .parse(text);
}
Example 10
Source File: TestTrimFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Blast some random strings through a KEYWORD tokenizer + TrimFilter chain. */
public void testRandomStrings() throws Exception {
  Analyzer trimAnalyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source, new TrimFilter(source));
    }
  };
  checkRandomData(random(), trimAnalyzer, 200 * RANDOM_MULTIPLIER);
  trimAnalyzer.close();
}
Example 11
Source File: TestCapitalizationFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Runs {@code input} through a KEYWORD tokenizer (single token) and asserts it
 * capitalizes to exactly {@code expected} under the given filter settings.
 */
static void assertCapitalizesToKeyword(String input, String expected, boolean onlyFirstWord, CharArraySet keep, boolean forceFirstLetter, Collection<char[]> okPrefix, int minWordLength, int maxWordCount, int maxTokenLength) throws IOException {
  final MockTokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
  source.setReader(new StringReader(input));
  final String[] expectedTokens = { expected };
  assertCapitalizesTo(source, expectedTokens, onlyFirstWord, keep, forceFirstLetter,
      okPrefix, minWordLength, maxWordCount, maxTokenLength);
}
Example 12
Source File: TestPorterStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public void setUp() throws Exception {
  super.setUp();
  // KEYWORD tokenizer: the whole input is fed to the Porter stemmer as one token.
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      final Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source, new PorterStemFilter(source));
    }
  };
}
Example 13
Source File: TestSoraniNormalizationFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public void setUp() throws Exception {
  super.setUp();
  // KEYWORD tokenizer: each test input reaches the normalization filter intact.
  a = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      final Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source, new SoraniNormalizationFilter(source));
    }
  };
}
Example 14
Source File: TestGermanStemFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public void setUp() throws Exception {
  super.setUp();
  analyzer = new Analyzer() {
    @Override
    protected TokenStreamComponents createComponents(String fieldName) {
      // Lower-case first, then apply the German stemmer.
      final Tokenizer source = new MockTokenizer(MockTokenizer.KEYWORD, false);
      return new TokenStreamComponents(source, new GermanStemFilter(new LowerCaseFilter(source)));
    }
  };
}
Example 15
Source File: TestJapaneseIterationMarkCharFilterFactory.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * The iteration-mark char filter expands Japanese iteration marks before
 * tokenization; with a KEYWORD tokenizer the whole expanded string comes back
 * as a single token.
 */
public void testIterationMarksWithKeywordTokenizer() throws IOException {
  final String text = "時々馬鹿々々しいところゞゝゝミスヾ";
  JapaneseIterationMarkCharFilterFactory factory = new JapaneseIterationMarkCharFilterFactory(new HashMap<String,String>());
  Reader filtered = factory.create(new StringReader(text));
  Tokenizer tokenizer = new MockTokenizer(MockTokenizer.KEYWORD, false);
  tokenizer.setReader(filtered);
  assertTokenStreamContents(tokenizer, new String[]{"時時馬鹿馬鹿しいところどころミスズ"});
}
Example 16
Source File: FuzzySuggesterTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Looking anything up against an empty suggester must yield no results. */
public void testEmpty() throws Exception {
  Analyzer keywordAnalyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory dir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(dir, "fuzzy", keywordAnalyzer);
  suggester.build(new InputArrayIterator(new Input[0]));
  List<LookupResult> hits = suggester.lookup("a", false, 20);
  assertTrue(hits.isEmpty());
  IOUtils.close(keywordAnalyzer, dir);
}
Example 17
Source File: AnalyzingSuggesterTest.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Payload round-trip: suggestions must carry back the payload stored with each
 * key, and the three duplicate "bar" entries must be deduplicated so only the
 * highest-weight one (10, "goodbye") survives.
 */
public void testKeywordWithPayloads() throws Exception {
  Iterable<Input> keys = shuffle(
      new Input("foo", 50, new BytesRef("hello")),
      new Input("bar", 10, new BytesRef("goodbye")),
      new Input("barbar", 12, new BytesRef("thank you")),
      new Input("bar", 9, new BytesRef("should be deduplicated")),
      new Input("bar", 8, new BytesRef("should also be deduplicated")),
      new Input("barbara", 6, new BytesRef("for all the fish")));
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);
  suggester.build(new InputArrayIterator(keys));
  // Two passes over the same suggester: lookups must be repeatable.
  for (int i = 0; i < 2; i++) {
    // top N of 2, but only foo is available
    List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
    assertEquals(1, results.size());
    assertEquals("foo", results.get(0).key.toString());
    assertEquals(50, results.get(0).value, 0.01F);
    assertEquals(new BytesRef("hello"), results.get(0).payload);
    // top N of 1 for 'bar': we return this even though
    // barbar is higher because exactFirst is enabled:
    results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
    assertEquals(1, results.size());
    assertEquals("bar", results.get(0).key.toString());
    assertEquals(10, results.get(0).value, 0.01F);
    assertEquals(new BytesRef("goodbye"), results.get(0).payload);
    // top N Of 2 for 'b'
    results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
    assertEquals(2, results.size());
    assertEquals("barbar", results.get(0).key.toString());
    assertEquals(12, results.get(0).value, 0.01F);
    assertEquals(new BytesRef("thank you"), results.get(0).payload);
    assertEquals("bar", results.get(1).key.toString());
    assertEquals(10, results.get(1).value, 0.01F);
    assertEquals(new BytesRef("goodbye"), results.get(1).payload);
    // top N of 3 for 'ba'
    results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
    assertEquals(3, results.size());
    assertEquals("barbar", results.get(0).key.toString());
    assertEquals(12, results.get(0).value, 0.01F);
    assertEquals(new BytesRef("thank you"), results.get(0).payload);
    assertEquals("bar", results.get(1).key.toString());
    assertEquals(10, results.get(1).value, 0.01F);
    assertEquals(new BytesRef("goodbye"), results.get(1).payload);
    assertEquals("barbara", results.get(2).key.toString());
    assertEquals(6, results.get(2).value, 0.01F);
    assertEquals(new BytesRef("for all the fish"), results.get(2).payload);
  }
  IOUtils.close(analyzer, tempDir);
}
Example 18
Source File: AnalyzingSuggesterTest.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * This is basically the WFST test ported to KeywordAnalyzer, so it acts the same.
 * Keys contain deliberate duplicates ("barbar" x2, "barbara" x2, "bar" x2);
 * the assertions below show that the higher weight wins for each key.
 */
public void testKeyword() throws Exception {
  Iterable<Input> keys = shuffle(
      new Input("foo", 50),
      new Input("bar", 10),
      new Input("barbar", 10),
      new Input("barbar", 12),
      new Input("barbara", 6),
      new Input("bar", 5),
      new Input("barbara", 1)
  );
  Directory tempDir = getDirectory();
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", analyzer);
  suggester.build(new InputArrayIterator(keys));
  // top N of 2, but only foo is available
  List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
  assertEquals(1, results.size());
  assertEquals("foo", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
  // top N of 1 for 'bar': we return this even though
  // barbar is higher because exactFirst is enabled:
  results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("bar", results.get(0).key.toString());
  assertEquals(10, results.get(0).value, 0.01F);
  // top N Of 2 for 'b'
  results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
  assertEquals(2, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  assertEquals("bar", results.get(1).key.toString());
  assertEquals(10, results.get(1).value, 0.01F);
  // top N of 3 for 'ba'
  results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
  assertEquals(3, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  assertEquals("bar", results.get(1).key.toString());
  assertEquals(10, results.get(1).value, 0.01F);
  assertEquals("barbara", results.get(2).key.toString());
  assertEquals(6, results.get(2).value, 0.01F);
  IOUtils.close(analyzer, tempDir);
}
Example 19
Source File: FuzzySuggesterTest.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * This is basically the WFST test ported to KeywordAnalyzer, so it acts the same.
 * First exercises fuzzy matching (single-edit variants of the stored keys),
 * then repeats the exact-prefix lookups of the WFST test.
 */
public void testKeyword() throws Exception {
  Input keys[] = new Input[] {
      new Input("foo", 50),
      new Input("bar", 10),
      new Input("barbar", 12),
      new Input("barbara", 6)
  };
  Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.KEYWORD, false);
  Directory tempDir = getDirectory();
  FuzzySuggester suggester = new FuzzySuggester(tempDir, "fuzzy",analyzer);
  suggester.build(new InputArrayIterator(keys));
  // Fuzzy lookups: queries one edit away from a stored key must still match it.
  List<LookupResult> results = suggester.lookup(TestUtil.stringToCharSequence("bariar", random()), false, 2);
  assertEquals(2, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  results = suggester.lookup(TestUtil.stringToCharSequence("barbr", random()), false, 2);
  assertEquals(2, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  results = suggester.lookup(TestUtil.stringToCharSequence("barbara", random()), false, 2);
  assertEquals(2, results.size());
  assertEquals("barbara", results.get(0).key.toString());
  assertEquals(6, results.get(0).value, 0.01F);
  results = suggester.lookup(TestUtil.stringToCharSequence("barbar", random()), false, 2);
  assertEquals(2, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  assertEquals("barbara", results.get(1).key.toString());
  assertEquals(6, results.get(1).value, 0.01F);
  results = suggester.lookup(TestUtil.stringToCharSequence("barbaa", random()), false, 2);
  assertEquals(2, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  assertEquals("barbara", results.get(1).key.toString());
  assertEquals(6, results.get(1).value, 0.01F);
  // top N of 2, but only foo is available
  results = suggester.lookup(TestUtil.stringToCharSequence("f", random()), false, 2);
  assertEquals(1, results.size());
  assertEquals("foo", results.get(0).key.toString());
  assertEquals(50, results.get(0).value, 0.01F);
  // top N of 1 for 'bar': we return this even though
  // barbar is higher because exactFirst is enabled:
  results = suggester.lookup(TestUtil.stringToCharSequence("bar", random()), false, 1);
  assertEquals(1, results.size());
  assertEquals("bar", results.get(0).key.toString());
  assertEquals(10, results.get(0).value, 0.01F);
  // top N Of 2 for 'b'
  results = suggester.lookup(TestUtil.stringToCharSequence("b", random()), false, 2);
  assertEquals(2, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  assertEquals("bar", results.get(1).key.toString());
  assertEquals(10, results.get(1).value, 0.01F);
  // top N of 3 for 'ba'
  results = suggester.lookup(TestUtil.stringToCharSequence("ba", random()), false, 3);
  assertEquals(3, results.size());
  assertEquals("barbar", results.get(0).key.toString());
  assertEquals(12, results.get(0).value, 0.01F);
  assertEquals("bar", results.get(1).key.toString());
  assertEquals(10, results.get(1).value, 0.01F);
  assertEquals("barbara", results.get(2).key.toString());
  assertEquals(6, results.get(2).value, 0.01F);
  IOUtils.close(analyzer, tempDir);
}