org.apache.lucene.analysis.core.WhitespaceTokenizerFactory Java Examples
The following examples show how to use
org.apache.lucene.analysis.core.WhitespaceTokenizerFactory.
The source file, project, and license for each example are noted above it.
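Before the project examples, here is a minimal standalone sketch (not taken from any of the projects below; the class name WhitespaceTokenizerFactoryDemo is hypothetical) showing what the factory produces: a tokenizer that splits input on whitespace only. It assumes a recent Lucene version where tokenizer factories accept a Map<String,String> of arguments; on some older versions you must also pass luceneMatchVersion in that map, as Example #10 does.

    import java.io.StringReader;
    import java.util.HashMap;

    import org.apache.lucene.analysis.Tokenizer;
    import org.apache.lucene.analysis.core.WhitespaceTokenizerFactory;
    import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;

    public class WhitespaceTokenizerFactoryDemo {
      public static void main(String[] args) throws Exception {
        // an empty argument map: the factory itself needs no configuration
        WhitespaceTokenizerFactory factory = new WhitespaceTokenizerFactory(new HashMap<>());
        Tokenizer tokenizer = factory.create();
        tokenizer.setReader(new StringReader("Hello   Lucene world"));
        CharTermAttribute term = tokenizer.addAttribute(CharTermAttribute.class);
        tokenizer.reset();
        while (tokenizer.incrementToken()) {
          System.out.println(term.toString()); // prints: Hello, Lucene, world
        }
        tokenizer.end();
        tokenizer.close();
      }
    }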
Example #1
Source File: TestCustomAnalyzer.java, from lucene-solr (Apache License 2.0)
public void testWhitespaceFactoryWithFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(ASCIIFoldingFilterFactory.class, "preserveOriginal", "true")
      .addTokenFilter(LowerCaseFilterFactory.class)
      .build();
  assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), a.getCharFilterFactories());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(0, a.getPositionIncrementGap("dummy"));
  assertEquals(1, a.getOffsetGap("dummy"));
  assertSame(Version.LATEST, a.getVersion());

  assertAnalyzesTo(a, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
Example #2
Source File: TestCustomAnalyzer.java, from lucene-solr (Apache License 2.0)
public void testWhitespaceWithFolding() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("asciifolding", "preserveOriginal", "true")
      .addTokenFilter("lowercase")
      .build();
  assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), a.getCharFilterFactories());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(2, tokenFilters.size());
  assertSame(ASCIIFoldingFilterFactory.class, tokenFilters.get(0).getClass());
  assertSame(LowerCaseFilterFactory.class, tokenFilters.get(1).getClass());
  assertEquals(0, a.getPositionIncrementGap("dummy"));
  assertEquals(1, a.getOffsetGap("dummy"));
  assertSame(Version.LATEST, a.getVersion());

  assertAnalyzesTo(a, "foo bar FOO BAR",
      new String[] { "foo", "bar", "foo", "bar" },
      new int[] { 1, 1, 1, 1 });
  assertAnalyzesTo(a, "föó bär FÖÖ BAR",
      new String[] { "foo", "föó", "bar", "bär", "foo", "föö", "bar" },
      new int[] { 1, 0, 1, 0, 1, 0, 1 });
  a.close();
}
Example #3
Source File: TestCustomAnalyzer.java, from lucene-solr (Apache License 2.0)
public void testStopWordsFromClasspath() throws Exception {
  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter("stop",
          "ignoreCase", "true",
          "words", "org/apache/lucene/analysis/custom/teststop.txt",
          "format", "wordset")
      .build();
  assertSame(WhitespaceTokenizerFactory.class, a.getTokenizerFactory().getClass());
  assertEquals(Collections.emptyList(), a.getCharFilterFactories());
  List<TokenFilterFactory> tokenFilters = a.getTokenFilterFactories();
  assertEquals(1, tokenFilters.size());
  assertSame(StopFilterFactory.class, tokenFilters.get(0).getClass());
  assertEquals(0, a.getPositionIncrementGap("dummy"));
  assertEquals(1, a.getOffsetGap("dummy"));
  assertSame(Version.LATEST, a.getVersion());
  assertAnalyzesTo(a, "foo Foo Bar", new String[0]);
  a.close();
}
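The stop word file above is loaded from the classpath in "wordset" format: one word per line, with # starting a comment. The real teststop.txt is not reproduced here, but a hypothetical file consistent with the test's assertion (every token of "foo Foo Bar" is removed under ignoreCase) would be:

    # hypothetical teststop.txt in wordset format
    foo
    bar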
Example #4
Source File: TestCustomAnalyzer.java, from lucene-solr (Apache License 2.0)
public void testStopWordsFromClasspathWithMap() throws Exception {
  Map<String,String> stopConfig1 = new HashMap<>();
  stopConfig1.put("ignoreCase", "true");
  stopConfig1.put("words", "org/apache/lucene/analysis/custom/teststop.txt");
  stopConfig1.put("format", "wordset");

  Map<String,String> stopConfig2 = new HashMap<>(stopConfig1);
  Map<String,String> stopConfigImmutable = Collections.unmodifiableMap(new HashMap<>(stopConfig1));

  CustomAnalyzer a = CustomAnalyzer.builder()
      .withTokenizer("whitespace")
      .addTokenFilter("stop", stopConfig1)
      .build();
  assertTrue(stopConfig1.isEmpty());
  assertAnalyzesTo(a, "foo Foo Bar", new String[0]);

  a = CustomAnalyzer.builder()
      .withTokenizer(WhitespaceTokenizerFactory.class)
      .addTokenFilter(StopFilterFactory.class, stopConfig2)
      .build();
  assertTrue(stopConfig2.isEmpty());
  assertAnalyzesTo(a, "foo Foo Bar", new String[0]);

  // try with unmodifiableMap, should fail
  expectThrows(UnsupportedOperationException.class, () -> {
    CustomAnalyzer.builder()
        .withTokenizer("whitespace")
        .addTokenFilter("stop", stopConfigImmutable)
        .build();
  });
  a.close();
}
Example #5
Source File: TestCustomAnalyzer.java, from lucene-solr (Apache License 2.0)
public void testNormalizationWithMultipleTokenFilters() throws IOException {
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      // both token filters are multi-term aware, so normalize() applies them
      .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap())
      .addTokenFilter(LowerCaseFilterFactory.class, Collections.emptyMap())
      .addTokenFilter(ASCIIFoldingFilterFactory.class, Collections.emptyMap())
      .build();
  assertEquals(new BytesRef("a b e"), analyzer.normalize("dummy", "À B é"));
}
Example #6
Source File: TestCustomAnalyzer.java, from lucene-solr (Apache License 2.0)
public void testNormalizationWithMultipleCharFilters() throws IOException {
  CustomAnalyzer analyzer = CustomAnalyzer.builder()
      // both char filters are multi-term aware, so normalize() applies them in order
      .withTokenizer(WhitespaceTokenizerFactory.class, Collections.emptyMap())
      .addCharFilter(MappingCharFilterFactory.class,
          new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping1.txt")))
      .addCharFilter(MappingCharFilterFactory.class,
          new HashMap<>(Collections.singletonMap("mapping", "org/apache/lucene/analysis/custom/mapping2.txt")))
      .build();
  assertEquals(new BytesRef("e f c"), analyzer.normalize("dummy", "a b c"));
}
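MappingCharFilterFactory reads rewrite rules of the form "source" => "target", one per line. The actual mapping files are not reproduced here; hypothetical contents consistent with the assertion (mapping1.txt runs first, and "a b c" normalizes to "e f c") could be:

    # mapping1.txt (hypothetical)
    "a" => "e"

    # mapping2.txt (hypothetical)
    "b" => "f"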
Example #7
Source File: TestAbstractAnalysisFactory.java, from lucene-solr (Apache License 2.0)
public void testLookupTokenizerSPIName() throws NoSuchFieldException, IllegalAccessException {
  assertEquals("whitespace", AnalysisSPILoader.lookupSPIName(WhitespaceTokenizerFactory.class));
  assertEquals("whitespace", TokenizerFactory.findSPIName(WhitespaceTokenizerFactory.class));
}
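The SPI name resolved above is the same string accepted by the name-based lookups in Examples #2, #8, and #9. A short sketch of listing and resolving SPI names (the class name ListTokenizerSPINames is hypothetical; it assumes Lucene 8.x, where TokenizerFactory lives in org.apache.lucene.analysis.util — in 9.x it moved to org.apache.lucene.analysis — and where a missing luceneMatchVersion argument defaults to Version.LATEST; older versions may require it, as the tests' versionArgOnly() helper supplies):

    import java.util.HashMap;

    import org.apache.lucene.analysis.util.TokenizerFactory;

    public class ListTokenizerSPINames {
      public static void main(String[] args) {
        // every tokenizer registered through Java's service-provider mechanism, by lookup name
        for (String name : TokenizerFactory.availableTokenizers()) {
          System.out.println(name); // includes "whitespace"
        }
        // the same name instantiates the factory
        TokenizerFactory factory = TokenizerFactory.forName("whitespace", new HashMap<>());
        System.out.println(factory.getClass().getSimpleName()); // WhitespaceTokenizerFactory
      }
    }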
Example #8
Source File: TestAnalysisSPILoader.java, from lucene-solr (Apache License 2.0)
public void testLookupTokenizer() {
  assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.forName("Whitespace", versionArgOnly()).getClass());
  assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.forName("WHITESPACE", versionArgOnly()).getClass());
  assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.forName("whitespace", versionArgOnly()).getClass());
}
Example #9
Source File: TestAnalysisSPILoader.java, from lucene-solr (Apache License 2.0)
public void testLookupTokenizerClass() {
  assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass("Whitespace"));
  assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass("WHITESPACE"));
  assertSame(WhitespaceTokenizerFactory.class, TokenizerFactory.lookupClass("whitespace"));
}
Example #10
Source File: ReSearcherUtils.java, from solr-researcher (Apache License 2.0)
/**
 * Separates tokens from a query string. Treats each quote as a separate token,
 * since that makes it easier to examine the query.
 *
 * @param queryString the raw query string to tokenize
 * @param tokens      output list that receives the extracted tokens
 * @return number of quotes in the query
 */
public static int tokenizeQueryString(String queryString, List<String> tokens) {
  int countOfQuotes = 0;
  try {
    // first tokenize words and treat each quote as a separate token
    Map<String, String> args = new HashMap<String, String>();
    args.put(WhitespaceTokenizerFactory.LUCENE_MATCH_VERSION_PARAM, Version.LUCENE_6_3_0.toString());
    WhitespaceTokenizerFactory f = new WhitespaceTokenizerFactory(args);
    WhitespaceTokenizer s = (WhitespaceTokenizer) f.create(AttributeFactory.DEFAULT_ATTRIBUTE_FACTORY);
    s.setReader(new StringReader(queryString));
    s.reset();
    CharTermAttribute t = s.getAttribute(CharTermAttribute.class);
    // attributes are only valid after incrementToken(), so advance the stream first
    while (s.incrementToken()) {
      String tokenText = t.toString();
      if (tokenText.equals("\"")) {
        tokens.add("\"");
        countOfQuotes++;
      } else if (tokenText.startsWith("\"")) {
        tokens.add("\"");
        countOfQuotes++;
        if (tokenText.endsWith("\"")) {
          tokens.add(tokenText.substring(1, tokenText.length() - 1));
          tokens.add("\"");
          countOfQuotes++;
        } else {
          tokens.add(tokenText.substring(1));
        }
      } else if (tokenText.endsWith("\"")) {
        tokens.add(tokenText.substring(0, tokenText.length() - 1));
        tokens.add("\"");
        countOfQuotes++;
      } else if (!tokenText.trim().equals("")) {
        // take into account only if different than empty string
        tokens.add(tokenText);
      }
    }
    s.end();
    s.close();
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
  return countOfQuotes;
}
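A brief usage sketch with a hypothetical query (the demo class name TokenizeDemo is made up), showing how quotes become standalone tokens and how the quote count is returned:

    import java.util.ArrayList;
    import java.util.List;

    public class TokenizeDemo {
      public static void main(String[] args) {
        List<String> tokens = new ArrayList<>();
        // a phrase query followed by a bare term
        int quotes = ReSearcherUtils.tokenizeQueryString("\"foo bar\" baz", tokens);
        // tokens: ["\"", "foo", "bar", "\"", "baz"]; quotes == 2
        System.out.println(tokens + " quotes=" + quotes);
      }
    }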