Java Code Examples for org.apache.lucene.analysis.MockTokenizer#setEnableChecks()
The following examples show how to use org.apache.lucene.analysis.MockTokenizer#setEnableChecks().
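MockTokenizer's consumer-workflow checks assert that a test drives the token stream through the full reset()/incrementToken()/end()/close() lifecycle and consumes every token; the examples below call setEnableChecks(false) whenever that contract is deliberately broken, for example when a limiting filter stops early or the stream is reset or closed in an unusual place. A minimal sketch of that pattern (the input text and the LimitTokenCountFilter choice are illustrative, not taken from any particular example below):

// Illustrative sketch: disable the workflow checks because the consumer
// below intentionally reads only the first token of the stream.
MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
tokenizer.setReader(new StringReader("one two three"));
tokenizer.setEnableChecks(false); // with checks on, MockTokenizer would assert full consumption
TokenStream stream = new LimitTokenCountFilter(tokenizer, 1, false); // consumeAllTokens = false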
Example 1
Source File: TestIDVersionPostingsFormat.java From lucene-solr with Apache License 2.0
public void testMissingPayload() throws Exception {
  Directory dir = newDirectory();

  // MockAnalyzer minus maybePayload else it sometimes stuffs in an 8-byte payload!
  Analyzer a = new Analyzer() {
      @Override
      public TokenStreamComponents createComponents(String fieldName) {
        MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, true, 100);
        tokenizer.setEnableChecks(true);
        MockTokenFilter filt = new MockTokenFilter(tokenizer, MockTokenFilter.EMPTY_STOPSET);
        return new TokenStreamComponents(tokenizer, filt);
      }
    };
  IndexWriterConfig iwc = newIndexWriterConfig(a);
  iwc.setCodec(TestUtil.alwaysPostingsFormat(new IDVersionPostingsFormat()));
  RandomIndexWriter w = new RandomIndexWriter(random(), dir, iwc, false);
  Document doc = new Document();
  doc.add(newTextField("id", "id", Field.Store.NO));
  expectThrows(IllegalArgumentException.class, () -> {
    w.addDocument(doc);
    w.commit(false);
  });

  w.close();
  dir.close();
}
Example 2
Source File: TestCompoundWordTokenFilter.java From lucene-solr with Apache License 2.0
public void testReset() throws Exception {
  CharArraySet dict = makeDictionary("Rind", "Fleisch", "Draht", "Schere", "Gesetz",
      "Aufgabe", "Überwachung");

  MockTokenizer wsTokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
  wsTokenizer.setEnableChecks(false); // we will reset in a strange place
  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  DictionaryCompoundWordTokenFilter tf = new DictionaryCompoundWordTokenFilter(
      wsTokenizer, dict,
      CompoundWordTokenFilterBase.DEFAULT_MIN_WORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MIN_SUBWORD_SIZE,
      CompoundWordTokenFilterBase.DEFAULT_MAX_SUBWORD_SIZE, false);

  CharTermAttribute termAtt = tf.getAttribute(CharTermAttribute.class);
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
  assertTrue(tf.incrementToken());
  assertEquals("Rind", termAtt.toString());
  tf.end();
  tf.close();

  wsTokenizer.setReader(new StringReader("Rindfleischüberwachungsgesetz"));
  tf.reset();
  assertTrue(tf.incrementToken());
  assertEquals("Rindfleischüberwachungsgesetz", termAtt.toString());
}
Example 3
Source File: TestLimitTokenPositionFilter.java From lucene-solr with Apache License 2.0
public void testMaxPosition3WithSynomyms() throws IOException {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("one two three four five");
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);

    SynonymMap.Builder builder = new SynonymMap.Builder(true);
    builder.add(new CharsRef("one"), new CharsRef("first"), true);
    builder.add(new CharsRef("one"), new CharsRef("alpha"), true);
    builder.add(new CharsRef("one"), new CharsRef("beguine"), true);
    CharsRefBuilder multiWordCharsRef = new CharsRefBuilder();
    SynonymMap.Builder.join(new String[]{"and", "indubitably", "single", "only"}, multiWordCharsRef);
    builder.add(new CharsRef("one"), multiWordCharsRef.get(), true);
    SynonymMap.Builder.join(new String[]{"dopple", "ganger"}, multiWordCharsRef);
    builder.add(new CharsRef("two"), multiWordCharsRef.get(), true);
    SynonymMap synonymMap = builder.build();

    @SuppressWarnings("deprecation")
    TokenStream stream = new SynonymFilter(tokenizer, synonymMap, true);
    stream = new LimitTokenPositionFilter(stream, 3, consumeAll);

    // "only", the 4th word of multi-word synonym "and indubitably single only" is not emitted, since its position is greater than 3.
    assertTokenStreamContents(stream,
        new String[]{"one", "first", "alpha", "beguine", "and", "two", "indubitably", "dopple", "three", "single", "ganger"},
        new int[]{1, 0, 0, 0, 0, 1, 0, 0, 1, 0, 0});
  }
}
Example 4
Source File: TestLimitTokenPositionFilterFactory.java From lucene-solr with Apache License 2.0
public void testMaxPosition1() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
    MockTokenizer tokenizer = whitespaceMockTokenizer(reader);
    // if we are consuming all tokens, we can use the checks, otherwise we can't
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("LimitTokenPosition",
        LimitTokenPositionFilterFactory.MAX_TOKEN_POSITION_KEY, "1",
        LimitTokenPositionFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1"});
  }
}
Example 5
Source File: TestFingerprintFilter.java From lucene-solr with Apache License 2.0
public void testSingleToken() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("A1");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer);
    assertTokenStreamContents(stream, new String[] { "A1" });
  }
}
Example 6
Source File: TestFingerprintFilter.java From lucene-solr with Apache License 2.0
public void testCustomSeparator() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("B2 A1 C3 B2");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer,
        FingerprintFilter.DEFAULT_MAX_OUTPUT_TOKEN_SIZE, '_');
    assertTokenStreamContents(stream, new String[] { "A1_B2_C3" });
  }
}
Example 7
Source File: TestFingerprintFilter.java From lucene-solr with Apache License 2.0
public void testMaxFingerprintSize() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("B2 A1 C3 D4 E5 F6 G7 H1");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer, 4, ' ');
    assertTokenStreamContents(stream, new String[] {});
  }
}
Example 8
Source File: TestFingerprintFilter.java From lucene-solr with Apache License 2.0
public void testAllDupValues() throws Exception {
  for (final boolean consumeAll : new boolean[] { true, false }) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("B2 B2");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new FingerprintFilter(tokenizer);
    assertTokenStreamContents(stream, new String[] { "B2" });
  }
}
Example 9
Source File: TestLimitTokenOffsetFilterFactory.java From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("LimitTokenOffset",
        LimitTokenOffsetFilterFactory.MAX_START_OFFSET, "3",
        LimitTokenOffsetFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1", "B2"});
  }
}
Example 10
Source File: TestIndexWriterExceptions.java From lucene-solr with Apache License 2.0
public void testDocumentsWriterExceptionFailOneDoc() throws Exception {
  Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setEnableChecks(false); // disable workflow checking as we forcefully close() in exceptional cases.
      return new TokenStreamComponents(tokenizer, new CrashingFilter(fieldName, tokenizer));
    }
  };

  for (int i = 0; i < 10; i++) {
    try (Directory dir = newDirectory();
         final IndexWriter writer = new IndexWriter(dir,
             newIndexWriterConfig(analyzer)
                 .setMaxBufferedDocs(-1)
                 .setRAMBufferSizeMB(random().nextBoolean() ? 0.00001 : Integer.MAX_VALUE)
                 .setMergePolicy(new FilterMergePolicy(NoMergePolicy.INSTANCE) {
                   @Override
                   public boolean keepFullyDeletedSegment(IOSupplier<CodecReader> readerIOSupplier) {
                     return true;
                   }
                 }))) {
      Document doc = new Document();
      doc.add(newField("contents", "here are some contents", DocCopyIterator.custom5));
      writer.addDocument(doc);
      doc.add(newField("crash", "this should crash after 4 terms", DocCopyIterator.custom5));
      doc.add(newField("other", "this will not get indexed", DocCopyIterator.custom5));
      expectThrows(IOException.class, () -> {
        writer.addDocument(doc);
      });
      writer.commit();
      try (IndexReader reader = DirectoryReader.open(dir)) {
        assertEquals(2, reader.docFreq(new Term("contents", "here")));
        assertEquals(2, reader.maxDoc());
        assertEquals(1, reader.numDocs());
      }
    }
  }
}
Example 11
Source File: TestIndexWriterExceptions.java From lucene-solr with Apache License 2.0
public void testExceptionJustBeforeFlush() throws IOException {
  Directory dir = newDirectory();
  final AtomicBoolean doCrash = new AtomicBoolean();
  Analyzer analyzer = new Analyzer(Analyzer.PER_FIELD_REUSE_STRATEGY) {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      tokenizer.setEnableChecks(false); // disable workflow checking as we forcefully close() in exceptional cases.
      TokenStream stream = tokenizer;
      if (doCrash.get()) {
        stream = new CrashingFilter(fieldName, stream);
      }
      return new TokenStreamComponents(tokenizer, stream);
    }
  };

  IndexWriter w = RandomIndexWriter.mockIndexWriter(random(), dir,
      newIndexWriterConfig(analyzer)
          .setMaxBufferedDocs(2),
      new TestPoint1());
  Document doc = new Document();
  doc.add(newTextField("field", "a field", Field.Store.YES));
  w.addDocument(doc);

  Document crashDoc = new Document();
  crashDoc.add(newTextField("crash", "do it on token 4", Field.Store.YES));
  doCrash.set(true);
  expectThrows(IOException.class, () -> {
    w.addDocument(crashDoc);
  });

  w.addDocument(doc);
  w.close();
  dir.close();
}
Example 12
Source File: FuzzySuggesterTest.java From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false,
      MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  tokenizer.setEnableChecks(true);
  TokenStream next;
  if (numStopChars != 0) {
    next = new TokenEater(preserveHoles, tokenizer, numStopChars);
  } else {
    next = tokenizer;
  }
  return new TokenStreamComponents(tokenizer, next);
}
Example 13
Source File: TestLimitTokenCountFilterFactory.java From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 C3 D4 E5 F6");
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("LimitTokenCount",
        LimitTokenCountFilterFactory.MAX_TOKEN_COUNT_KEY, "3",
        LimitTokenCountFilterFactory.CONSUME_ALL_TOKENS_KEY, Boolean.toString(consumeAll)
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
  }
}
Example 14
Source File: TestFingerprintFilterFactory.java From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    Reader reader = new StringReader("A1 B2 A1 D4 C3");
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("Fingerprint",
        FingerprintFilterFactory.MAX_OUTPUT_TOKEN_SIZE_KEY, "256",
        FingerprintFilterFactory.SEPARATOR_KEY, "_"
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{"A1_B2_C3_D4"});
  }
}
Example 15
Source File: TestLimitTokenCountFilter.java From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    MockTokenizer tokenizer = whitespaceMockTokenizer("A1 B2 C3 D4 E5 F6");
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = new LimitTokenCountFilter(tokenizer, 3, consumeAll);
    assertTokenStreamContents(stream, new String[]{"A1", "B2", "C3"});
  }
}
Example 16
Source File: TestHunspellStemFilter.java From lucene-solr with Apache License 2.0
/** simple test for longestOnly option */
public void testLongestOnly() throws IOException {
  MockTokenizer tokenizer = whitespaceMockTokenizer("lucene is awesome");
  tokenizer.setEnableChecks(true);
  HunspellStemFilter filter = new HunspellStemFilter(tokenizer, dictionary, true, true);
  assertTokenStreamContents(filter, new String[]{"lucene", "is", "awesome"}, new int[] {1, 1, 1});
}
Example 17
Source File: TestConcatenateGraphFilterFactory.java From lucene-solr with Apache License 2.0
public void test() throws Exception {
  for (final boolean consumeAll : new boolean[]{true, false}) {
    final String input = "A1 B2 A1 D4 C3";
    Reader reader = new StringReader(input);
    MockTokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
    tokenizer.setReader(reader);
    tokenizer.setEnableChecks(consumeAll);
    TokenStream stream = tokenizer;
    stream = tokenFilterFactory("ConcatenateGraph",
        "tokenSeparator", "\u001F"
    ).create(stream);
    assertTokenStreamContents(stream, new String[]{input.replace(' ', (char) ConcatenateGraphFilter.SEP_LABEL)});
  }
}
Example 18
Source File: MinHashFilterTest.java From lucene-solr with Apache License 2.0
private static Tokenizer createMockShingleTokenizer(int shingleSize, String shingles) {
  MockTokenizer tokenizer = new MockTokenizer(
      new CharacterRunAutomaton(
          new RegExp("[^ \t\r\n]+([ \t\r\n]+[^ \t\r\n]+){" + (shingleSize - 1) + "}").toAutomaton()),
      true);
  tokenizer.setEnableChecks(true);
  if (shingles != null) {
    tokenizer.setReader(new StringReader(shingles));
  }
  return tokenizer;
}
Example 19
Source File: AnalyzingSuggesterTest.java From lucene-solr with Apache License 2.0
@Override
public TokenStreamComponents createComponents(String fieldName) {
  MockTokenizer tokenizer = new MockTokenizer(MockUTF16TermAttributeImpl.UTF16_TERM_ATTRIBUTE_FACTORY,
      MockTokenizer.WHITESPACE, false, MockTokenizer.DEFAULT_MAX_TOKEN_LENGTH);
  tokenizer.setEnableChecks(true);
  TokenStream next;
  if (numStopChars != 0) {
    next = new TokenEater(preserveHoles, tokenizer, numStopChars);
  } else {
    next = tokenizer;
  }
  return new TokenStreamComponents(tokenizer, next);
}
Example 20
Source File: MockTokenizerFactory.java From lucene-solr with Apache License 2.0
@Override
public MockTokenizer create(AttributeFactory factory) {
  MockTokenizer t = new MockTokenizer(factory, pattern, false);
  t.setEnableChecks(enableChecks);
  return t;
}