org.apache.lucene.analysis.Token Java Examples
The following examples show how to use
org.apache.lucene.analysis.Token.
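A Token bundles a term's text, character offsets, position increment, position length, and optional payload into a single attribute implementation, which is why the tests below can feed Token arrays straight into a CannedTokenStream. As a minimal sketch of the core setters (assuming Lucene 8.x, which matches the lucene-solr examples below):

import org.apache.lucene.analysis.Token;
import org.apache.lucene.util.BytesRef;

// "wizard" covering characters 0-6 of the original text
Token t = new Token("wizard", 0, 6);
t.setPositionIncrement(1);          // 1 = advances one position; 0 = stacked on the previous token (e.g. a synonym)
t.setPositionLength(1);             // number of positions the token spans in the token graph
t.setPayload(new BytesRef("meta")); // arbitrary per-token bytes; "meta" is a hypothetical payload

The examples also use Token's convenience constructors that take the position increment and position length inline; see the note after Example #3.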
Example #1
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testAlreadyFlatten() throws Exception {
  TokenStream in = new CannedTokenStream(0, 12, new Token[] {
      token("wtf", 1, 1, 0, 3),
      token("what", 0, 1, 0, 3),
      token("wow", 0, 1, 0, 3),
      token("the", 1, 1, 0, 3),
      token("that's", 0, 1, 0, 3),
      token("fudge", 1, 1, 0, 3),
      token("funny", 0, 1, 0, 3),
      token("happened", 1, 1, 4, 12)
    });

  TokenStream out = new FlattenGraphFilter(in);

  // ... but on output, it's flattened to wtf/what/wow that's/the fudge/funny happened:
  assertTokenStreamContents(out,
      new String[] {"wtf", "what", "wow", "the", "that's", "fudge", "funny", "happened"},
      new int[] {0, 0, 0, 0, 0, 0, 0, 4},
      new int[] {3, 3, 3, 3, 3, 3, 3, 12},
      new int[] {1, 0, 0, 1, 0, 1, 0, 1},
      new int[] {1, 1, 1, 1, 1, 1, 1, 1},
      12);
}
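The token(...) calls above go through a private helper that the excerpt omits. From the call sites its arguments are (term, positionIncrement, positionLength, startOffset, endOffset), so a plausible reconstruction is:

// Reconstructed from the call sites; the actual helper in
// TestFlattenGraphFilter.java may differ in minor details.
private static Token token(String term, int posInc, int posLength, int startOffset, int endOffset) {
  final Token t = new Token(term, startOffset, endOffset);
  t.setPositionIncrement(posInc);
  t.setPositionLength(posLength);
  return t;
}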
Example #2
Source File: IndexHelper.java From document-management-system with GNU General Public License v2.0
@SuppressWarnings("unused")
SetDictionary(String words, Analyzer analyzer) throws IOException {
  wordSet = new HashSet<String>();
  if (words != null) {
    TokenStream tokenStream = analyzer.tokenStream(NodeDocument.TEXT_FIELD, new StringReader(words));
    Token reusableToken = new Token();
    Token nextToken = null;
    // The loop below targets the pre-Lucene-3.0 TokenStream.next(Token) API,
    // which no longer exists; see the attribute-based rewrite after this example.
    //while ((nextToken = tokenStream.next(reusableToken)) != null) {
    //  String term = nextToken.term();
    //  if (term != null) {
    //    wordSet.add(term);
    //  }
    //}
  }
}
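A sketch of the modern attribute-based equivalent of the commented-out loop, assuming the same wordSet field and surrounding class (CharTermAttribute is org.apache.lucene.analysis.tokenattributes.CharTermAttribute):

TokenStream tokenStream = analyzer.tokenStream(NodeDocument.TEXT_FIELD, new StringReader(words));
CharTermAttribute termAtt = tokenStream.addAttribute(CharTermAttribute.class);
tokenStream.reset();
while (tokenStream.incrementToken()) {
  wordSet.add(termAtt.toString());  // term text now comes from the attribute, not a Token object
}
tokenStream.end();
tokenStream.close();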
Example #3
Source File: FixedShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testTrailingGraphsOfDifferingLengths() throws IOException {
  // a b:3/c d e f
  TokenStream ts = new CannedTokenStream(
      new Token("a", 0, 1),
      new Token("b", 1, 2, 3, 3),
      new Token("c", 0, 2, 3),
      new Token("d", 2, 3),
      new Token("e", 2, 3),
      new Token("f", 4, 5)
  );
  assertTokenStreamContents(new FixedShingleFilter(ts, 3),
      new String[]{ "a b f", "a c d", "c d e", "d e f" });
}
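The shingle and suggester tests lean on Token's convenience constructors. Judging from the call sites and the a b:3/c d e f comment above (b spans three positions), the overloads appear to be the following; check Token's javadoc for the authoritative signatures:

Token plain    = new Token("d", 2, 3);        // text, startOffset, endOffset
Token stacked  = new Token("c", 0, 2, 3);     // text, posInc, startOffset, endOffset (posInc 0 = stacked on previous position)
Token spanning = new Token("b", 1, 2, 3, 3);  // text, posInc, startOffset, endOffset, posLength ("b:3" spans three positions)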
Example #4
Source File: TestField.java From lucene-solr with Apache License 2.0
public void testTextFieldString() throws Exception {
  Field fields[] = new Field[] {
      new TextField("foo", "bar", Field.Store.NO),
      new TextField("foo", "bar", Field.Store.YES)
  };

  for (Field field : fields) {
    trySetByteValue(field);
    trySetBytesValue(field);
    trySetBytesRefValue(field);
    trySetDoubleValue(field);
    trySetIntValue(field);
    trySetFloatValue(field);
    trySetLongValue(field);
    trySetReaderValue(field);
    trySetShortValue(field);
    field.setStringValue("baz");
    field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));

    assertEquals("baz", field.stringValue());
  }
}
Example #5
Source File: TestField.java From lucene-solr with Apache License 2.0
public void testTextFieldReader() throws Exception {
  Field field = new TextField("foo", new StringReader("bar"));

  trySetByteValue(field);
  trySetBytesValue(field);
  trySetBytesRefValue(field);
  trySetDoubleValue(field);
  trySetIntValue(field);
  trySetFloatValue(field);
  trySetLongValue(field);
  field.setReaderValue(new StringReader("foobar"));
  trySetShortValue(field);
  trySetStringValue(field);
  field.setTokenStream(new CannedTokenStream(new Token("foo", 0, 3)));

  assertNotNull(field.readerValue());
}
Example #6
Source File: AnalyzingSuggesterTest.java From lucene-solr with Apache License 2.0
public void testTooManyExpansions() throws Exception {
  final Analyzer a = new Analyzer() {
      @Override
      protected TokenStreamComponents createComponents(String fieldName) {
        return new TokenStreamComponents(r -> {},
            new CannedTokenStream(new Token("a", 0, 1), new Token("b", 0, 0, 1)));
      }
    };

  Directory tempDir = getDirectory();
  AnalyzingSuggester suggester = new AnalyzingSuggester(tempDir, "suggest", a, a, 0, 256, 1, true);
  suggester.build(new InputArrayIterator(new Input[] {new Input("a", 1)}));
  assertEquals("[a/1]", suggester.lookup("a", false, 1).toString());
  IOUtils.close(a, tempDir);
}
Example #7
Source File: TestTermAutomatonQuery.java From lucene-solr with Apache License 2.0
public void testTermDoesNotExist() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newTextField("field", "x y z", Field.Store.NO));
  w.addDocument(doc);

  IndexReader r = w.getReader();
  IndexSearcher s = newSearcher(r);

  TokenStream ts = new CannedTokenStream(new Token[] {
      token("a", 1, 1),
    });

  TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts);
  // System.out.println("DOT: " + q.toDot());
  assertEquals(0, s.search(q, 1).totalHits.value);

  w.close();
  r.close();
  dir.close();
}
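Here token(...) is again a private test helper not shown in the excerpt. Given its (term, posInc, posLength) arguments, and that the automaton query consumes positions rather than offsets, a likely shape is:

// Reconstructed; offsets are synthesized from the term length since
// TermAutomatonQuery only cares about positions.
private static Token token(String term, int posInc, int posLength) {
  final Token t = new Token(term, 0, term.length());
  t.setPositionIncrement(posInc);
  t.setPositionLength(posLength);
  return t;
}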
Example #8
Source File: TestTermAutomatonQuery.java From lucene-solr with Apache License 2.0
public void testOneTermDoesNotExist() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newTextField("field", "x y z", Field.Store.NO));
  w.addDocument(doc);

  IndexReader r = w.getReader();
  IndexSearcher s = newSearcher(r);

  TokenStream ts = new CannedTokenStream(new Token[] {
      token("a", 1, 1),
      token("x", 1, 1),
    });

  TermAutomatonQuery q = new TokenStreamToTermAutomatonQuery().toQuery("field", ts);
  // System.out.println("DOT: " + q.toDot());
  assertEquals(0, s.search(q, 1).totalHits.value);

  IOUtils.close(w, r, dir);
}
Example #9
Source File: TestPostingsOffsets.java From lucene-solr with Apache License 2.0
private void checkTokens(Token[] field1, Token[] field2) throws IOException {
  Directory dir = newDirectory();
  // iwc is an IndexWriterConfig instance field of the test class,
  // presumably initialized in setUp()
  RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
  boolean success = false;
  try {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);

    Document doc = new Document();
    doc.add(new Field("body", new CannedTokenStream(field1), ft));
    doc.add(new Field("body", new CannedTokenStream(field2), ft));
    riw.addDocument(doc);
    riw.close();
    success = true;
  } finally {
    if (success) {
      IOUtils.close(dir);
    } else {
      IOUtils.closeWhileHandlingException(riw, dir);
    }
  }
}
Example #10
Source File: FixedShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testIncomingGraphs() throws IOException {
  // b/a c b/a d
  TokenStream ts = new CannedTokenStream(
      new Token("b", 0, 1),
      new Token("a", 0, 0, 1),
      new Token("c", 2, 3),
      new Token("b", 4, 5),
      new Token("a", 0, 4, 5),
      new Token("d", 6, 7)
  );
  assertTokenStreamContents(new FixedShingleFilter(ts, 2),
      new String[] { "b c", "a c", "c b", "c a", "b d", "a d" },
      new int[] { 0, 0, 2, 2, 4, 4 },
      new int[] { 3, 3, 5, 5, 7, 7 },
      new int[] { 1, 0, 1, 0, 1, 0 });
}
Example #11
Source File: TestPostingsOffsets.java From lucene-solr with Apache License 2.0
private void checkTokens(Token[] tokens) throws IOException {
  Directory dir = newDirectory();
  // iwc is the same IndexWriterConfig instance field as in Example #9
  RandomIndexWriter riw = new RandomIndexWriter(random(), dir, iwc);
  boolean success = false;
  try {
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
    // store some term vectors for the checkindex cross-check
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorPositions(true);
    ft.setStoreTermVectorOffsets(true);

    Document doc = new Document();
    doc.add(new Field("body", new CannedTokenStream(tokens), ft));
    riw.addDocument(doc);
    riw.close();
    success = true;
  } finally {
    if (success) {
      IOUtils.close(dir);
    } else {
      IOUtils.closeWhileHandlingException(riw, dir);
    }
  }
}
Example #12
Source File: FixedShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testWithStopwords() throws IOException {
  TokenStream ts = new CannedTokenStream(
      new Token("please", 0, 6),
      new Token("divide", 7, 13),
      new Token("sentence", 2, 19, 27),
      new Token("shingles", 2, 33, 41)
  );
  assertTokenStreamContents(new FixedShingleFilter(ts, 3),
      new String[]{ "please divide _", "divide _ sentence", "sentence _ shingles" },
      new int[]{ 0, 7, 19 },
      new int[]{ 13, 27, 41 },
      new String[]{ "shingle", "shingle", "shingle" },
      new int[]{ 1, 1, 2 });
}
Example #13
Source File: TestFieldInvertState.java From lucene-solr with Apache License 2.0
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  IndexWriterConfig iwc = new IndexWriterConfig(new MockAnalyzer(random()));
  iwc.setSimilarity(NeverForgetsSimilarity.INSTANCE);
  IndexWriter w = new IndexWriter(dir, iwc);
  Document doc = new Document();
  Field field = new Field("field",
                          new CannedTokenStream(new Token("a", 0, 1),
                                                new Token("b", 2, 3),
                                                new Token("c", 4, 5)),
                          TextField.TYPE_NOT_STORED);
  doc.add(field);
  w.addDocument(doc);
  FieldInvertState fis = NeverForgetsSimilarity.INSTANCE.lastState;
  assertEquals(1, fis.getMaxTermFrequency());
  assertEquals(3, fis.getUniqueTermCount());
  assertEquals(0, fis.getNumOverlap());
  assertEquals(3, fis.getLength());
  IOUtils.close(w, dir);
}
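NeverForgetsSimilarity is a test-only helper not shown in the excerpt: it records the FieldInvertState handed to computeNorm so the assertions can inspect it. A minimal sketch of what it presumably looks like, against the Lucene 8.x Similarity API:

import org.apache.lucene.index.FieldInvertState;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.TermStatistics;
import org.apache.lucene.search.similarities.Similarity;

private static class NeverForgetsSimilarity extends Similarity {
  static final NeverForgetsSimilarity INSTANCE = new NeverForgetsSimilarity();
  FieldInvertState lastState;

  @Override
  public long computeNorm(FieldInvertState state) {
    this.lastState = state;  // remember the state for the test to inspect
    return 1;
  }

  @Override
  public SimScorer scorer(float boost, CollectionStatistics collectionStats,
                          TermStatistics... termStats) {
    throw new UnsupportedOperationException();  // never scored in this test
  }
}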
Example #14
Source File: FixedShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testShinglesSpanningGraphs() throws IOException {
  TokenStream ts = new CannedTokenStream(
      new Token("b", 0, 1),
      new Token("a", 0, 0, 1),
      new Token("c", 2, 3),
      new Token("b", 4, 5),
      new Token("a", 0, 4, 5),
      new Token("d", 6, 7)
  );
  assertTokenStreamContents(new FixedShingleFilter(ts, 3),
      new String[] { "b c b", "b c a", "a c b", "a c a", "c b d", "c a d" },
      new int[] { 0, 0, 0, 0, 2, 2 },
      new int[] { 5, 5, 5, 5, 7, 7 },
      new int[] { 1, 0, 0, 0, 1, 0 });
}
Example #15
Source File: TestConcatenatingTokenStream.java From lucene-solr with Apache License 2.0
public void testOffsetGaps() throws IOException {
  // Each stream declares a final position increment of 2 and a final offset of 10
  CannedTokenStream cts1 = new CannedTokenStream(2, 10, new Token("a", 0, 1), new Token("b", 2, 3));
  CannedTokenStream cts2 = new CannedTokenStream(2, 10, new Token("c", 0, 1), new Token("d", 2, 3));

  TokenStream ts = new ConcatenatingTokenStream(cts1, cts2);
  // "c" and "d" are shifted by cts1's final offset (10), and "c" also picks up
  // cts1's trailing position increment (1 + 2 = 3)
  assertTokenStreamContents(ts,
      new String[] { "a", "b", "c", "d" },
      new int[]{ 0, 2, 10, 12 },
      new int[]{ 1, 3, 11, 13 },
      null,
      new int[]{ 1, 1, 3, 1 },
      null, 20, 2, null, false, null);
}
Example #16
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testNonGraph() throws Exception {
  TokenStream in = new CannedTokenStream(0, 22, new Token[] {
      token("hello", 1, 1, 0, 5),
      token("pseudo", 1, 1, 6, 12),
      token("world", 1, 1, 13, 18),
      token("fun", 1, 1, 19, 22),
    });

  TokenStream out = new FlattenGraphFilter(in);

  // The input is not a graph, so it should pass through unchanged:
  assertTokenStreamContents(out,
      new String[] {"hello", "pseudo", "world", "fun"},
      new int[] {0, 6, 13, 19},
      new int[] {5, 12, 18, 22},
      new int[] {1, 1, 1, 1},
      new int[] {1, 1, 1, 1},
      22);
}
Example #17
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testSimpleHole() throws Exception {
  TokenStream in = new CannedTokenStream(0, 13, new Token[] {
      token("hello", 1, 1, 0, 5),
      token("hole", 2, 1, 6, 10),
      token("fun", 1, 1, 11, 13),
    });

  TokenStream out = new FlattenGraphFilter(in);

  // The hole (position increment 2 on "hole") must survive flattening:
  assertTokenStreamContents(out,
      new String[] {"hello", "hole", "fun"},
      new int[] {0, 6, 11},
      new int[] {5, 10, 13},
      new int[] {1, 2, 1},
      new int[] {1, 1, 1},
      13);
}
Example #18
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testHoleUnderSyn() throws Exception {
  // Tests a StopFilter after SynFilter where a stopword in a syn is removed
  //
  //   wizard of oz -> woz syn, but then "of" becomes a hole

  TokenStream in = new CannedTokenStream(0, 12, new Token[] {
      token("wizard", 1, 1, 0, 6),
      token("woz", 0, 3, 0, 12),
      token("oz", 2, 1, 10, 12),
    });

  TokenStream out = new FlattenGraphFilter(in);

  assertTokenStreamContents(out,
      new String[] {"wizard", "woz", "oz"},
      new int[] {0, 0, 10},
      new int[] {6, 12, 12},
      new int[] {1, 0, 2},
      new int[] {1, 3, 1},
      12);
}
Example #19
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testStrangelyNumberedNodes() throws Exception {
  // Uses only nodes 0, 2, 3, i.e. 1 is just never used (it is not a hole!!)
  TokenStream in = new CannedTokenStream(0, 27, new Token[] {
      token("dog", 1, 3, 0, 5),
      token("puppy", 0, 3, 0, 5),
      token("flies", 3, 1, 6, 11),
    });

  TokenStream out = new FlattenGraphFilter(in);

  assertTokenStreamContents(out,
      new String[] {"dog", "puppy", "flies"},
      new int[] {0, 0, 6},
      new int[] {5, 5, 11},
      new int[] {1, 0, 1},
      new int[] {1, 1, 1},
      27);
}
Example #20
Source File: TestProtectedTermFilter.java From lucene-solr with Apache License 2.0
public void testBasic() throws IOException {
  CannedTokenStream cts = new CannedTokenStream(
      new Token("Alice", 1, 0, 5),
      new Token("Bob", 1, 6, 9),
      new Token("Clara", 1, 10, 15),
      new Token("David", 1, 16, 21)
  );
  CharArraySet protectedTerms = new CharArraySet(5, true);
  protectedTerms.add("bob");
  TokenStream ts = new ProtectedTermFilter(protectedTerms, cts, LowerCaseFilter::new);
  assertTokenStreamContents(ts, new String[]{ "alice", "Bob", "clara", "david" });
}
Example #21
Source File: TestAsciiFoldingFilterFactory.java From lucene-solr with Apache License 2.0
public void testMultiTermAnalysis() throws IOException {
  TokenFilterFactory factory = new ASCIIFoldingFilterFactory(Collections.emptyMap());
  TokenStream stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });

  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.normalize(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });

  factory = new ASCIIFoldingFilterFactory(new HashMap<>(Collections.singletonMap("preserveOriginal", "true")));
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.create(stream);
  assertTokenStreamContents(stream, new String[] { "Ete", "Été" });

  // normalize() ignores preserveOriginal: only the folded form is emitted
  stream = new CannedTokenStream(new Token("Été", 0, 3));
  stream = factory.normalize(stream);
  assertTokenStreamContents(stream, new String[] { "Ete" });
}
Example #22
Source File: TestTrimFilter.java From lucene-solr with Apache License 2.0
public void testTrim() throws Exception {
  char[] a = " a ".toCharArray();
  char[] b = "b ".toCharArray();
  char[] ccc = "cCc".toCharArray();
  char[] whitespace = " ".toCharArray();
  char[] empty = "".toCharArray();

  TokenStream ts = new CannedTokenStream(
      new Token(new String(a, 0, a.length), 1, 5),
      new Token(new String(b, 0, b.length), 6, 10),
      new Token(new String(ccc, 0, ccc.length), 11, 15),
      new Token(new String(whitespace, 0, whitespace.length), 16, 20),
      new Token(new String(empty, 0, empty.length), 21, 21));
  ts = new TrimFilter(ts);

  assertTokenStreamContents(ts, new String[] { "a", "b", "cCc", "", "" });
}
Example #23
Source File: TestRemoveDuplicatesTokenFilter.java From lucene-solr with Apache License 2.0
public void testDups(final String expected, final Token... tokens) throws Exception {
  final Iterator<Token> toks = Arrays.asList(tokens).iterator();
  final TokenStream ts = new RemoveDuplicatesTokenFilter(
      (new TokenStream() {
          CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
          OffsetAttribute offsetAtt = addAttribute(OffsetAttribute.class);
          PositionIncrementAttribute posIncAtt = addAttribute(PositionIncrementAttribute.class);

          @Override
          public boolean incrementToken() {
            if (toks.hasNext()) {
              clearAttributes();
              Token tok = toks.next();
              termAtt.setEmpty().append(tok);
              offsetAtt.setOffset(tok.startOffset(), tok.endOffset());
              posIncAtt.setPositionIncrement(tok.getPositionIncrement());
              return true;
            } else {
              return false;
            }
          }
        }));

  assertTokenStreamContents(ts, expected.split("\\s"));
}
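A hypothetical call (not part of the excerpt) showing what testDups verifies: a second "B" stacked at the same position, i.e. with position increment 0, is a duplicate and gets dropped:

Token a  = new Token("A", 0, 1);
Token b1 = new Token("B", 2, 3);
Token b2 = new Token("B", 2, 3);
b2.setPositionIncrement(0);       // same term at the same position as b1 -> duplicate
Token c  = new Token("C", 4, 5);
testDups("A B C", a, b1, b2, c);  // the duplicate B is removed from the output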
Example #24
Source File: TestPostingsOffsets.java From lucene-solr with Apache License 2.0
public void testLegalbutVeryLargeOffsets() throws Exception {
  Directory dir = newDirectory();
  IndexWriter iw = new IndexWriter(dir, newIndexWriterConfig(null));
  Document doc = new Document();
  Token t1 = new Token("foo", 0, Integer.MAX_VALUE - 500);
  if (random().nextBoolean()) {
    t1.setPayload(new BytesRef("test"));
  }
  Token t2 = new Token("foo", Integer.MAX_VALUE - 500, Integer.MAX_VALUE);
  TokenStream tokenStream = new CannedTokenStream(new Token[] { t1, t2 });

  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setIndexOptions(IndexOptions.DOCS_AND_FREQS_AND_POSITIONS_AND_OFFSETS);
  // store some term vectors for the checkindex cross-check
  ft.setStoreTermVectors(true);
  ft.setStoreTermVectorPositions(true);
  ft.setStoreTermVectorOffsets(true);

  Field field = new Field("foo", tokenStream, ft);
  doc.add(field);
  iw.addDocument(doc);
  iw.close();
  dir.close();
}
Example #25
Source File: ShingleFilterTest.java From lucene-solr with Apache License 2.0
public void testTrailingHole2() throws IOException {
  // Analyzing "purple wizard of", where "of" is removed as a
  // stopword leaving a trailing hole:
  Token[] inputTokens = new Token[] { createToken("purple", 0, 6), createToken("wizard", 7, 13) };
  ShingleFilter filter = new ShingleFilter(new CannedTokenStream(1, 16, inputTokens), 2, 2);

  assertTokenStreamContents(filter,
      new String[] { "purple", "purple wizard", "wizard", "wizard _" },
      new int[] { 0, 0, 7, 7 },
      new int[] { 6, 13, 13, 16 },
      new int[] { 1, 0, 1, 0 },
      16);
}
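createToken(term, start, end) here is presumably a convenience overload of the four-argument helper shown in Example #29 below, defaulting the position increment to 1:

// Presumed overload; see Example #29 for the four-argument version.
private static Token createToken(String term, int start, int offset) {
  return createToken(term, start, offset, 1);
}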
Example #26
Source File: TestMultiPhraseQuery.java From lucene-solr with Apache License 2.0
/** PQ AND Mode - Manually creating a phrase query */
public void testZeroPosIncrSloppyPqAnd() throws IOException {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  int pos = -1;
  // INCR_0_QUERY_TOKENS_AND is a class field holding the query tokens,
  // some of which carry a zero position increment
  for (Token tap : INCR_0_QUERY_TOKENS_AND) {
    pos += tap.getPositionIncrement();
    builder.add(new Term("field", tap.toString()), pos);
  }
  builder.setSlop(0);
  doTestZeroPosIncrSloppy(builder.build(), 0);
  builder.setSlop(1);
  doTestZeroPosIncrSloppy(builder.build(), 0);
  builder.setSlop(2);
  doTestZeroPosIncrSloppy(builder.build(), 1);
}
Example #27
Source File: ShingleFilterTest.java From lucene-solr with Apache License 2.0
protected void shingleFilterTest(String tokenSeparator, int minSize, int maxSize,
                                 Token[] tokensToShingle, Token[] tokensToCompare,
                                 int[] positionIncrements, String[] types,
                                 boolean outputUnigrams) throws IOException {
  ShingleFilter filter = new ShingleFilter(new CannedTokenStream(tokensToShingle), minSize, maxSize);
  filter.setTokenSeparator(tokenSeparator);
  filter.setOutputUnigrams(outputUnigrams);
  shingleFilterTestCommon(filter, tokensToCompare, positionIncrements, types);
}
Example #28
Source File: LuceneTokenizer.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * @param string the text to tokenize
 * @return list of the tokens of string, as produced by StandardTokenizer
 * @throws IOException
 */
public static ArrayList<String> tokenize(String string) throws IOException {
  ArrayList<String> retList = new ArrayList<String>();
  StringReader reader = new StringReader(string);
  StandardTokenizer tokenizer = new StandardTokenizer();
  // Attach the reader and reset before iterating, and read terms through
  // CharTermAttribute rather than getAttribute(Token.class), which only works
  // for streams built with Token.TOKEN_ATTRIBUTE_FACTORY.
  tokenizer.setReader(reader);
  CharTermAttribute termAtt = tokenizer.addAttribute(CharTermAttribute.class);
  tokenizer.reset();
  while (tokenizer.incrementToken()) {
    retList.add(termAtt.toString());
  }
  tokenizer.end();
  tokenizer.close();
  reader.close();
  return retList;
}
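A quick usage sketch under the fixes above (StandardTokenizer splits on punctuation and whitespace but does not lowercase):

ArrayList<String> tokens = LuceneTokenizer.tokenize("Hello, Lucene world!");
// tokens -> [Hello, Lucene, world]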
Example #29
Source File: ShingleFilterTest.java From lucene-solr with Apache License 2.0
private static Token createToken(String term, int start, int offset, int positionIncrement) {
  Token token = new Token();
  token.setOffset(start, offset);  // "offset" here is the end offset
  token.copyBuffer(term.toCharArray(), 0, term.length());
  token.setPositionIncrement(positionIncrement);
  return token;
}
Example #30
Source File: TestFlattenGraphFilter.java From lucene-solr with Apache License 2.0
public void testNonGreedySynonyms() throws Exception {
  // This is just "hypothetical" for Lucene today, because SynFilter is
  // greedy: when two syn rules match on overlapping tokens, only one
  // (greedily) wins. This test pretends all syn matches could match:

  TokenStream in = new CannedTokenStream(0, 20, new Token[] {
      token("wizard", 1, 1, 0, 6),
      token("wizard_of_oz", 0, 3, 0, 12),
      token("of", 1, 1, 7, 9),
      token("oz", 1, 1, 10, 12),
      token("oz_screams", 0, 2, 10, 20),
      token("screams", 1, 1, 13, 20),
    });

  TokenStream out = new FlattenGraphFilter(in);

  // The overlapping synonyms stay stacked at their positions after flattening:
  assertTokenStreamContents(out,
      new String[] {"wizard", "wizard_of_oz", "of", "oz", "oz_screams", "screams"},
      new int[] {0, 0, 7, 10, 10, 13},
      new int[] {6, 12, 9, 12, 20, 20},
      new int[] {1, 0, 1, 1, 0, 1},
      new int[] {1, 3, 1, 1, 2, 1},
      20);
}