Java Code Examples for org.apache.lucene.util.AttributeSource#State
The following examples show how to use
org.apache.lucene.util.AttributeSource#State.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: TestTermAutomatonQuery.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override public boolean incrementToken() throws IOException { if (synNext) { AttributeSource.State state = captureState(); clearAttributes(); restoreState(state); posIncAtt.setPositionIncrement(0); termAtt.append(""+((char) 97 + random().nextInt(3))); synNext = false; return true; } if (input.incrementToken()) { if (random().nextInt(10) == 8) { synNext = true; } return true; } else { return false; } }
Example 2
Source File: TeeSinkTokenFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() { if (!it.hasNext()) { return false; } AttributeSource.State state = it.next(); restoreState(state); return true; }
Example 3
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() throws IOException { if (tokens != null && tokens.hasNext()){ AttributeSource.State state = tokens.next(); restoreState(state); return true; } clearAttributes(); int tokenType = scanner.getNextToken(); if (tokenType == WikipediaTokenizerImpl.YYEOF) { return false; } String type = WikipediaTokenizerImpl.TOKEN_TYPES[tokenType]; if (tokenOutput == TOKENS_ONLY || untokenizedTypes.contains(type) == false){ setupToken(); } else if (tokenOutput == UNTOKENIZED_ONLY && untokenizedTypes.contains(type) == true){ collapseTokens(tokenType); } else if (tokenOutput == BOTH){ //collapse into a single token, add it to tokens AND output the individual tokens //output the untokenized Token first collapseAndSaveTokens(tokenType, type); } int posinc = scanner.getPositionIncrement(); if (first && posinc == 0) { posinc = 1; // don't emit posinc=0 for the first token! } posIncrAtt.setPositionIncrement(posinc); typeAtt.setType(type); first = false; return true; }
Example 4
Source File: WikipediaTokenizer.java From lucene-solr with Apache License 2.0 | 5 votes |
private void collapseAndSaveTokens(int tokenType, String type) throws IOException { //collapse StringBuilder buffer = new StringBuilder(32); int numAdded = scanner.setText(buffer); //TODO: how to know how much whitespace to add int theStart = scanner.yychar(); int lastPos = theStart + numAdded; int tmpTokType; int numSeen = 0; List<AttributeSource.State> tmp = new ArrayList<>(); setupSavedToken(0, type); tmp.add(captureState()); //while we can get a token and that token is the same type and we have not transitioned to a new wiki-item of the same type while ((tmpTokType = scanner.getNextToken()) != WikipediaTokenizerImpl.YYEOF && tmpTokType == tokenType && scanner.getNumWikiTokensSeen() > numSeen){ int currPos = scanner.yychar(); //append whitespace for (int i = 0; i < (currPos - lastPos); i++){ buffer.append(' '); } numAdded = scanner.setText(buffer); setupSavedToken(scanner.getPositionIncrement(), type); tmp.add(captureState()); numSeen++; lastPos = currPos + numAdded; } //trim the buffer // TODO: this is inefficient String s = buffer.toString().trim(); termAtt.setEmpty().append(s); offsetAtt.setOffset(correctOffset(theStart), correctOffset(theStart + s.length())); flagsAtt.setFlags(UNTOKENIZED_TOKEN_FLAG); //The way the loop is written, we will have proceeded to the next token. We need to pushback the scanner to lastPos if (tmpTokType != WikipediaTokenizerImpl.YYEOF){ scanner.yypushback(scanner.yylength()); } tokens = tmp.iterator(); }
Example 5
Source File: WordDelimiterFilter.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override protected void swap(int i, int j) { AttributeSource.State tmp = buffered[i]; buffered[i] = buffered[j]; buffered[j] = tmp; int tmp2 = startOff[i]; startOff[i] = startOff[j]; startOff[j] = tmp2; tmp2 = posInc[i]; posInc[i] = posInc[j]; posInc[j] = tmp2; }
Example 6
Source File: PreAnalyzedField.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public final boolean incrementToken() { if (!it.hasNext()) { return false; } AttributeSource.State state = it.next(); restoreState(state.clone()); // TODO: why can't I lookup the OffsetAttribute up in ctor instead? lastEndOffset = addAttribute(OffsetAttribute.class).endOffset(); return true; }
Example 7
Source File: Zemberek2DeASCIIfyFilterFactory.java From lucene-solr-analysis-turkish with Apache License 2.0 | 5 votes |
private boolean createToken(String synonym, AttributeSource.State current) { restoreState(current); termAttribute.setEmpty().append(synonym); typeAtt.setType(DEASCII_TOKEN_TYPE); posIncrAtt.setPositionIncrement(0); return true; }
Example 8
Source File: LookaheadTokenFilter.java From lucene-solr with Apache License 2.0 | 4 votes |
public void add(AttributeSource.State state) { inputTokens.add(state); }
Example 9
Source File: LookaheadTokenFilter.java From lucene-solr with Apache License 2.0 | 4 votes |
public AttributeSource.State nextState() { assert nextRead < inputTokens.size(); return inputTokens.get(nextRead++); }
Example 10
Source File: TestDocumentWriter.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Indexes "a 5 a a" through a filter that injects a synonym token "b" after
 * every real token (posInc 0) and puts a payload only on the first position,
 * then verifies positions and payloads of term "a" via a PostingsEnum.
 *
 * @throws IOException on index I/O failure
 */
public void testTokenReuse() throws IOException {
  Analyzer analyzer = new Analyzer() {
    @Override
    public TokenStreamComponents createComponents(String fieldName) {
      Tokenizer tokenizer = new MockTokenizer(MockTokenizer.WHITESPACE, false);
      return new TokenStreamComponents(tokenizer, new TokenFilter(tokenizer) {
        boolean first = true;               // true until the first token is seen
        AttributeSource.State state;        // non-null => emit synonym "b" next

        @Override
        public boolean incrementToken() throws IOException {
          if (state != null) {
            // Emit the pending synonym at the same position, no payload.
            restoreState(state);
            payloadAtt.setPayload(null);
            posIncrAtt.setPositionIncrement(0);
            termAtt.setEmpty().append("b");
            state = null;
            return true;
          }
          boolean hasNext = input.incrementToken();
          if (!hasNext) return false;
          // A digit token sets its own position increment (e.g. "5" => 5).
          if (Character.isDigit(termAtt.buffer()[0])) {
            posIncrAtt.setPositionIncrement(termAtt.buffer()[0] - '0');
          }
          if (first) {
            // set payload on first position only
            payloadAtt.setPayload(new BytesRef(new byte[]{100}));
            first = false;
          }
          // index a "synonym" for every token
          state = captureState();
          return true;
        }

        @Override
        public void reset() throws IOException {
          super.reset();
          first = true;
          state = null;
        }

        final CharTermAttribute termAtt = addAttribute(CharTermAttribute.class);
        final PayloadAttribute payloadAtt = addAttribute(PayloadAttribute.class);
        final PositionIncrementAttribute posIncrAtt = addAttribute(PositionIncrementAttribute.class);
      });
    }
  };

  IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(analyzer));
  Document doc = new Document();
  doc.add(newTextField("f1", "a 5 a a", Field.Store.YES));
  writer.addDocument(doc);
  writer.commit();
  SegmentCommitInfo info = writer.newestSegment();
  writer.close();
  SegmentReader reader = new SegmentReader(info, Version.LATEST.major, newIOContext(random()));

  PostingsEnum termPositions = MultiTerms.getTermPostingsEnum(reader, "f1", new BytesRef("a"));
  assertTrue(termPositions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
  int freq = termPositions.freq();
  assertEquals(3, freq);
  // Positions of "a": 0 (has payload), then 6 and 7 (no payload); the digit
  // token "5" advanced the position by 5.
  assertEquals(0, termPositions.nextPosition());
  assertNotNull(termPositions.getPayload());
  assertEquals(6, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  assertEquals(7, termPositions.nextPosition());
  assertNull(termPositions.getPayload());
  reader.close();
}
Example 11
Source File: SimplePreAnalyzedParser.java From lucene-solr with Apache License 2.0 | 4 votes |
private static AttributeSource.State createState(AttributeSource a, Tok state, int tokenEnd) { a.clearAttributes(); CharTermAttribute termAtt = a.addAttribute(CharTermAttribute.class); char[] tokChars = state.token.toString().toCharArray(); termAtt.copyBuffer(tokChars, 0, tokChars.length); int tokenStart = tokenEnd - state.token.length(); for (Entry<String, String> e : state.attr.entrySet()) { String k = e.getKey(); if (k.equals("i")) { // position increment int incr = Integer.parseInt(e.getValue()); PositionIncrementAttribute posIncr = a.addAttribute(PositionIncrementAttribute.class); posIncr.setPositionIncrement(incr); } else if (k.equals("s")) { tokenStart = Integer.parseInt(e.getValue()); } else if (k.equals("e")) { tokenEnd = Integer.parseInt(e.getValue()); } else if (k.equals("y")) { TypeAttribute type = a.addAttribute(TypeAttribute.class); type.setType(e.getValue()); } else if (k.equals("f")) { FlagsAttribute flags = a.addAttribute(FlagsAttribute.class); int f = Integer.parseInt(e.getValue(), 16); flags.setFlags(f); } else if (k.equals("p")) { PayloadAttribute p = a.addAttribute(PayloadAttribute.class); byte[] data = hexToBytes(e.getValue()); if (data != null && data.length > 0) { p.setPayload(new BytesRef(data)); } } else { // unknown attribute } } // handle offset attr OffsetAttribute offset = a.addAttribute(OffsetAttribute.class); offset.setOffset(tokenStart, tokenEnd); State resState = a.captureState(); a.clearAttributes(); return resState; }