Java Code Examples for org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement()
The following examples show how to use org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute#getPositionIncrement().
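Most of the examples below follow the same skeleton: reset the stream, add the attribute, accumulate getPositionIncrement() into an absolute position while iterating, then call end(). Here is a minimal, self-contained sketch of that loop; the StandardAnalyzer, the "body" field name, and the class name are illustrative assumptions, not taken from any of the projects below.

import java.io.IOException;
import org.apache.lucene.analysis.Analyzer;
import org.apache.lucene.analysis.TokenStream;
import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.analysis.tokenattributes.CharTermAttribute;
import org.apache.lucene.analysis.tokenattributes.PositionIncrementAttribute;

public class PositionIncrementDemo {
  public static void main(String[] args) throws IOException {
    // StandardAnalyzer and the "body" field are illustrative choices for this sketch
    try (Analyzer analyzer = new StandardAnalyzer();
         TokenStream ts = analyzer.tokenStream("body", "a quick brown fox")) {
      CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
      ts.reset();          // mandatory before the first incrementToken()
      int position = -1;   // start at -1 so the first increment lands on position 0
      while (ts.incrementToken()) {
        position += posIncAtt.getPositionIncrement();
        System.out.println(position + ": " + termAtt);
      }
      ts.end();            // publishes the trailing position increment, if any
    }
  }
}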
Example 1
Source File: SimpleSynonymMap.java From elasticsearch-dynamic-synonym with Apache License 2.0
private Set<String> analyze(String text) throws IOException {
  Set<String> result = new HashSet<String>();
  Analyzer analyzer = configuration.getAnalyzer();
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token with posinc != 1");
      }
      result.add(new String(termAtt.buffer(), 0, termAtt.length()));
    }
    ts.end();
    return result;
  }
}
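This helper treats any token with a position increment other than 1 as an error: synonym source text must analyze to a plain one-to-one token sequence, so stacked tokens (increment 0) and gaps left by removed tokens (increment greater than 1) are both rejected.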
Example 2
Source File: StringFieldType.java From crate with Apache License 2.0
@Override
public Query phraseQuery(String field, TokenStream stream, int slop, boolean enablePosIncrements) throws IOException {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.setSlop(slop);
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;
  stream.reset();
  while (stream.incrementToken()) {
    if (enablePosIncrements) {
      position += posIncrAtt.getPositionIncrement();
    } else {
      position += 1;
    }
    builder.add(new Term(field, termAtt.getBytesRef()), position);
  }
  return builder.build();
}
Example 3
Source File: ShingleAnalyzerWrapperTest.java From lucene-solr with Apache License 2.0
public void testShingleAnalyzerWrapperPhraseQuery() throws Exception {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  try (TokenStream ts = analyzer.tokenStream("content", "this sentence")) {
    int j = -1;
    PositionIncrementAttribute posIncrAtt = ts.addAttribute(PositionIncrementAttribute.class);
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    ts.reset();
    while (ts.incrementToken()) {
      j += posIncrAtt.getPositionIncrement();
      String termText = termAtt.toString();
      builder.add(new Term("content", termText), j);
    }
    ts.end();
  }
  PhraseQuery q = builder.build();
  ScoreDoc[] hits = searcher.search(q, 1000).scoreDocs;
  int[] ranks = new int[] { 0 };
  compareRanks(hits, ranks);
}
Example 4
Source File: QueryBuilder.java From lucene-solr with Apache License 2.0
/**
 * Creates complex boolean query from the cached tokenstream contents
 */
protected Query analyzeMultiBoolean(String field, TokenStream stream, BooleanClause.Occur operator) throws IOException {
  BooleanQuery.Builder q = newBooleanQuery();
  List<TermAndBoost> currentQuery = new ArrayList<>();
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);

  stream.reset();
  while (stream.incrementToken()) {
    if (posIncrAtt.getPositionIncrement() != 0) {
      add(q, currentQuery, operator);
      currentQuery.clear();
    }
    currentQuery.add(new TermAndBoost(new Term(field, termAtt.getBytesRef()), boostAtt.getBoost()));
  }
  add(q, currentQuery, operator);

  return q.build();
}
Example 5
Source File: QueryBuilder.java From lucene-solr with Apache License 2.0
/**
 * Creates simple phrase query from the cached tokenstream contents
 */
protected Query analyzePhrase(String field, TokenStream stream, int slop) throws IOException {
  PhraseQuery.Builder builder = new PhraseQuery.Builder();
  builder.setSlop(slop);
  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  BoostAttribute boostAtt = stream.addAttribute(BoostAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;
  float phraseBoost = DEFAULT_BOOST;
  stream.reset();
  while (stream.incrementToken()) {
    if (enablePositionIncrements) {
      position += posIncrAtt.getPositionIncrement();
    } else {
      position += 1;
    }
    builder.add(new Term(field, termAtt.getBytesRef()), position);
    phraseBoost *= boostAtt.getBoost();
  }
  PhraseQuery query = builder.build();
  if (phraseBoost == DEFAULT_BOOST) {
    return query;
  }
  return new BoostQuery(query, phraseBoost);
}
Example 6
Source File: TransportExtendedAnalyzeAction.java From elasticsearch-extended-analyze with Apache License 2.0
private List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> processAnalysis(TokenStream stream, Set<String> includeAttributes, boolean shortAttrName, int lastPosition, int lastOffset) throws IOException {
  List<ExtendedAnalyzeResponse.ExtendedAnalyzeToken> tokens = new ArrayList<>();
  stream.reset();

  // and each token's output
  CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
  PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
  OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
  TypeAttribute type = stream.addAttribute(TypeAttribute.class);

  while (stream.incrementToken()) {
    int increment = posIncr.getPositionIncrement();
    if (increment > 0) {
      lastPosition = lastPosition + increment;
    }
    tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
        lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
        extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
  }
  stream.end();
  return tokens;
}
Example 7
Source File: AnalysisRequestHandlerBase.java From lucene-solr with Apache License 2.0
/**
 * Analyzes the given TokenStream, collecting the Tokens it produces.
 *
 * @param tokenStream TokenStream to analyze
 *
 * @return List of tokens produced from the TokenStream
 */
private List<AttributeSource> analyzeTokenStream(TokenStream tokenStream) {
  final List<AttributeSource> tokens = new ArrayList<>();
  final PositionIncrementAttribute posIncrAtt = tokenStream.addAttribute(PositionIncrementAttribute.class);
  final TokenTrackingAttribute trackerAtt = tokenStream.addAttribute(TokenTrackingAttribute.class);
  // for backwards compatibility, add all "common" attributes
  tokenStream.addAttribute(OffsetAttribute.class);
  tokenStream.addAttribute(TypeAttribute.class);
  try {
    tokenStream.reset();
    int position = 0;
    while (tokenStream.incrementToken()) {
      position += posIncrAtt.getPositionIncrement();
      trackerAtt.setActPosition(position);
      tokens.add(tokenStream.cloneAttributes());
    }
    tokenStream.end(); // TODO should we capture?
  } catch (IOException ioe) {
    throw new RuntimeException("Error occurred while iterating over tokenstream", ioe);
  } finally {
    IOUtils.closeWhileHandlingException(tokenStream);
  }
  return tokens;
}
Example 8
Source File: QueryBuilder.java From lucene-solr with Apache License 2.0
/**
 * Creates complex phrase query from the cached tokenstream contents
 */
protected Query analyzeMultiPhrase(String field, TokenStream stream, int slop) throws IOException {
  MultiPhraseQuery.Builder mpqb = newMultiPhraseQueryBuilder();
  mpqb.setSlop(slop);

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;

  List<Term> multiTerms = new ArrayList<>();
  stream.reset();
  while (stream.incrementToken()) {
    int positionIncrement = posIncrAtt.getPositionIncrement();
    if (positionIncrement > 0 && multiTerms.size() > 0) {
      if (enablePositionIncrements) {
        mpqb.add(multiTerms.toArray(new Term[0]), position);
      } else {
        mpqb.add(multiTerms.toArray(new Term[0]));
      }
      multiTerms.clear();
    }
    position += positionIncrement;
    multiTerms.add(new Term(field, termAtt.getBytesRef()));
  }

  if (enablePositionIncrements) {
    mpqb.add(multiTerms.toArray(new Term[0]), position);
  } else {
    mpqb.add(multiTerms.toArray(new Term[0]));
  }
  return mpqb.build();
}
Example 9
Source File: TextFieldMapper.java From crate with Apache License 2.0
private static boolean hasGaps(CachingTokenFilter stream) throws IOException {
  PositionIncrementAttribute posIncAtt = stream.getAttribute(PositionIncrementAttribute.class);
  stream.reset();
  while (stream.incrementToken()) {
    if (posIncAtt.getPositionIncrement() > 1) {
      return true;
    }
  }
  return false;
}
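A position increment greater than 1 means the analysis chain left a positional gap, typically where a filter such as a stop filter removed a token; this check lets the caller detect such holes before deciding how to build a positional query.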
Example 10
Source File: StringFieldType.java From crate with Apache License 2.0
@Override
public Query multiPhraseQuery(String field, TokenStream stream, int slop, boolean enablePositionIncrements) throws IOException {
  MultiPhraseQuery.Builder mpqb = new MultiPhraseQuery.Builder();
  mpqb.setSlop(slop);

  TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
  PositionIncrementAttribute posIncrAtt = stream.getAttribute(PositionIncrementAttribute.class);
  int position = -1;

  List<Term> multiTerms = new ArrayList<>();
  stream.reset();
  while (stream.incrementToken()) {
    int positionIncrement = posIncrAtt.getPositionIncrement();
    if (positionIncrement > 0 && multiTerms.size() > 0) {
      if (enablePositionIncrements) {
        mpqb.add(multiTerms.toArray(new Term[0]), position);
      } else {
        mpqb.add(multiTerms.toArray(new Term[0]));
      }
      multiTerms.clear();
    }
    position += positionIncrement;
    multiTerms.add(new Term(field, termAtt.getBytesRef()));
  }

  if (enablePositionIncrements) {
    mpqb.add(multiTerms.toArray(new Term[0]), position);
  } else {
    mpqb.add(multiTerms.toArray(new Term[0]));
  }
  return mpqb.build();
}
Example 11
Source File: TransportExtendedAnalyzeAction.java From elasticsearch-extended-analyze with Apache License 2.0
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes, boolean shortAttrName) {
  try {
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        lastPosition = lastPosition + increment;
      }
      tokens.add(new ExtendedAnalyzeResponse.ExtendedAnalyzeToken(term.toString(), lastPosition,
          lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
          extractExtendedAttributes(stream, includeAttributes, shortAttrName)));
    }
    stream.end();
    lastOffset += offset.endOffset();
    lastPosition += posIncr.getPositionIncrement();

    lastPosition += analyzer.getPositionIncrementGap(field);
    lastOffset += analyzer.getOffsetGap(field);
  } catch (IOException e) {
    throw new ElasticsearchException("failed to analyze", e);
  } finally {
    IOUtils.closeWhileHandlingException(stream);
  }
}
Example 12
Source File: TokenCountFieldMapper.java From Elasticsearch with Apache License 2.0
/**
 * Count position increments in a token stream. Package private for testing.
 * @param analyzer analyzer to create token stream
 * @param fieldName field name to pass to analyzer
 * @param fieldValue field value to pass to analyzer
 * @return number of position increments in a token stream
 * @throws IOException if tokenStream throws it
 */
static int countPositions(Analyzer analyzer, String fieldName, String fieldValue) throws IOException {
  try (TokenStream tokenStream = analyzer.tokenStream(fieldName, fieldValue)) {
    int count = 0;
    PositionIncrementAttribute position = tokenStream.addAttribute(PositionIncrementAttribute.class);
    tokenStream.reset();
    while (tokenStream.incrementToken()) {
      count += position.getPositionIncrement();
    }
    tokenStream.end();
    count += position.getPositionIncrement();
    return count;
  }
}
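Note the second getPositionIncrement() call after tokenStream.end(): end() publishes the position increment of the end-of-stream state, so positional gaps behind the last returned token (for example, trailing stopwords removed by a stop filter) are still included in the count.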
Example 13
Source File: SynonymMap.java From lucene-solr with Apache License 2.0
/** Sugar: analyzes the text with the analyzer and
 *  separates by {@link SynonymMap#WORD_SEPARATOR}.
 *  reuse and its chars must not be null. */
public CharsRef analyze(String text, CharsRefBuilder reuse) throws IOException {
  try (TokenStream ts = analyzer.tokenStream("", text)) {
    CharTermAttribute termAtt = ts.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncAtt = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();
    reuse.clear();
    while (ts.incrementToken()) {
      int length = termAtt.length();
      if (length == 0) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a zero-length token");
      }
      if (posIncAtt.getPositionIncrement() != 1) {
        throw new IllegalArgumentException("term: " + text + " analyzed to a token (" + termAtt +
            ") with position increment != 1 (got: " + posIncAtt.getPositionIncrement() + ")");
      }
      reuse.grow(reuse.length() + length + 1); /* current + word + separator */
      int end = reuse.length();
      if (reuse.length() > 0) {
        reuse.setCharAt(end++, SynonymMap.WORD_SEPARATOR);
        reuse.setLength(reuse.length() + 1);
      }
      System.arraycopy(termAtt.buffer(), 0, reuse.chars(), end, length);
      reuse.setLength(reuse.length() + length);
    }
    ts.end();
  }
  if (reuse.length() == 0) {
    throw new IllegalArgumentException("term: " + text + " was completely eliminated by analyzer");
  }
  return reuse.get();
}
Example 14
Source File: TransportAnalyzeAction.java From Elasticsearch with Apache License 2.0
private void analyze(TokenStream stream, Analyzer analyzer, String field, Set<String> includeAttributes) {
  try {
    stream.reset();
    CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
    PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
    TypeAttribute type = stream.addAttribute(TypeAttribute.class);

    while (stream.incrementToken()) {
      int increment = posIncr.getPositionIncrement();
      if (increment > 0) {
        lastPosition = lastPosition + increment;
      }
      tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
          lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(),
          extractExtendedAttributes(stream, includeAttributes)));
    }
    stream.end();
    lastOffset += offset.endOffset();
    lastPosition += posIncr.getPositionIncrement();

    lastPosition += analyzer.getPositionIncrementGap(field);
    lastOffset += analyzer.getOffsetGap(field);
  } catch (IOException e) {
    throw new ElasticsearchException("failed to analyze", e);
  } finally {
    IOUtils.closeWhileHandlingException(stream);
  }
}
Example 15
Source File: TransportAnalyzeAction.java From Elasticsearch with Apache License 2.0
private static List<AnalyzeResponse.AnalyzeToken> simpleAnalyze(AnalyzeRequest request, Analyzer analyzer, String field) {
  List<AnalyzeResponse.AnalyzeToken> tokens = new ArrayList<>();
  int lastPosition = -1;
  int lastOffset = 0;
  for (String text : request.text()) {
    try (TokenStream stream = analyzer.tokenStream(field, text)) {
      stream.reset();
      CharTermAttribute term = stream.addAttribute(CharTermAttribute.class);
      PositionIncrementAttribute posIncr = stream.addAttribute(PositionIncrementAttribute.class);
      OffsetAttribute offset = stream.addAttribute(OffsetAttribute.class);
      TypeAttribute type = stream.addAttribute(TypeAttribute.class);

      while (stream.incrementToken()) {
        int increment = posIncr.getPositionIncrement();
        if (increment > 0) {
          lastPosition = lastPosition + increment;
        }
        tokens.add(new AnalyzeResponse.AnalyzeToken(term.toString(), lastPosition,
            lastOffset + offset.startOffset(), lastOffset + offset.endOffset(), type.type(), null));
      }
      stream.end();
      lastOffset += offset.endOffset();
      lastPosition += posIncr.getPositionIncrement();

      lastPosition += analyzer.getPositionIncrementGap(field);
      lastOffset += analyzer.getOffsetGap(field);
    } catch (IOException e) {
      throw new ElasticsearchException("failed to analyze", e);
    }
  }
  return tokens;
}
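Because lastPosition and lastOffset carry over between loop iterations, each value in request.text() is positioned after the previous one, separated by the analyzer's getPositionIncrementGap(field) and getOffsetGap(field), mirroring how multi-valued fields are laid out at index time.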
Example 16
Source File: GraphTokenStreamFiniteStrings.java From lucene-solr with Apache License 2.0
/**
 * Build an automaton from the provided {@link TokenStream}.
 */
private Automaton build(final TokenStream in) throws IOException {
  Automaton.Builder builder = new Automaton.Builder();

  final PositionIncrementAttribute posIncAtt = in.addAttribute(PositionIncrementAttribute.class);
  final PositionLengthAttribute posLengthAtt = in.addAttribute(PositionLengthAttribute.class);

  in.reset();

  int pos = -1;
  int prevIncr = 1;
  int state = -1;
  int id = -1;
  int gap = 0;
  while (in.incrementToken()) {
    int currentIncr = posIncAtt.getPositionIncrement();
    if (pos == -1 && currentIncr < 1) {
      throw new IllegalStateException("Malformed TokenStream, start token can't have increment less than 1");
    }

    if (currentIncr == 0) {
      if (gap > 0) {
        pos -= gap;
      }
    } else {
      pos++;
      gap = currentIncr - 1;
    }

    int endPos = pos + posLengthAtt.getPositionLength() + gap;
    while (state < endPos) {
      state = builder.createState();
    }

    id++;
    if (tokens.length < id + 1) {
      tokens = ArrayUtil.grow(tokens, id + 1);
    }

    tokens[id] = in.cloneAttributes();
    builder.addTransition(pos, endPos, id);
    pos += gap;

    // we always produce linear token graphs from getFiniteStrings(), so we need to adjust
    // posLength and posIncrement accordingly
    tokens[id].addAttribute(PositionLengthAttribute.class).setPositionLength(1);
    if (currentIncr == 0) {
      // stacked token should have the same increment as original token at this position
      tokens[id].addAttribute(PositionIncrementAttribute.class).setPositionIncrement(prevIncr);
    }

    // only save last increment on non-zero increment in case we have multiple stacked tokens
    if (currentIncr > 0) {
      prevIncr = currentIncr;
    }
  }

  in.end();
  if (state != -1) {
    builder.setAccept(state, true);
  }
  return builder.finish();
}
Example 17
Source File: TermSubQueryBuilder.java From querqy with Apache License 2.0
public TermSubQueryFactory termToFactory(final String fieldname, final Term sourceTerm, final FieldBoost boost) throws IOException {
  final CacheKey cacheKey;

  if (termQueryCache != null) {
    cacheKey = new CacheKey(fieldname, sourceTerm);

    TermQueryCacheValue cacheValue = termQueryCache.get(cacheKey);
    if (cacheValue != null) {
      // The cache references factories with pre-analyzed terms, or cache entries without a
      // query factory if the term does not exist in the index. cacheValue.hasQuery() returns
      // true/false correspondingly.
      // Cache entries don't have a boost factor, it is only added later via the queryFactory.
      return (cacheValue.hasQuery()) ? new TermSubQueryFactory(cacheValue, boost) : null;
    }
  } else {
    cacheKey = null;
  }

  final LuceneQueryFactoryAndPRMSQuery root;
  TokenStream ts = null;
  try {
    ts = analyzer.tokenStream(fieldname, new CharSequenceReader(sourceTerm));
    final CharTermAttribute termAttr = ts.addAttribute(CharTermAttribute.class);
    final PositionIncrementAttribute posIncAttr = ts.addAttribute(PositionIncrementAttribute.class);
    ts.reset();

    final PositionSequence<org.apache.lucene.index.Term> sequence = new PositionSequence<>();
    while (ts.incrementToken()) {
      final int inc = posIncAttr.getPositionIncrement();
      if (inc > 0 || sequence.isEmpty()) {
        sequence.nextPosition();
      }
      sequence.addElement(new org.apache.lucene.index.Term(fieldname, new BytesRef(termAttr)));
    }

    root = positionSequenceToQueryFactoryAndPRMS(sequence);
  } finally {
    if (ts != null) {
      try {
        ts.close();
      } catch (IOException e) {
      }
    }
  }

  putQueryFactoryAndPRMSQueryIntoCache(cacheKey, root);

  return root == null ? null : new TermSubQueryFactory(root, boost);
}
Example 18
Source File: MemoryIndex.java From lucene-solr with Apache License 2.0
private void storeTerms(Info info, TokenStream tokenStream, int positionIncrementGap, int offsetGap) {
  int pos = -1;
  int offset = 0;
  if (info.numTokens > 0) {
    pos = info.lastPosition + positionIncrementGap;
    offset = info.lastOffset + offsetGap;
  }

  try (TokenStream stream = tokenStream) {
    TermToBytesRefAttribute termAtt = stream.getAttribute(TermToBytesRefAttribute.class);
    PositionIncrementAttribute posIncrAttribute = stream.addAttribute(PositionIncrementAttribute.class);
    OffsetAttribute offsetAtt = stream.addAttribute(OffsetAttribute.class);
    PayloadAttribute payloadAtt = storePayloads ? stream.addAttribute(PayloadAttribute.class) : null;
    stream.reset();

    while (stream.incrementToken()) {
      // if (DEBUG) System.err.println("token='" + term + "'");
      info.numTokens++;
      final int posIncr = posIncrAttribute.getPositionIncrement();
      if (posIncr == 0) {
        info.numOverlapTokens++;
      }
      pos += posIncr;
      int ord = info.terms.add(termAtt.getBytesRef());
      if (ord < 0) {
        ord = (-ord) - 1;
        postingsWriter.reset(info.sliceArray.end[ord]);
      } else {
        info.sliceArray.start[ord] = postingsWriter.startNewSlice();
      }
      info.sliceArray.freq[ord]++;
      info.maxTermFrequency = Math.max(info.maxTermFrequency, info.sliceArray.freq[ord]);
      info.sumTotalTermFreq++;
      postingsWriter.writeInt(pos);
      if (storeOffsets) {
        postingsWriter.writeInt(offsetAtt.startOffset() + offset);
        postingsWriter.writeInt(offsetAtt.endOffset() + offset);
      }
      if (storePayloads) {
        final BytesRef payload = payloadAtt.getPayload();
        final int pIndex;
        if (payload == null || payload.length == 0) {
          pIndex = -1;
        } else {
          pIndex = payloadsBytesRefs.append(payload);
        }
        postingsWriter.writeInt(pIndex);
      }
      info.sliceArray.end[ord] = postingsWriter.getCurrentOffset();
    }
    stream.end();
    if (info.numTokens > 0) {
      info.lastPosition = pos;
      info.lastOffset = offsetAtt.endOffset() + offset;
    }
  } catch (IOException e) {
    throw new RuntimeException(e);
  }
}