Java Code Examples for org.apache.lucene.index.PostingsEnum#nextPosition()
The following examples show how to use org.apache.lucene.index.PostingsEnum#nextPosition().
You can go to the original project or source file by following the links above each example.
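Before the examples, here is a minimal, self-contained sketch of the typical call pattern (assuming Lucene 8.x; the ByteBuffersDirectory/StandardAnalyzer choices and the field name "body" are illustrative, not taken from any example below). A PostingsEnum obtained from a term vector exposes exactly one document, and nextPosition() must be called freq() times, in order, to step through that term's occurrences.

import org.apache.lucene.analysis.standard.StandardAnalyzer;
import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;
import org.apache.lucene.index.*;
import org.apache.lucene.store.ByteBuffersDirectory;
import org.apache.lucene.store.Directory;
import org.apache.lucene.util.BytesRef;

public class NextPositionDemo {
    public static void main(String[] args) throws Exception {
        Directory dir = new ByteBuffersDirectory();
        try (IndexWriter w = new IndexWriter(dir, new IndexWriterConfig(new StandardAnalyzer()))) {
            // store positions and offsets in the term vector so the PostingsEnum can report them
            FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
            ft.setStoreTermVectors(true);
            ft.setStoreTermVectorPositions(true);
            ft.setStoreTermVectorOffsets(true);
            Document doc = new Document();
            doc.add(new Field("body", "foo bar foo", ft));
            w.addDocument(doc);
        }
        try (IndexReader r = DirectoryReader.open(dir)) {
            Terms tv = r.getTermVectors(0).terms("body");
            TermsEnum te = tv.iterator();
            BytesRef term;
            while ((term = te.next()) != null) {
                PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
                pe.nextDoc(); // a term-vector PostingsEnum holds a single (virtual) document
                for (int i = 0; i < pe.freq(); i++) {
                    int pos = pe.nextPosition(); // advance to the next occurrence of this term
                    System.out.println(term.utf8ToString() + " @ position " + pos
                            + " [" + pe.startOffset() + "," + pe.endOffset() + ")");
                }
            }
        }
    }
}

The examples that follow vary this pattern: some advance to a specific document instead of calling nextDoc(), and some read payloads or offsets alongside each position.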
Example 1
Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0
private void initValues(Terms curTerms, PostingsEnum posEnum, int termFreq) throws IOException {
    for (int j = 0; j < termFreq; j++) {
        int nextPos = posEnum.nextPosition();
        if (curTerms.hasPositions()) {
            currentPositions[j] = nextPos;
        }
        if (curTerms.hasOffsets()) {
            currentStartOffset[j] = posEnum.startOffset();
            currentEndOffset[j] = posEnum.endOffset();
        }
        if (curTerms.hasPayloads()) {
            BytesRef curPayload = posEnum.getPayload();
            if (curPayload != null) {
                currentPayloads[j] = new BytesArray(curPayload.bytes, 0, curPayload.length);
            } else {
                currentPayloads[j] = null;
            }
        }
    }
}
Example 2
Source File: TermVectorEntry.java From lucene-solr with Apache License 2.0
/**
 * Returns a new term vector entry representing the specified term, and optionally, positions.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
    Objects.requireNonNull(te);

    String termText = BytesRefUtils.decode(te.term());

    List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    pe.nextDoc();
    int freq = pe.freq();
    for (int i = 0; i < freq; i++) {
        int pos = pe.nextPosition();
        if (pos < 0) {
            // no position information available
            continue;
        }
        TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
        tvPositions.add(tvPos);
    }

    return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
Example 3
Source File: BlendedInfixSuggester.java From lucene-solr with Apache License 2.0
/**
 * Create the coefficient to transform the weight.
 *
 * @param doc id of the document
 * @param matchedTokens tokens found in the query
 * @param prefixToken unfinished token in the query
 * @return the coefficient
 * @throws IOException If there are problems reading term vectors from the underlying Lucene index.
 */
private double createCoefficient(IndexSearcher searcher, int doc, Set<String> matchedTokens, String prefixToken) throws IOException {
    Terms tv = searcher.getIndexReader().getTermVector(doc, TEXT_FIELD_NAME);
    TermsEnum it = tv.iterator();

    Integer position = Integer.MAX_VALUE;
    BytesRef term;
    // find the closest token position
    while ((term = it.next()) != null) {
        String docTerm = term.utf8ToString();
        if (matchedTokens.contains(docTerm) || (prefixToken != null && docTerm.startsWith(prefixToken))) {
            PostingsEnum docPosEnum = it.postings(null, PostingsEnum.OFFSETS);
            docPosEnum.nextDoc();
            // use the first occurrence of the term
            int p = docPosEnum.nextPosition();
            if (p < position) {
                position = p;
            }
        }
    }

    // create corresponding coefficient based on position
    return calculateCoefficient(position);
}
Example 4
Source File: TaxonomyIndexArrays.java From lucene-solr with Apache License 2.0
private void initParents(IndexReader reader, int first) throws IOException {
    if (reader.maxDoc() == first) {
        return;
    }

    // it's ok to use MultiTerms because we only iterate on one posting list.
    // breaking it to loop over the leaves() only complicates code for no
    // apparent gain.
    PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader,
        Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
        PostingsEnum.PAYLOADS);

    // shouldn't really happen, if it does, something's wrong
    if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
        throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
    }

    int num = reader.maxDoc();
    for (int i = first; i < num; i++) {
        if (positions.docID() == i) {
            if (positions.freq() == 0) { // shouldn't happen
                throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
            }

            parents[i] = positions.nextPosition();

            if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
                if (i + 1 < num) {
                    throw new CorruptIndexException("Missing parent data for category " + (i + 1), reader.toString());
                }
                break;
            }
        } else { // this shouldn't happen
            throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
        }
    }
}
Example 5
Source File: TestTeeSinkTokenFilter.java From lucene-solr with Apache License 2.0
public void testEndOffsetPositionWithTeeSinkTokenFilter() throws Exception {
    Directory dir = newDirectory();
    Analyzer analyzer = new MockAnalyzer(random(), MockTokenizer.WHITESPACE, false);
    IndexWriter w = new IndexWriter(dir, newIndexWriterConfig(analyzer));
    Document doc = new Document();
    // trailing whitespace matters: the 7-char text plus the default offset gap
    // makes the second field instance's offsets start at 8, as asserted below
    TokenStream tokenStream = analyzer.tokenStream("field", "abcd   ");
    TeeSinkTokenFilter tee = new TeeSinkTokenFilter(tokenStream);
    TokenStream sink = tee.newSinkTokenStream();
    FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
    ft.setStoreTermVectors(true);
    ft.setStoreTermVectorOffsets(true);
    ft.setStoreTermVectorPositions(true);
    Field f1 = new Field("field", tee, ft);
    Field f2 = new Field("field", sink, ft);
    doc.add(f1);
    doc.add(f2);
    w.addDocument(doc);
    w.close();

    IndexReader r = DirectoryReader.open(dir);
    Terms vector = r.getTermVectors(0).terms("field");
    assertEquals(1, vector.size());
    TermsEnum termsEnum = vector.iterator();
    termsEnum.next();
    assertEquals(2, termsEnum.totalTermFreq());
    PostingsEnum positions = termsEnum.postings(null, PostingsEnum.ALL);
    assertTrue(positions.nextDoc() != DocIdSetIterator.NO_MORE_DOCS);
    assertEquals(2, positions.freq());
    positions.nextPosition();
    assertEquals(0, positions.startOffset());
    assertEquals(4, positions.endOffset());
    positions.nextPosition();
    assertEquals(8, positions.startOffset());
    assertEquals(12, positions.endOffset());
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, positions.nextDoc());
    r.close();
    dir.close();
    analyzer.close();
}
Example 6
Source File: ESIndex.java From pyramid with Apache License 2.0
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();

    Map<Integer, String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();
        postings = iterator.postings(postings, PostingsEnum.ALL);
        // there can only be one doc since we are getting with id; get the doc and the positions
        postings.nextDoc();
        int tf = postings.freq();
        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos, term);
        }
    }
    return map;
}
Example 7
Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
Example 8
Source File: WindowBuildingTVM.java From wiseowl with MIT License
public void map(Terms terms, Spans spans) throws IOException {
    int primStart = spanStart - primaryWS;
    int primEnd = spanEnd + primaryWS;
    // stores the start and end of the adjacent previous and following
    int adjLBStart = primStart - adjWS;
    int adjLBEnd = primStart - 1;  // don't overlap
    int adjUBStart = primEnd + 1;  // don't overlap
    int adjUBEnd = primEnd + adjWS;
    // stores the start and end of the secondary previous and the secondary following
    int secLBStart = adjLBStart - secWS;
    int secLBEnd = adjLBStart - 1; // don't overlap the adjacent window
    int secUBStart = adjUBEnd + 1;
    int secUBEnd = adjUBEnd + secWS;
    WindowTerm lastWT = null;
    if (terms == null) { // no term vector for this field/document
        return;
    }
    TermsEnum termsEnum = terms.iterator();
    BytesRef termref = null;
    String term = null;
    while ((termref = termsEnum.next()) != null) {
        term = termsEnum.term().utf8ToString();
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.PAYLOADS | PostingsEnum.OFFSETS);
        postings.nextDoc();
        if (term.startsWith(NameFilter.NE_PREFIX) == false
                && term.startsWith(PassageRankingComponent.NE_PREFIX_LOWER) == false) {
            // filter out the types, as we don't need them here.
            // construct the windows, which means we need a bunch of
            // bracketing variables to know what window we are in.
            // unfortunately, we still have to loop over the positions.
            // we'll make this inclusive of the boundaries, do an upfront check here so
            // we can skip over anything that is outside of all windows
            //int position = spans.nextStartPosition();
            int position = postings.nextPosition();
            if (position >= secLBStart && position <= secUBEnd) {
                // fill in the windows
                WindowTerm wt;
                // offsets aren't required, but they are nice to have
                if (postings != null) {
                    wt = new WindowTerm(term, position, postings.startOffset(), postings.endOffset());
                } else {
                    wt = new WindowTerm(term, position);
                }
                if (position >= primStart && position <= primEnd) { // are we in the primary window
                    passage.terms.add(wt);
                    // we are only going to keep bigrams for the primary window. You could do it for the other windows, too
                    if (lastWT != null) {
                        WindowTerm bigramWT = new WindowTerm(lastWT.term + "," + term, lastWT.position); // we don't care about offsets for bigrams
                        passage.bigrams.add(bigramWT);
                    }
                    lastWT = wt;
                } else if (position >= secLBStart && position <= secLBEnd) { // are we in the secondary previous window?
                    passage.secPrevTerms.add(wt);
                } else if (position >= secUBStart && position <= secUBEnd) { // are we in the secondary following window?
                    passage.secFollowTerms.add(wt);
                } else if (position >= adjLBStart && position <= adjLBEnd) { // are we in the adjacent previous window?
                    passage.prevTerms.add(wt);
                } else if (position >= adjUBStart && position <= adjUBEnd) { // are we in the adjacent following window?
                    passage.followTerms.add(wt);
                }
            }
        }
    }
}
Example 9
Source File: VectorScoreQuery.java From solr-vector-scoring with Apache License 2.0
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
    return new CustomScoreProvider(context) {
        @Override
        public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
            float score = 0;
            double docVectorNorm = 0;
            LeafReader reader = context.reader();
            Terms terms = reader.getTermVector(docID, field);
            if (vector.size() != terms.size()) {
                throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
                        "indexed and input vector array must have same length");
            }
            TermsEnum iter = terms.iterator();
            BytesRef text;
            while ((text = iter.next()) != null) {
                String term = text.utf8ToString();
                float payloadValue = 0f;
                PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
                while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
                    int freq = postings.freq();
                    while (freq-- > 0) postings.nextPosition();

                    BytesRef payload = postings.getPayload();
                    payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

                    if (cosine)
                        docVectorNorm += Math.pow(payloadValue, 2.0);
                }
                score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
            }
            if (cosine) {
                if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
                return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
            }
            return score;
        }
    };
}
Example 10
Source File: PayloadFilteredTermIntervalsSource.java From lucene-solr with Apache License 2.0
private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException {
    PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
    if (pe.advance(doc) != doc) {
        return null;
    }
    return new IntervalMatchesIterator() {

        @Override
        public int gaps() {
            return 0;
        }

        @Override
        public int width() {
            return 1;
        }

        int upto = pe.freq();
        int pos = -1;

        @Override
        public boolean next() throws IOException {
            do {
                if (upto <= 0) {
                    pos = IntervalIterator.NO_MORE_INTERVALS;
                    return false;
                }
                upto--;
                pos = pe.nextPosition();
            } while (filter.test(pe.getPayload()) == false);
            return true;
        }

        @Override
        public int startPosition() {
            return pos;
        }

        @Override
        public int endPosition() {
            return pos;
        }

        @Override
        public int startOffset() throws IOException {
            return pe.startOffset();
        }

        @Override
        public int endOffset() throws IOException {
            return pe.endOffset();
        }

        @Override
        public MatchesIterator getSubMatches() {
            return null;
        }

        @Override
        public Query getQuery() {
            throw new UnsupportedOperationException();
        }
    };
}
Example 11
Source File: TermIntervalsSource.java From lucene-solr with Apache License 2.0
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
    TermQuery query = new TermQuery(new Term(field, te.term()));
    PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
    if (pe.advance(doc) != doc) {
        return null;
    }
    return new IntervalMatchesIterator() {

        @Override
        public int gaps() {
            return 0;
        }

        @Override
        public int width() {
            return 1;
        }

        int upto = pe.freq();
        int pos = -1;

        @Override
        public boolean next() throws IOException {
            if (upto <= 0) {
                pos = IntervalIterator.NO_MORE_INTERVALS;
                return false;
            }
            upto--;
            pos = pe.nextPosition();
            return true;
        }

        @Override
        public int startPosition() {
            return pos;
        }

        @Override
        public int endPosition() {
            return pos;
        }

        @Override
        public int startOffset() throws IOException {
            return pe.startOffset();
        }

        @Override
        public int endOffset() throws IOException {
            return pe.endOffset();
        }

        @Override
        public MatchesIterator getSubMatches() {
            return null;
        }

        @Override
        public Query getQuery() {
            return query;
        }
    };
}
Example 12
Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
    NamedList<Object> fieldNL = new NamedList<>();
    docNL.add(field, fieldNL);

    BytesRef text;
    PostingsEnum dpEnum = null;
    while ((text = termsEnum.next()) != null) {
        String term = text.utf8ToString();
        NamedList<Object> termInfo = new NamedList<>();
        fieldNL.add(term, termInfo);
        final int freq = (int) termsEnum.totalTermFreq();
        if (fieldOptions.termFreq == true) {
            termInfo.add("tf", freq);
        }

        int dpEnumFlags = 0;
        dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
        // payloads require offsets
        dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
        dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
        dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

        boolean atNextDoc = false;
        if (dpEnum != null) {
            dpEnum.nextDoc();
            atNextDoc = true;
        }

        if (atNextDoc && dpEnumFlags != 0) {
            NamedList<Integer> positionsNL = null;
            NamedList<Number> theOffsets = null;
            NamedList<String> thePayloads = null;

            for (int i = 0; i < freq; i++) {
                final int pos = dpEnum.nextPosition();
                if (fieldOptions.positions && pos >= 0) {
                    if (positionsNL == null) {
                        positionsNL = new NamedList<>();
                        termInfo.add("positions", positionsNL);
                    }
                    positionsNL.add("position", pos);
                }

                int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
                if (startOffset >= 0) {
                    if (theOffsets == null) {
                        theOffsets = new NamedList<>();
                        termInfo.add("offsets", theOffsets);
                    }
                    theOffsets.add("start", dpEnum.startOffset());
                    theOffsets.add("end", dpEnum.endOffset());
                }

                BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
                if (payload != null) {
                    if (thePayloads == null) {
                        thePayloads = new NamedList<>();
                        termInfo.add("payloads", thePayloads);
                    }
                    thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
                }
            }
        }

        int df = 0;
        if (fieldOptions.docFreq || fieldOptions.tfIdf) {
            df = reader.docFreq(new Term(field, text));
        }

        if (fieldOptions.docFreq) {
            termInfo.add("df", df);
        }

        // TODO: this is not TF/IDF by anyone's definition!
        if (fieldOptions.tfIdf) {
            double tfIdfVal = ((double) freq) / df;
            termInfo.add("tf-idf", tfIdfVal);
        }
    }
}
Example 13
Source File: FrequencyCtxWindowBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                       Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null) {
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                } else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx()) {
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                    }
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}