Java Code Examples for org.apache.lucene.index.PostingsEnum#nextDoc()
The following examples show how to use org.apache.lucene.index.PostingsEnum#nextDoc().
Each example notes its source file, the project it comes from, and that project's license.
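Before the individual examples, here is a minimal, self-contained sketch (not taken from any of the projects below) of the pattern they all share: seek to a term, obtain a PostingsEnum from its TermsEnum, and call nextDoc() until DocIdSetIterator.NO_MORE_DOCS. It assumes a Lucene 8.x API where MultiTerms is available (as used in Example 3); the class name, reader, field, and term are placeholders.

import java.io.IOException;

import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.util.BytesRef;

public class NextDocSketch {
  /** Prints every document id (and term frequency) containing the given term. */
  static void walkPostings(IndexReader reader, String field, String termText) throws IOException {
    Terms terms = MultiTerms.getTerms(reader, field);   // postings for the field across all segments
    if (terms == null) {
      return;                                           // field is not indexed
    }
    TermsEnum termsEnum = terms.iterator();
    if (!termsEnum.seekExact(new BytesRef(termText))) {
      return;                                           // term does not occur
    }
    // FREQS asks for term frequencies; pass PostingsEnum.NONE if only doc ids are needed
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    for (int docId = postings.nextDoc();
         docId != DocIdSetIterator.NO_MORE_DOCS;
         docId = postings.nextDoc()) {
      // Note: deleted documents are not filtered here; see Examples 3 and 5 for liveDocs handling.
      System.out.println("doc=" + docId + " freq=" + postings.freq());
    }
  }
}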
Example 1
Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * checks docs + freqs + positions + payloads, sequentially
 */
public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  assertNotNull(leftDocs);
  assertNotNull(rightDocs);
  assertEquals(-1, leftDocs.docID());
  assertEquals(-1, rightDocs.docID());
  int docid;
  while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(docid, rightDocs.nextDoc());
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't assert offsets/payloads, they are allowed to be different
    }
  }
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
Example 2
Source File: TermsIncludingScoreQuery.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        // I prefer this:
        /*if (scores[doc] < score) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }*/
        // But this behaves the same as MVInnerScorer and only then the tests will pass:
        if (!matchingDocs.get(doc)) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }
      }
    }
  }
}
Example 3
Source File: TestRTGBase.java From lucene-solr with Apache License 2.0 | 6 votes |
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Terms terms = MultiTerms.getTerms(r, t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
  docs = BitsFilteredPostingsEnum.wrap(docs, MultiBits.getLiveDocs(r));
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
Example 4
Source File: LuceneUtils.java From semanticvectors with BSD 3-Clause "New" or "Revised" License | 6 votes |
/**
 * Gets the 1 - entropy (i.e. 1+ plogp) of a term,
 * a function that favors terms that are focally distributed
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum ((Pij log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i
 * n = number of documents in collection
 * @param term whose entropy you want
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field()+"_"+term.text()))
    return termEntropy.get(term.field()+"_"+term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); //frequency in this document
      p = p / gf;                 //frequency across all documents
      entropy += p * (Math.log(p) / Math.log(2)); //sum of Plog(P)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field()+"_"+term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
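For reference, the log-entropy weight described in the Javadoc above (Martin and Berry, 2007) can be restated as

  \mathrm{entropy}(i) = 1 + \sum_{j} \frac{P_{ij}\,\log_2 P_{ij}}{\log_2 n},
  \qquad P_{ij} = \frac{\mathrm{tf}_{ij}}{\mathrm{gf}_i}

where tf_ij is the frequency of term i in document j, gf_i its global frequency, and n the number of documents. Each P_ij log2 P_ij term is non-positive, so the weight lies in [0, 1] and is largest for terms concentrated in few documents, which is the "focally distributed" behavior the comment mentions.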
Example 5
Source File: LukeRequestHandler.java From lucene-solr with Apache License 2.0 | 6 votes |
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  PostingsEnum postingsEnum = null;
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
  for (int idx = 0; idx < 1000 && postingsEnum == null; ++idx) {
    text = termsEnum.next();
    if (text == null) {
      // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (liveDocs != null && liveDocs.get(postingsEnum.docID())) {
        continue;
      }
      return reader.document(postingsEnum.docID());
    }
  }
  return null;
}
Example 6
Source File: TermVectorEntry.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Returns a new term vector entry representing the specified term, and optionally, positions.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);
  String termText = BytesRefUtils.decode(te.term());
  List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  pe.nextDoc();
  int freq = pe.freq();
  for (int i = 0; i < freq; i++) {
    int pos = pe.nextPosition();
    if (pos < 0) {
      // no position information available
      continue;
    }
    TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
    tvPositions.add(tvPos);
  }
  return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
Example 7
Source File: ESIndex.java From pyramid with Apache License 2.0 | 5 votes |
private Map<Integer,String> getTermVectorWithException(String field, String id) throws IOException {
  TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
      .setOffsets(false).setPositions(true).setFieldStatistics(false)
      .setTermStatistics(false)
      .setSelectedFields(field)
      .execute().actionGet();
  Map<Integer,String> map = new HashMap<>();
  Terms terms = response.getFields().terms(field);
  if (terms == null) {
    return map;
  }
  TermsEnum iterator = terms.iterator();
  PostingsEnum postings = null;
  for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
    String term = termBytes.utf8ToString();
    postings = iterator.postings(postings, PostingsEnum.ALL);
    //there can only be one doc since we are getting with id. get the doc and the position
    postings.nextDoc();
    int tf = postings.freq();
    for (int i = 0; i < tf; i++) {
      int pos = postings.nextPosition();
      map.put(pos, term);
    }
  }
  return map;
}
Example 8
Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * checks advancing docs
 */
public void assertDocsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null) {
    assertNull(rightDocs);
    return;
  }
  int docid = -1;
  int averageGap = MAXDOC / (1+docFreq);
  int skipInterval = 16;
  while (true) {
    if (random().nextBoolean()) {
      // nextDoc()
      docid = leftDocs.nextDoc();
      assertEquals(docid, rightDocs.nextDoc());
    } else {
      // advance()
      int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
      docid = leftDocs.advance(skip);
      assertEquals(docid, rightDocs.advance(skip));
    }
    if (docid == DocIdSetIterator.NO_MORE_DOCS) {
      return;
    }
    // we don't assert freqs, they are allowed to be different
  }
}
Example 9
Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * checks docs + freqs, sequentially
 */
public void assertDocsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null) {
    assertNull(rightDocs);
    return;
  }
  assertEquals(-1, leftDocs.docID());
  assertEquals(-1, rightDocs.docID());
  int docid;
  while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(docid, rightDocs.nextDoc());
    // we don't assert freqs, they are allowed to be different
  }
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
Example 10
Source File: TermPrefixCursor.java From SolrTextTagger with Apache License 2.0 | 5 votes |
/**
 * Returns an IntsRef either cached or reading postingsEnum. Not null.
 * @param postingsEnum
 */
private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
  // (The cache can have empty IntsRefs)

  //lookup prefixBuf in a cache
  if (docIdsCache != null) {
    docIds = docIdsCache.get(prefixBuf);
    if (docIds != null) {
      return docIds;
    }
  }

  //read postingsEnum
  docIds = new IntsRef(termsEnum.docFreq());
  int docId;
  while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
    if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
      continue;
    }
    docIds.ints[docIds.length++] = docId;
  }
  if (docIds.length == 0)
    docIds = EMPTY_INTSREF;

  //cache
  if (docIdsCache != null) {
    ensureBufIsACopy();
    //clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
    docIdsCache.put(prefixBuf.clone(), docIds);
  }
  return docIds;
}
Example 11
Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
  List<MWESentenceContext> result = new ArrayList<>();

  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }

    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

    int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        int sentenceId = -1;
        if (payload != null) {
          sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
        }
        result.add(new MWESentenceContext(tString, sentenceId, start, end));
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}
Example 12
Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * checks advancing docs + positions
 */
public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null || rightDocs == null) {
    assertNull(leftDocs);
    assertNull(rightDocs);
    return;
  }

  int docid = -1;
  int averageGap = MAXDOC / (1+docFreq);
  int skipInterval = 16;
  while (true) {
    if (random().nextBoolean()) {
      // nextDoc()
      docid = leftDocs.nextDoc();
      assertEquals(docid, rightDocs.nextDoc());
    } else {
      // advance()
      int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
      docid = leftDocs.advance(skip);
      assertEquals(docid, rightDocs.advance(skip));
    }

    if (docid == DocIdSetIterator.NO_MORE_DOCS) {
      return;
    }
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't compare the payloads, it's allowed that one is empty etc
    }
  }
}
Example 13
Source File: AlfrescoLukeRequestHandler.java From SearchServices with GNU Lesser General Public License v3.0 | 5 votes |
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.next() == null) {
    // Ran off the end of the terms enum without finding any live docs with that field in them.
    return null;
  }
  PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
  final Bits liveDocs = reader.getLiveDocs();
  if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
      || (liveDocs != null && liveDocs.get(postingsEnum.docID()))) {
    return null;
  }
  return reader.document(postingsEnum.docID());
}
Example 14
Source File: FieldCacheImpl.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
  final int maxDoc = reader.maxDoc();

  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final PagedBytes bytes = new PagedBytes(15);

  int startTermsBPV;

  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }

  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?

  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;

    while (true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1+termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
Example 15
Source File: DirectoryTaxonomyReader.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
public int getOrdinal(FacetLabel cp) throws IOException {
  ensureOpen();
  if (cp.length == 0) {
    return ROOT_ORDINAL;
  }

  // First try to find the answer in the LRU cache:
  synchronized (ordinalCache) {
    Integer res = ordinalCache.get(cp);
    if (res != null) {
      if (res.intValue() < indexReader.maxDoc()) {
        // Since the cache is shared with DTR instances allocated from
        // doOpenIfChanged, we need to ensure that the ordinal is one that
        // this DTR instance recognizes.
        return res.intValue();
      } else {
        // if we get here, it means that the category was found in the cache,
        // but is not recognized by this TR instance. Therefore there's no
        // need to continue search for the path on disk, because we won't find
        // it there too.
        return TaxonomyReader.INVALID_ORDINAL;
      }
    }
  }

  // If we're still here, we have a cache miss. We need to fetch the
  // value from disk, and then also put it in the cache:
  int ret = TaxonomyReader.INVALID_ORDINAL;
  PostingsEnum docs = MultiTerms.getTermPostingsEnum(
      indexReader, Consts.FULL,
      new BytesRef(FacetsConfig.pathToString(cp.components, cp.length)), 0);
  if (docs != null && docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
    ret = docs.docID();

    // we only store the fact that a category exists, not its inexistence.
    // This is required because the caches are shared with new DTR instances
    // that are allocated from doOpenIfChanged. Therefore, if we only store
    // information about found categories, we cannot accidently tell a new
    // generation of DTR that a category does not exist.

    synchronized (ordinalCache) {
      ordinalCache.put(cp, Integer.valueOf(ret));
    }
  }

  return ret;
}
Example 16
Source File: DocVectors.java From semanticvectors with BSD 3-Clause "New" or "Revised" License | 4 votes |
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;

      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();

      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;

        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);

        // This may occur frequently if one term vector store is derived from multiple fields
        if (docsEnum == null) { continue; }

        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();

          if (flagConfig.fieldweight()) {
            //field weight: 1/sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1/Math.sqrt(numTerms));
          }

          docVector.superpose(
              termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  } catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }

  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements())
    docEnum.nextElement().getVector().normalize();
}
Example 17
Source File: VectorScoreQuery.java From solr-vector-scoring with Apache License 2.0 | 4 votes |
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
  return new CustomScoreProvider(context) {
    @Override
    public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
      float score = 0;
      double docVectorNorm = 0;
      LeafReader reader = context.reader();
      Terms terms = reader.getTermVector(docID, field);
      if (vector.size() != terms.size()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
      }
      TermsEnum iter = terms.iterator();
      BytesRef text;
      while ((text = iter.next()) != null) {
        String term = text.utf8ToString();
        float payloadValue = 0f;
        PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          while (freq-- > 0) postings.nextPosition();

          BytesRef payload = postings.getPayload();
          payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

          if (cosine)
            docVectorNorm += Math.pow(payloadValue, 2.0);
        }

        score = (float)(score + payloadValue * (vector.get(Integer.parseInt(term))));
      }

      if (cosine) {
        if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
        return (float)(score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
      }

      return score;
    }
  };
}
Example 18
Source File: FieldCacheImpl.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {

  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity

  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final int termCountHardLimit = maxDoc;

  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);

  int startBPV;

  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms*4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }

  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);

  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());

  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }

      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }

  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }

    @Override
    public int length() {
      return maxDoc;
    }
  };

  wrapper.setDocsWithField(reader, key.field, docsWithField, null);

  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}
Example 19
Source File: FilterableTermsEnum.java From Elasticsearch with Apache License 2.0 | 4 votes |
@Override
public boolean seekExact(BytesRef text) throws IOException {
  int docFreq = 0;
  long totalTermFreq = 0;
  for (Holder anEnum : enums) {
    if (anEnum.termsEnum.seekExact(text)) {
      if (anEnum.bits == null) {
        docFreq += anEnum.termsEnum.docFreq();
        if (docsEnumFlag == PostingsEnum.FREQS) {
          long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
          if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
            totalTermFreq = -1;
            continue;
          }
          totalTermFreq += leafTotalTermFreq;
        }
      } else {
        final PostingsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.postings(anEnum.docsEnum, docsEnumFlag);
        // 2 choices for performing same heavy loop - one attempts to calculate totalTermFreq and other does not
        if (docsEnumFlag == PostingsEnum.FREQS) {
          for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
            if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
              continue;
            }
            docFreq++;
            // docsEnum.freq() returns 1 if doc indexed with IndexOptions.DOCS_ONLY so no way of knowing if value
            // is really 1 or unrecorded when filtering like this
            totalTermFreq += docsEnum.freq();
          }
        } else {
          for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
            if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
              continue;
            }
            // docsEnum.freq() behaviour is undefined if docsEnumFlag==PostingsEnum.FLAG_NONE so don't bother with call
            docFreq++;
          }
        }
      }
    }
  }
  if (docFreq > 0) {
    currentDocFreq = docFreq;
    currentTotalTermFreq = totalTermFreq;
    current = text;
    return true;
  } else {
    currentDocFreq = NOT_FOUND;
    currentTotalTermFreq = NOT_FOUND;
    current = null;
    return false;
  }
}
Example 20
Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0 | 4 votes |
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader, int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  PostingsEnum dpEnum = null;
  while ((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq == true) {
      termInfo.add("tf", freq);
    }

    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }

    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}