Java Code Examples for org.apache.lucene.index.PostingsEnum#NO_MORE_DOCS
The following examples show how to use org.apache.lucene.index.PostingsEnum#NO_MORE_DOCS. This sentinel is inherited from DocIdSetIterator and equals Integer.MAX_VALUE; nextDoc() and advance() return it once the iterator is exhausted. Each example below is taken from an open-source project, named together with its license above the code.
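Before the project examples, here is a minimal sketch of the iteration idiom they all build on: call nextDoc() until it returns the NO_MORE_DOCS sentinel. The field and term names are hypothetical, and the LeafReader is assumed to be in scope.

// Minimal sketch (hypothetical "body"/"lucene" field and term) of the standard loop.
LeafReader reader = leafContext.reader();            // assumed to be in scope
PostingsEnum postings = reader.postings(new Term("body", "lucene"), PostingsEnum.FREQS);
if (postings != null) {                              // null if the term does not occur in this segment
  for (int doc = postings.nextDoc(); doc != PostingsEnum.NO_MORE_DOCS; doc = postings.nextDoc()) {
    int freq = postings.freq();                      // term frequency within this doc
    // process (doc, freq) ...
  }
}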
Example 1
Source File: FieldFeatureTFExtractorFactory.java From ltr4l with Apache License 2.0
@Override
public FieldFeatureExtractor[] create(LeafReaderContext context, Set<Integer> allDocs) throws IOException {
  FieldFeatureExtractor[] extractors = new FieldFeatureExtractor[terms.length];
  int i = 0;
  for (Term term : terms) {
    final TermsEnum termsEnum = getTermsEnum(context, term);
    if (termsEnum == null) {
      extractors[i] = new FieldFeatureNullExtractor();
    } else {
      extractors[i] = new FieldFeatureTFExtractor(termsEnum.postings(null, PostingsEnum.FREQS));
      // get it twice without reuse to clone it...
      PostingsEnum docs = termsEnum.postings(null, PostingsEnum.FREQS);
      for (int docId = docs.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = docs.nextDoc()) {
        allDocs.add(docId);
      }
    }
    i++;
  }
  return extractors;
}
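A note on the "get it twice without reuse" trick above: termsEnum.postings(reuse, flags) may recycle the PostingsEnum passed as its first argument, so passing null forces a fresh instance. Conversely, when iterating many terms sequentially you can pass the previous enum back in to cut allocation. A hedged sketch of that reuse pattern follows; getTermsEnum and the terms array are assumptions borrowed from the example above.

// Sketch of the reuse pattern: hand the previous PostingsEnum back so Lucene may recycle it.
PostingsEnum reuse = null;
for (Term term : terms) {                                  // 'terms' as in the example above
  TermsEnum termsEnum = getTermsEnum(context, term);       // assumed helper from the example
  if (termsEnum == null) continue;
  reuse = termsEnum.postings(reuse, PostingsEnum.FREQS);   // may return the recycled instance
  for (int docId = reuse.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = reuse.nextDoc()) {
    // consume docId ...
  }
}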
Example 2
Source File: LuceneIndex.java From rdf4j with BSD 3-Clause "New" or "Revised" License
private static Document getDocument(LeafReader reader, Term term) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    int docId = docs.nextDoc();
    // PostingsEnum may contain deleted documents, we have to cope with them
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // if the document is deleted, skip it and continue
      Bits liveDocs = reader.getLiveDocs();
      if (liveDocs != null && !liveDocs.get(docId)) {
        docId = docs.nextDoc();
        continue;
      }
      if (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
        throw new IllegalStateException("Multiple Documents for term " + term.text());
      }
      return readDocument(reader, docId, null);
    }
  }
  return null;
}
Example 3
Source File: DocumentsImpl.java From lucene-solr with Apache License 2.0
@Override
public Optional<Integer> firstTermDoc() {
  if (tenum == null) {
    // terms enum is not set
    log.warn("Terms enum un-positioned.");
    return Optional.empty();
  }

  try {
    setPostingsIterator(tenum.postings(penum, PostingsEnum.ALL));

    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // no docs available for this term
      resetPostingsIterator();
      log.warn("No docs available for term: {} in field: {}.", BytesRefUtils.decode(tenum.term()), curField);
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
Example 4
Source File: DocumentsImpl.java From lucene-solr with Apache License 2.0
@Override
public Optional<Integer> nextTermDoc() {
  if (penum == null) {
    // postings enum is not initialized
    log.warn("Postings enum un-positioned for field: {}.", curField);
    return Optional.empty();
  }

  try {
    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // end of the iterator
      resetPostingsIterator();
      if (log.isInfoEnabled()) {
        log.info("Reached the end of the postings iterator for term: {} in field: {}", BytesRefUtils.decode(tenum.term()), curField);
      }
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
Example 5
Source File: TestIDVersionPostingsFormat.java From lucene-solr with Apache License 2.0
/** Returns docID if found, else -1. */
public int lookup(BytesRef id, long version) throws IOException {
  for (int seg = 0; seg < numSegs; seg++) {
    if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
      if (VERBOSE) {
        System.out.println("  found in seg=" + termsEnums[seg]);
      }
      postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], 0);
      int docID = postingsEnums[seg].nextDoc();
      if (docID != PostingsEnum.NO_MORE_DOCS && (liveDocs[seg] == null || liveDocs[seg].get(docID))) {
        lastVersion = ((IDVersionSegmentTermsEnum) termsEnums[seg]).getVersion();
        return docBases[seg] + docID;
      }
      assert hasDeletions;
    }
  }
  return -1;
}
Example 6
Source File: LuceneUtils.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Gets 1 - entropy (i.e., 1 + sum of p*log(p)) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum ((P_ij log2(P_ij)) / log2(n))
 * where P_ij = frequency of term i in doc j / global frequency of term i,
 * and n = number of documents in the collection.
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation.
 * @param term the term whose entropy you want
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field() + "_" + term.text()))
    return termEntropy.get(term.field() + "_" + term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); // frequency in this document
      p = p / gf;                 // frequency across all documents
      entropy += p * (Math.log(p) / Math.log(2)); // sum of P log(P)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field() + "_" + term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
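For intuition about the weighting computed above: a term that occurs once in each of n documents has P_ij = 1/n for every document, so the sum is n * (1/n) * log2(1/n) / log2(n) = -1 and the weight is 1 + (-1) = 0. A term concentrated entirely in one document has P = 1 and log2(1) = 0, giving weight 1. Focally distributed terms therefore score higher.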
Example 7
Source File: LuceneIndex.java From rdf4j with BSD 3-Clause "New" or "Revised" License
private static void addDocuments(LeafReader reader, Term term, Collection<Document> documents) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    int docId;
    while ((docId = docs.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      Bits liveDocs = reader.getLiveDocs();
      // Maybe some of the docs have been deleted! Check that too..
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      Document document = readDocument(reader, docId, null);
      documents.add(document);
    }
  }
}
Example 8
Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
  List<MWESentenceContext> result = new ArrayList<>();

  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }

    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

    int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        int sentenceId = -1;
        if (payload != null) {
          sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
        }
        result.add(new MWESentenceContext(tString, sentenceId, start, end));
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}
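Example 8 above (and Example 13 below) read positions, offsets, and payloads from a term vector; that only works if the field was indexed with those attributes enabled on its term vectors. A minimal sketch of the indexing-side setup such code assumes follows; the field name and text are hypothetical.

// Hypothetical indexing-side configuration assumed by the term-vector examples:
// positions, offsets, and payloads must be stored for the reads above to succeed.
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
fieldType.setStoreTermVectorOffsets(true);
fieldType.setStoreTermVectorPayloads(true);
fieldType.freeze();
Document doc = new Document();
doc.add(new Field("content", "some analyzed text", fieldType));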
Example 9
Source File: TermPrefixCursor.java From SolrTextTagger with Apache License 2.0
/**
 * Returns an IntsRef, either cached or read from postingsEnum. Never null.
 * @param postingsEnum the enum to read from when there is no cached entry
 */
private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
  // (The cache can have empty IntsRefs)

  // look up prefixBuf in the cache
  if (docIdsCache != null) {
    docIds = docIdsCache.get(prefixBuf);
    if (docIds != null) {
      return docIds;
    }
  }

  // read postingsEnum
  docIds = new IntsRef(termsEnum.docFreq());
  int docId;
  while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
    if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
      continue;
    }
    docIds.ints[docIds.length++] = docId;
  }
  if (docIds.length == 0)
    docIds = EMPTY_INTSREF;

  // cache
  if (docIdsCache != null) {
    ensureBufIsACopy();
    // clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
    docIdsCache.put(prefixBuf.clone(), docIds);
  }
  return docIds;
}
Example 10
Source File: LuceneIndexTest.java From rdf4j with BSD 3-Clause "New" or "Revised" License
private static boolean next(PostingsEnum docs) throws IOException {
  return (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS);
}
Example 11
Source File: DrillSidewaysScorer.java From lucene-solr with Apache License 2.0
/**
 * Used when base query is highly constraining vs the
 * drilldowns, or when the docs must be scored at once
 * (i.e., like BooleanScorer2, not BooleanScorer). In
 * this case we just .next() on base and .advance() on
 * the dim filters.
 */
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
  //if (DEBUG) {
  //  System.out.println("  doQueryFirstScoring");
  //}
  int docID = baseScorer.docID();

  nextDoc:
  while (docID != PostingsEnum.NO_MORE_DOCS) {
    if (acceptDocs != null && acceptDocs.get(docID) == false) {
      docID = baseIterator.nextDoc();
      continue;
    }
    LeafCollector failedCollector = null;
    for (DocsAndCost dim : dims) {
      // TODO: should we sort this 2nd dimension of
      // docsEnums from most frequent to least?
      if (dim.approximation.docID() < docID) {
        dim.approximation.advance(docID);
      }

      boolean matches = false;
      if (dim.approximation.docID() == docID) {
        if (dim.twoPhase == null) {
          matches = true;
        } else {
          matches = dim.twoPhase.matches();
        }
      }

      if (matches == false) {
        if (failedCollector != null) {
          // More than one dim fails on this document, so
          // it's neither a hit nor a near-miss; move to
          // next doc:
          docID = baseIterator.nextDoc();
          continue nextDoc;
        } else {
          failedCollector = dim.sidewaysLeafCollector;
        }
      }
    }

    collectDocID = docID;

    // TODO: we could score on demand instead since we are
    // daat here:
    collectScore = baseScorer.score();

    if (failedCollector == null) {
      // Hit passed all filters, so it's "real":
      collectHit(collector, dims);
    } else {
      // Hit missed exactly one filter:
      collectNearMiss(failedCollector);
    }

    docID = baseIterator.nextDoc();
  }
}
Example 12
Source File: DocVectors.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;

      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();

      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;

        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);

        // This may occur frequently if one term vector store is derived from multiple fields
        if (docsEnum == null) {
          continue;
        }

        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();

          if (flagConfig.fieldweight()) {
            // field weight: 1 / sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1 / Math.sqrt(numTerms));
          }

          docVector.superpose(termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  } catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }

  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements())
    docEnum.nextElement().getVector().normalize();
}
Example 13
Source File: FrequencyCtxWindowBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                       Map<Integer, Integer> sentenceBoundaries) throws IOException {
  List<MWEInSentence> result = new ArrayList<>();

  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }

    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

    int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        SentenceContext sentenceContextInfo = null;
        if (payload != null) {
          sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
        }
        if (sentenceContextInfo == null) {
          result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
        } else {
          result.add(new MWEInSentence(tString, start, end,
              sentenceContextInfo.getFirstTokenIdx(),
              sentenceContextInfo.getLastTokenIdx(),
              sentenceContextInfo.getSentenceId()));

          Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
          if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx()) {
            sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                sentenceContextInfo.getLastTokenIdx());
          }
        }
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}