Java Code Examples for org.apache.lucene.index.PostingsEnum#NO_MORE_DOCS
The following examples show how to use org.apache.lucene.index.PostingsEnum#NO_MORE_DOCS. This sentinel is inherited from DocIdSetIterator and equals Integer.MAX_VALUE; nextDoc() and advance() return it once the iterator is exhausted. Each example below is taken from an open-source project, named together with its license above the code.
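Before the project examples, here is a minimal sketch of the iteration idiom they all build on: call nextDoc() until it returns the NO_MORE_DOCS sentinel. The field and term names are hypothetical, and the LeafReader is assumed to be in scope.

// Minimal sketch (hypothetical "body"/"lucene" field and term) of the standard loop.
LeafReader reader = leafContext.reader();            // assumed to be in scope
PostingsEnum postings = reader.postings(new Term("body", "lucene"), PostingsEnum.FREQS);
if (postings != null) {                              // null if the term does not occur in this segment
  for (int doc = postings.nextDoc(); doc != PostingsEnum.NO_MORE_DOCS; doc = postings.nextDoc()) {
    int freq = postings.freq();                      // term frequency within this doc
    // process (doc, freq) ...
  }
}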
Example 1
Source File: FieldFeatureTFExtractorFactory.java From ltr4l with Apache License 2.0
@Override
public FieldFeatureExtractor[] create(LeafReaderContext context, Set<Integer> allDocs) throws IOException {
  FieldFeatureExtractor[] extractors = new FieldFeatureExtractor[terms.length];
  int i = 0;
  for (Term term : terms) {
    final TermsEnum termsEnum = getTermsEnum(context, term);
    if (termsEnum == null) {
      extractors[i] = new FieldFeatureNullExtractor();
    } else {
      extractors[i] = new FieldFeatureTFExtractor(termsEnum.postings(null, PostingsEnum.FREQS));
      // get it twice without reuse to clone it...
      PostingsEnum docs = termsEnum.postings(null, PostingsEnum.FREQS);
      for (int docId = docs.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = docs.nextDoc()) {
        allDocs.add(docId);
      }
    }
    i++;
  }
  return extractors;
}
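A note on the "get it twice without reuse" trick above: termsEnum.postings(reuse, flags) may recycle the PostingsEnum passed as its first argument, so passing null forces a fresh instance. Conversely, when iterating many terms sequentially you can pass the previous enum back in to cut allocation. A hedged sketch of that reuse pattern follows; getTermsEnum and the terms array are assumptions borrowed from the example above.

// Sketch of the reuse pattern: hand the previous PostingsEnum back so Lucene may recycle it.
PostingsEnum reuse = null;
for (Term term : terms) {                                  // 'terms' as in the example above
  TermsEnum termsEnum = getTermsEnum(context, term);       // assumed helper from the example
  if (termsEnum == null) continue;
  reuse = termsEnum.postings(reuse, PostingsEnum.FREQS);   // may return the recycled instance
  for (int docId = reuse.nextDoc(); docId != PostingsEnum.NO_MORE_DOCS; docId = reuse.nextDoc()) {
    // consume docId ...
  }
}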
Example 2
Source File: LuceneIndex.java From rdf4j with BSD 3-Clause "New" or "Revised" License
private static Document getDocument(LeafReader reader, Term term) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    int docId = docs.nextDoc();
    // PostingsEnum may contain deleted documents, we have to cope with them
    while (docId != PostingsEnum.NO_MORE_DOCS) {
      // if the document is deleted, skip it and continue
      Bits liveDocs = reader.getLiveDocs();
      if (liveDocs != null && !liveDocs.get(docId)) {
        docId = docs.nextDoc();
        continue;
      }
      if (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
        throw new IllegalStateException("Multiple Documents for term " + term.text());
      }
      return readDocument(reader, docId, null);
    }
  }
  return null;
}
Example 3
Source File: DocumentsImpl.java From lucene-solr with Apache License 2.0
@Override
public Optional<Integer> firstTermDoc() {
  if (tenum == null) {
    // terms enum is not set
    log.warn("Terms enum un-positioned.");
    return Optional.empty();
  }

  try {
    setPostingsIterator(tenum.postings(penum, PostingsEnum.ALL));

    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // no docs available for this term
      resetPostingsIterator();
      log.warn("No docs available for term: {} in field: {}.", BytesRefUtils.decode(tenum.term()), curField);
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
Example 4
Source File: DocumentsImpl.java From lucene-solr with Apache License 2.0
@Override
public Optional<Integer> nextTermDoc() {
  if (penum == null) {
    // postings enum is not initialized
    log.warn("Postings enum un-positioned for field: {}.", curField);
    return Optional.empty();
  }

  try {
    if (penum.nextDoc() == PostingsEnum.NO_MORE_DOCS) {
      // end of the iterator
      resetPostingsIterator();
      if (log.isInfoEnabled()) {
        log.info("Reached the end of the postings iterator for term: {} in field: {}", BytesRefUtils.decode(tenum.term()), curField);
      }
      return Optional.empty();
    } else {
      return Optional.of(penum.docID());
    }
  } catch (IOException e) {
    resetPostingsIterator();
    throw new LukeException(String.format(Locale.ENGLISH, "Term docs not available for field: %s.", curField), e);
  }
}
Example 5
Source File: TestIDVersionPostingsFormat.java From lucene-solr with Apache License 2.0
/** Returns docID if found, else -1. */
public int lookup(BytesRef id, long version) throws IOException {
  for (int seg = 0; seg < numSegs; seg++) {
    if (((IDVersionSegmentTermsEnum) termsEnums[seg]).seekExact(id, version)) {
      if (VERBOSE) {
        System.out.println("  found in seg=" + termsEnums[seg]);
      }
      postingsEnums[seg] = termsEnums[seg].postings(postingsEnums[seg], 0);
      int docID = postingsEnums[seg].nextDoc();
      if (docID != PostingsEnum.NO_MORE_DOCS && (liveDocs[seg] == null || liveDocs[seg].get(docID))) {
        lastVersion = ((IDVersionSegmentTermsEnum) termsEnums[seg]).getVersion();
        return docBases[seg] + docID;
      }
      assert hasDeletions;
    }
  }
  return -1;
}
Example 6
Source File: LuceneUtils.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Gets 1 - entropy (i.e., 1 + sum of p*log(p)) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum ((P_ij log2(P_ij)) / log2(n))
 * where P_ij = frequency of term i in doc j / global frequency of term i,
 * and n = number of documents in the collection.
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation.
 * @param term the term whose entropy you want
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field() + "_" + term.text()))
    return termEntropy.get(term.field() + "_" + term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); // frequency in this document
      p = p / gf;                 // frequency across all documents
      entropy += p * (Math.log(p) / Math.log(2)); // sum of P log(P)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field() + "_" + term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
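For intuition about the weighting computed above: a term that occurs once in each of n documents has P_ij = 1/n for every document, so the sum is n * (1/n) * log2(1/n) / log2(n) = -1 and the weight is 1 + (-1) = 0. A term concentrated entirely in one document has P = 1 and log2(1) = 0, giving weight 1. Focally distributed terms therefore score higher.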
Example 7
Source File: LuceneIndex.java From rdf4j with BSD 3-Clause "New" or "Revised" License
private static void addDocuments(LeafReader reader, Term term, Collection<Document> documents) throws IOException {
  PostingsEnum docs = reader.postings(term);
  if (docs != null) {
    int docId;
    while ((docId = docs.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      Bits liveDocs = reader.getLiveDocs();
      // Maybe some of the docs have been deleted! Check that too..
      if (liveDocs != null && !liveDocs.get(docId)) {
        continue;
      }
      Document document = readDocument(reader, docId, null);
      documents.add(document);
    }
  }
}
Example 8
Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
  List<MWESentenceContext> result = new ArrayList<>();

  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }

    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

    int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        int sentenceId = -1;
        if (payload != null) {
          sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
        }
        result.add(new MWESentenceContext(tString, sentenceId, start, end));
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}
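Example 8 above (and Example 13 below) read positions, offsets, and payloads from a term vector; that only works if the field was indexed with those attributes enabled on its term vectors. A minimal sketch of the indexing-side setup such code assumes follows; the field name and text are hypothetical.

// Hypothetical indexing-side configuration assumed by the term-vector examples:
// positions, offsets, and payloads must be stored for the reads above to succeed.
FieldType fieldType = new FieldType(TextField.TYPE_NOT_STORED);
fieldType.setStoreTermVectors(true);
fieldType.setStoreTermVectorPositions(true);
fieldType.setStoreTermVectorOffsets(true);
fieldType.setStoreTermVectorPayloads(true);
fieldType.freeze();
Document doc = new Document();
doc.add(new Field("content", "some analyzed text", fieldType));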
Example 9
Source File: TermPrefixCursor.java From SolrTextTagger with Apache License 2.0
/**
 * Returns an IntsRef, either cached or read from postingsEnum. Never null.
 * @param postingsEnum the enum to read from when there is no cached entry
 */
private IntsRef postingsEnumToIntsRef(PostingsEnum postingsEnum, Bits liveDocs) throws IOException {
  // (The cache can have empty IntsRefs)

  // look up prefixBuf in the cache
  if (docIdsCache != null) {
    docIds = docIdsCache.get(prefixBuf);
    if (docIds != null) {
      return docIds;
    }
  }

  // read postingsEnum
  docIds = new IntsRef(termsEnum.docFreq());
  int docId;
  while ((docId = postingsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
    if (liveDocs != null && !liveDocs.get(postingsEnum.docID())) {
      continue;
    }
    docIds.ints[docIds.length++] = docId;
  }
  if (docIds.length == 0)
    docIds = EMPTY_INTSREF;

  // cache
  if (docIdsCache != null) {
    ensureBufIsACopy();
    // clone is shallow; that's okay as the prefix isn't overwritten; it's just appended to
    docIdsCache.put(prefixBuf.clone(), docIds);
  }
  return docIds;
}
Example 10
Source File: LuceneIndexTest.java From rdf4j with BSD 3-Clause "New" or "Revised" License
private static boolean next(PostingsEnum docs) throws IOException {
  return (docs.nextDoc() != PostingsEnum.NO_MORE_DOCS);
}
Example 11
Source File: DrillSidewaysScorer.java From lucene-solr with Apache License 2.0
/**
 * Used when base query is highly constraining vs the
 * drilldowns, or when the docs must be scored at once
 * (i.e., like BooleanScorer2, not BooleanScorer). In
 * this case we just .next() on base and .advance() on
 * the dim filters.
 */
private void doQueryFirstScoring(Bits acceptDocs, LeafCollector collector, DocsAndCost[] dims) throws IOException {
  //if (DEBUG) {
  //  System.out.println("  doQueryFirstScoring");
  //}
  int docID = baseScorer.docID();

  nextDoc:
  while (docID != PostingsEnum.NO_MORE_DOCS) {
    if (acceptDocs != null && acceptDocs.get(docID) == false) {
      docID = baseIterator.nextDoc();
      continue;
    }
    LeafCollector failedCollector = null;
    for (DocsAndCost dim : dims) {
      // TODO: should we sort this 2nd dimension of
      // docsEnums from most frequent to least?
      if (dim.approximation.docID() < docID) {
        dim.approximation.advance(docID);
      }

      boolean matches = false;
      if (dim.approximation.docID() == docID) {
        if (dim.twoPhase == null) {
          matches = true;
        } else {
          matches = dim.twoPhase.matches();
        }
      }

      if (matches == false) {
        if (failedCollector != null) {
          // More than one dim fails on this document, so
          // it's neither a hit nor a near-miss; move to
          // next doc:
          docID = baseIterator.nextDoc();
          continue nextDoc;
        } else {
          failedCollector = dim.sidewaysLeafCollector;
        }
      }
    }

    collectDocID = docID;

    // TODO: we could score on demand instead since we are
    // daat here:
    collectScore = baseScorer.score();

    if (failedCollector == null) {
      // Hit passed all filters, so it's "real":
      collectHit(collector, dims);
    } else {
      // Hit missed exactly one filter:
      collectNearMiss(failedCollector);
    }

    docID = baseIterator.nextDoc();
  }
}
Example 12
Source File: DocVectors.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;

      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();

      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;

        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);

        // This may occur frequently if one term vector store is derived from multiple fields
        if (docsEnum == null) {
          continue;
        }

        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();

          if (flagConfig.fieldweight()) {
            // field weight: 1 / sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1 / Math.sqrt(numTerms));
          }

          docVector.superpose(termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  } catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }

  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements())
    docEnum.nextElement().getVector().normalize();
}
Example 13
Source File: FrequencyCtxWindowBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                       Map<Integer, Integer> sentenceBoundaries) throws IOException {
  List<MWEInSentence> result = new ArrayList<>();

  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }

    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

    int doc = postingsEnum.nextDoc(); // this should be just 1 doc, i.e., the constraint for getting this TV
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        SentenceContext sentenceContextInfo = null;
        if (payload != null) {
          sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
        }
        if (sentenceContextInfo == null) {
          result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
        } else {
          result.add(new MWEInSentence(tString, start, end,
              sentenceContextInfo.getFirstTokenIdx(),
              sentenceContextInfo.getLastTokenIdx(),
              sentenceContextInfo.getSentenceId()));

          Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
          if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx()) {
            sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                sentenceContextInfo.getLastTokenIdx());
          }
        }
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}