Java Code Examples for org.apache.lucene.index.PostingsEnum#freq()
The following examples show how to use
org.apache.lucene.index.PostingsEnum#freq().
The project and source file each example comes from are noted above it.
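Before the project examples, here is a minimal, self-contained sketch of the call pattern behind all of them: obtain a TermsEnum for a field, seek to a term, request a PostingsEnum with at least PostingsEnum.FREQS, and call freq() only after positioning the enum on a document. The index path ("/path/to/index"), field name ("body"), and term text ("lucene") are illustrative placeholders, not values taken from the examples below.

import java.io.IOException;
import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.PostingsEnum;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.search.DocIdSetIterator;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class FreqDemo {
  public static void main(String[] args) throws IOException {
    // Open an existing index; the path is a placeholder.
    try (IndexReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
      Terms terms = MultiTerms.getTerms(reader, "body");
      if (terms == null) {
        return; // field not present or not indexed
      }
      TermsEnum termsEnum = terms.iterator();
      if (termsEnum.seekExact(new BytesRef("lucene"))) {
        // Pass null to allocate a fresh enum; request term frequencies.
        PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          // freq() is only defined once the enum is positioned on a document.
          System.out.println("doc=" + postings.docID() + " freq=" + postings.freq());
        }
      }
    }
  }
}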
Example 1
Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms, TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
    // start term, optimized writing
    BytesRef term = termIter.next();
    spare.copyUTF8Bytes(term);
    builder.startObject(spare.toString());
    buildTermStatistics(builder, termIter);
    // finally write the term vectors
    PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
    int termFreq = posEnum.freq();
    builder.field(FieldStrings.TERM_FREQ, termFreq);
    initMemory(curTerms, termFreq);
    initValues(curTerms, posEnum, termFreq);
    buildValues(builder, curTerms, termFreq);
    buildScore(builder, boostAtt);
    builder.endObject();
}
Example 2
Source File: PhraseHelper.java From lucene-solr with Apache License 2.0
@Override
public void collectLeaf(PostingsEnum postings, int position, Term term) throws IOException {
  if (!fieldMatcher.test(term.field())) {
    return;
  }
  SpanCollectedOffsetsEnum offsetsEnum = termToOffsetsEnums.get(term.bytes());
  if (offsetsEnum == null) {
    // If it's pos insensitive we handle it outside of PhraseHelper.  term.field() is from the Query.
    if (positionInsensitiveTerms.contains(term.bytes())) {
      return;
    }
    offsetsEnum = new SpanCollectedOffsetsEnum(term.bytes(), postings.freq());
    termToOffsetsEnums.put(term.bytes(), offsetsEnum);
  }
  offsetsEnum.add(postings.startOffset(), postings.endOffset());
}
Example 3
Source File: LuceneUtils.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Gets the 1 - entropy (i.e. 1 + plogp) of a term,
 * a function that favors terms that are focally distributed.
 * We use the definition of log-entropy weighting provided in
 * Martin and Berry (2007):
 * Entropy = 1 + sum ((Pij log2(Pij)) / log2(n))
 * where Pij = frequency of term i in doc j / global frequency of term i,
 * n = number of documents in collection.
 * @param term whose entropy you want
 * Thanks to Vidya Vasuki for adding the hash table to
 * eliminate redundant calculation
 */
private float getEntropy(Term term) {
  if (termEntropy.containsKey(term.field() + "_" + term.text()))
    return termEntropy.get(term.field() + "_" + term.text());
  int gf = getGlobalTermFreq(term);
  double entropy = 0;
  try {
    PostingsEnum docsEnum = this.getDocsForTerm(term);
    while ((docsEnum.nextDoc()) != PostingsEnum.NO_MORE_DOCS) {
      double p = docsEnum.freq(); // frequency in this document
      p = p / gf;                 // frequency across all documents
      entropy += p * (Math.log(p) / Math.log(2)); // sum of P log(P)
    }
    int n = this.getNumDocs();
    double log2n = Math.log(n) / Math.log(2);
    entropy = entropy / log2n;
  } catch (IOException e) {
    logger.info("Couldn't get term entropy for term " + term.text());
  }
  termEntropy.put(term.field() + "_" + term.text(), 1 + (float) entropy);
  return (float) (1 + entropy);
}
Example 4
Source File: CodecCollector.java From mtas with Apache License 2.0
/**
 * Compute termvector number full.
 *
 * @param docSet the doc set
 * @param termDocId the term doc id
 * @param termsEnum the terms enum
 * @param lrc the lrc
 * @param postingsEnum the postings enum
 * @param positionsData the positions data
 * @return the termvector number full
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static TermvectorNumberFull computeTermvectorNumberFull(
    List<Integer> docSet, int termDocId, TermsEnum termsEnum,
    LeafReaderContext lrc, PostingsEnum postingsEnum,
    Map<Integer, Integer> positionsData) throws IOException {
  TermvectorNumberFull result = new TermvectorNumberFull(docSet.size());
  Iterator<Integer> docIterator = docSet.iterator();
  int localTermDocId = termDocId;
  postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
  while (docIterator.hasNext()) {
    int docId = docIterator.next() - lrc.docBase;
    if (docId >= localTermDocId
        && ((docId == localTermDocId)
            || ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
      result.args[result.docNumber] = postingsEnum.freq();
      result.positions[result.docNumber] = (positionsData == null) ? 0
          : positionsData.get(docId + lrc.docBase);
      result.docNumber++;
    }
  }
  return result;
}
Example 5
Source File: TermVectorEntry.java From lucene-solr with Apache License 2.0
/**
 * Returns a new term vector entry representing the specified term, and optionally, positions.
 *
 * @param te - positioned terms iterator
 * @return term vector entry
 * @throws IOException - if there is a low level IO error.
 */
static TermVectorEntry of(TermsEnum te) throws IOException {
  Objects.requireNonNull(te);
  String termText = BytesRefUtils.decode(te.term());
  List<TermVectorEntry.TermVectorPosition> tvPositions = new ArrayList<>();
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  pe.nextDoc();
  int freq = pe.freq();
  for (int i = 0; i < freq; i++) {
    int pos = pe.nextPosition();
    if (pos < 0) {
      // no position information available
      continue;
    }
    TermVectorPosition tvPos = TermVectorPosition.of(pos, pe);
    tvPositions.add(tvPos);
  }
  return new TermVectorEntry(termText, te.totalTermFreq(), tvPositions);
}
Example 6
Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0
/**
 * checks docs + freqs + positions + payloads, sequentially
 */
public void assertDocsAndPositionsEnum(PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  assertNotNull(leftDocs);
  assertNotNull(rightDocs);
  assertEquals(-1, leftDocs.docID());
  assertEquals(-1, rightDocs.docID());
  int docid;
  while ((docid = leftDocs.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
    assertEquals(docid, rightDocs.nextDoc());
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't assert offsets/payloads, they are allowed to be different
    }
  }
  assertEquals(DocIdSetIterator.NO_MORE_DOCS, rightDocs.nextDoc());
}
Example 7
Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
    List<MWESentenceContext> result = new ArrayList<>();
    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                int sentenceId = -1;
                if (payload != null) {
                    sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
                }
                result.add(new MWESentenceContext(tString, sentenceId, start, end));
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
Example 8
Source File: ESIndex.java From pyramid with Apache License 2.0
private Map<Integer, String> getTermVectorWithException(String field, String id) throws IOException {
    TermVectorsResponse response = client.prepareTermVector(indexName, documentType, id)
            .setOffsets(false).setPositions(true).setFieldStatistics(false)
            .setTermStatistics(false)
            .setSelectedFields(field)
            .execute().actionGet();
    Map<Integer, String> map = new HashMap<>();
    Terms terms = response.getFields().terms(field);
    if (terms == null) {
        return map;
    }
    TermsEnum iterator = terms.iterator();
    PostingsEnum postings = null;
    for (BytesRef termBytes = null; (termBytes = iterator.next()) != null; ) {
        String term = termBytes.utf8ToString();
        postings = iterator.postings(postings, PostingsEnum.ALL);
        //there can only be one doc since we are getting with id. get the doc and the position
        postings.nextDoc();
        int tf = postings.freq();
        for (int i = 0; i < tf; i++) {
            int pos = postings.nextPosition();
            map.put(pos, term);
        }
    }
    return map;
}
Example 9
Source File: CodecCollector.java From mtas with Apache License 2.0
/**
 * Compute termvector number basic.
 *
 * @param docSet the doc set
 * @param termDocId the term doc id
 * @param termsEnum the terms enum
 * @param r the r
 * @param lrc the lrc
 * @param postingsEnum the postings enum
 * @return the termvector number basic
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
    LeafReaderContext lrc, PostingsEnum postingsEnum) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if ((docSet.size() == r.numDocs()) && !hasDeletedDocuments) {
    try {
      return computeTermvectorNumberBasic(termsEnum, r);
    } catch (IOException e) {
      log.debug("problem", e);
      // problem
    }
  }
  result.docNumber = 0;
  result.valueSum[0] = 0;
  int localTermDocId = termDocId;
  Iterator<Integer> docIterator = docSet.iterator();
  postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
  int docId;
  while (docIterator.hasNext()) {
    docId = docIterator.next() - lrc.docBase;
    if (docId >= localTermDocId
        && ((docId == localTermDocId)
            || ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
      result.docNumber++;
      result.valueSum[0] += postingsEnum.freq();
    }
    if (localTermDocId == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
  }
  return result;
}
Example 10
Source File: TestBlockPostingsFormat3.java From lucene-solr with Apache License 2.0
/**
 * checks advancing docs + positions
 */
public void assertPositionsSkipping(int docFreq, PostingsEnum leftDocs, PostingsEnum rightDocs) throws Exception {
  if (leftDocs == null || rightDocs == null) {
    assertNull(leftDocs);
    assertNull(rightDocs);
    return;
  }
  int docid = -1;
  int averageGap = MAXDOC / (1 + docFreq);
  int skipInterval = 16;
  while (true) {
    if (random().nextBoolean()) {
      // nextDoc()
      docid = leftDocs.nextDoc();
      assertEquals(docid, rightDocs.nextDoc());
    } else {
      // advance()
      int skip = docid + (int) Math.ceil(Math.abs(skipInterval + random().nextGaussian() * averageGap));
      docid = leftDocs.advance(skip);
      assertEquals(docid, rightDocs.advance(skip));
    }
    if (docid == DocIdSetIterator.NO_MORE_DOCS) {
      return;
    }
    int freq = leftDocs.freq();
    assertEquals(freq, rightDocs.freq());
    for (int i = 0; i < freq; i++) {
      assertEquals(leftDocs.nextPosition(), rightDocs.nextPosition());
      // we don't compare the payloads, it's allowed that one is empty etc
    }
  }
}
Example 11
Source File: TaxonomyIndexArrays.java From lucene-solr with Apache License 2.0
private void initParents(IndexReader reader, int first) throws IOException {
  if (reader.maxDoc() == first) {
    return;
  }

  // it's ok to use MultiTerms because we only iterate on one posting list.
  // breaking it to loop over the leaves() only complicates code for no
  // apparent gain.
  PostingsEnum positions = MultiTerms.getTermPostingsEnum(reader,
      Consts.FIELD_PAYLOADS, Consts.PAYLOAD_PARENT_BYTES_REF,
      PostingsEnum.PAYLOADS);

  // shouldn't really happen, if it does, something's wrong
  if (positions == null || positions.advance(first) == DocIdSetIterator.NO_MORE_DOCS) {
    throw new CorruptIndexException("Missing parent data for category " + first, reader.toString());
  }

  int num = reader.maxDoc();
  for (int i = first; i < num; i++) {
    if (positions.docID() == i) {
      if (positions.freq() == 0) { // shouldn't happen
        throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
      }

      parents[i] = positions.nextPosition();

      if (positions.nextDoc() == DocIdSetIterator.NO_MORE_DOCS) {
        if (i + 1 < num) {
          throw new CorruptIndexException("Missing parent data for category " + (i + 1), reader.toString());
        }
        break;
      }
    } else { // this shouldn't happen
      throw new CorruptIndexException("Missing parent data for category " + i, reader.toString());
    }
  }
}
Example 12
Source File: TermMatchesIterator.java From lucene-solr with Apache License 2.0
/**
 * Create a new {@link TermMatchesIterator} for the given term and postings list
 */
TermMatchesIterator(Query query, PostingsEnum pe) throws IOException {
  this.pe = pe;
  this.query = query;
  this.upto = pe.freq();
}
Example 13
Source File: TermIntervalsSource.java From lucene-solr with Apache License 2.0
static IntervalMatchesIterator matches(TermsEnum te, int doc, String field) throws IOException {
  TermQuery query = new TermQuery(new Term(field, te.term()));
  PostingsEnum pe = te.postings(null, PostingsEnum.OFFSETS);
  if (pe.advance(doc) != doc) {
    return null;
  }
  return new IntervalMatchesIterator() {

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    int upto = pe.freq();
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      if (upto <= 0) {
        pos = IntervalIterator.NO_MORE_INTERVALS;
        return false;
      }
      upto--;
      pos = pe.nextPosition();
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      return query;
    }
  };
}
Example 14
Source File: PayloadFilteredTermIntervalsSource.java From lucene-solr with Apache License 2.0
private IntervalMatchesIterator matches(TermsEnum te, int doc) throws IOException {
  PostingsEnum pe = te.postings(null, PostingsEnum.ALL);
  if (pe.advance(doc) != doc) {
    return null;
  }
  return new IntervalMatchesIterator() {

    @Override
    public int gaps() {
      return 0;
    }

    @Override
    public int width() {
      return 1;
    }

    int upto = pe.freq();
    int pos = -1;

    @Override
    public boolean next() throws IOException {
      do {
        if (upto <= 0) {
          pos = IntervalIterator.NO_MORE_INTERVALS;
          return false;
        }
        upto--;
        pos = pe.nextPosition();
      } while (filter.test(pe.getPayload()) == false);
      return true;
    }

    @Override
    public int startPosition() {
      return pos;
    }

    @Override
    public int endPosition() {
      return pos;
    }

    @Override
    public int startOffset() throws IOException {
      return pe.startOffset();
    }

    @Override
    public int endOffset() throws IOException {
      return pe.endOffset();
    }

    @Override
    public MatchesIterator getSubMatches() {
      return null;
    }

    @Override
    public Query getQuery() {
      throw new UnsupportedOperationException();
    }
  };
}
Example 15
Source File: TestPerfTasksLogic.java From lucene-solr with Apache License 2.0
/**
 * Test ReadTokensTask
 */
public void testReadTokens() throws Exception {

  // We will call ReadTokens on this many docs
  final int NUM_DOCS = 20;

  // Read tokens from first NUM_DOCS docs from Reuters and
  // then build index from the same docs
  String algLines1[] = {
    "# ----- properties ",
    "analyzer=org.apache.lucene.analysis.core.WhitespaceAnalyzer",
    "content.source=org.apache.lucene.benchmark.byTask.feeds.LineDocSource",
    "docs.file=" + getReuters20LinesFile(),
    "# ----- alg ",
    "{ReadTokens}: " + NUM_DOCS,
    "ResetSystemErase",
    "CreateIndex",
    "{AddDoc}: " + NUM_DOCS,
    "CloseIndex",
  };

  // Run algo
  Benchmark benchmark = execBenchmark(algLines1);

  List<TaskStats> stats = benchmark.getRunData().getPoints().taskStats();

  // Count how many tokens all ReadTokens saw
  int totalTokenCount1 = 0;
  for (final TaskStats stat : stats) {
    if (stat.getTask().getName().equals("ReadTokens")) {
      totalTokenCount1 += stat.getCount();
    }
  }

  // Separately count how many tokens are actually in the index:
  IndexReader reader = DirectoryReader.open(benchmark.getRunData().getDirectory());
  assertEquals(NUM_DOCS, reader.numDocs());

  int totalTokenCount2 = 0;

  Collection<String> fields = FieldInfos.getIndexedFields(reader);

  for (String fieldName : fields) {
    if (fieldName.equals(DocMaker.ID_FIELD) || fieldName.equals(DocMaker.DATE_MSEC_FIELD) || fieldName.equals(DocMaker.TIME_SEC_FIELD)) {
      continue;
    }
    Terms terms = MultiTerms.getTerms(reader, fieldName);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (termsEnum.next() != null) {
      docs = TestUtil.docs(random(), termsEnum, docs, PostingsEnum.FREQS);
      while (docs.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
        totalTokenCount2 += docs.freq();
      }
    }
  }
  reader.close();

  // Make sure they are the same
  assertEquals(totalTokenCount1, totalTokenCount2);
}
Example 16
Source File: OffsetsEnum.java From lucene-solr with Apache License 2.0
public OfPostings(BytesRef term, PostingsEnum postingsEnum) throws IOException {
  // delegate, using freq() as the term frequency for the current document
  this(term, postingsEnum.freq(), postingsEnum);
}
Example 17
Source File: DocVectors.java From semanticvectors with BSD 3-Clause "New" or "Revised" License
/**
 * Creates doc vectors, iterating over terms.
 */
private void trainDocVectors() throws IOException {
  VerbatimLogger.info("Building document vectors ... ");
  Enumeration<ObjectVector> termEnum = termVectors.getAllVectors();
  try {
    int tc = 0;
    while (termEnum.hasMoreElements()) {
      // Output progress counter.
      if ((tc % 10000 == 0) || (tc < 10000 && tc % 1000 == 0)) {
        VerbatimLogger.info("Processed " + tc + " terms ... ");
      }
      tc++;

      ObjectVector termVectorObject = termEnum.nextElement();
      Vector termVector = termVectorObject.getVector();
      String word = (String) termVectorObject.getObject();

      // Go through checking terms for each fieldName.
      for (String fieldName : flagConfig.contentsfields()) {
        Term term = new Term(fieldName, word);
        float globalweight = luceneUtils.getGlobalTermWeight(term);
        float fieldweight = 1;

        // Get any docs for this term.
        PostingsEnum docsEnum = this.luceneUtils.getDocsForTerm(term);

        // This may occur frequently if one term vector store is derived from multiple fields
        if (docsEnum == null) {
          continue;
        }

        while (docsEnum.nextDoc() != PostingsEnum.NO_MORE_DOCS) {
          String externalDocID = luceneUtils.getExternalDocId(docsEnum.docID());
          // Add vector from this term, taking freq into account.
          Vector docVector = this.docVectors.getVector(externalDocID);
          float localweight = docsEnum.freq();

          if (flagConfig.fieldweight()) {
            // field weight: 1/sqrt(number of terms in field)
            TermsEnum terms = luceneUtils.getTermVector(docsEnum.docID(), fieldName).iterator();
            int numTerms = 0;
            while (terms.next() != null) {
              numTerms++;
            }
            fieldweight = (float) (1 / Math.sqrt(numTerms));
          }

          docVector.superpose(termVector, localweight * globalweight * fieldweight, null);
        }
      }
    }
  } catch (IOException e) { // catches from indexReader.
    e.printStackTrace();
  }
  VerbatimLogger.info("\nNormalizing doc vectors ...\n");
  Enumeration<ObjectVector> docEnum = docVectors.getAllVectors();
  while (docEnum.hasMoreElements())
    docEnum.nextElement().getVector().normalize();
}
Example 18
Source File: VectorScoreQuery.java From solr-vector-scoring with Apache License 2.0
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
  return new CustomScoreProvider(context) {
    @Override
    public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
      float score = 0;
      double docVectorNorm = 0;
      LeafReader reader = context.reader();
      Terms terms = reader.getTermVector(docID, field);
      if (vector.size() != terms.size()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST,
            "indexed and input vector array must have same length");
      }
      TermsEnum iter = terms.iterator();
      BytesRef text;
      while ((text = iter.next()) != null) {
        String term = text.utf8ToString();
        float payloadValue = 0f;
        PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          while (freq-- > 0) postings.nextPosition();

          BytesRef payload = postings.getPayload();
          payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

          if (cosine)
            docVectorNorm += Math.pow(payloadValue, 2.0);
        }
        score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
      }
      if (cosine) {
        if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
        return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
      }
      return score;
    }
  };
}
Example 19
Source File: FrequencyCtxWindowBasedFBWorker.java From jate with GNU Lesser General Public License v3.0
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
                                                       Map<Integer, Integer> sentenceBoundaries) throws IOException {
    List<MWEInSentence> result = new ArrayList<>();

    TermsEnum tiRef = termVectorLookup.iterator();
    BytesRef luceneTerm = tiRef.next();
    while (luceneTerm != null) {
        if (luceneTerm.length == 0) {
            luceneTerm = tiRef.next();
            continue;
        }
        String tString = luceneTerm.utf8ToString();
        if (!allCandidates.contains(tString)) {
            luceneTerm = tiRef.next();
            continue;
        }

        PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
        //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

        int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
        if (doc != PostingsEnum.NO_MORE_DOCS) {
            int totalOccurrence = postingsEnum.freq();
            for (int i = 0; i < totalOccurrence; i++) {
                postingsEnum.nextPosition();
                int start = postingsEnum.startOffset();
                int end = postingsEnum.endOffset();
                BytesRef payload = postingsEnum.getPayload();
                SentenceContext sentenceContextInfo = null;
                if (payload != null) {
                    sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
                }
                if (sentenceContextInfo == null)
                    result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
                else {
                    result.add(new MWEInSentence(tString, start, end,
                            sentenceContextInfo.getFirstTokenIdx(),
                            sentenceContextInfo.getLastTokenIdx(),
                            sentenceContextInfo.getSentenceId()));

                    Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
                    if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
                        sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                                sentenceContextInfo.getLastTokenIdx());
                }
            }
        }
        luceneTerm = tiRef.next();
    }
    Collections.sort(result);
    return result;
}
Example 20
Source File: FilterableTermsEnum.java From Elasticsearch with Apache License 2.0
@Override
public boolean seekExact(BytesRef text) throws IOException {
    int docFreq = 0;
    long totalTermFreq = 0;
    for (Holder anEnum : enums) {
        if (anEnum.termsEnum.seekExact(text)) {
            if (anEnum.bits == null) {
                docFreq += anEnum.termsEnum.docFreq();
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    long leafTotalTermFreq = anEnum.termsEnum.totalTermFreq();
                    if (totalTermFreq == -1 || leafTotalTermFreq == -1) {
                        totalTermFreq = -1;
                        continue;
                    }
                    totalTermFreq += leafTotalTermFreq;
                }
            } else {
                final PostingsEnum docsEnum = anEnum.docsEnum = anEnum.termsEnum.postings(anEnum.docsEnum, docsEnumFlag);
                // 2 choices for performing same heavy loop - one attempts to calculate totalTermFreq and other does not
                if (docsEnumFlag == PostingsEnum.FREQS) {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        docFreq++;
                        // docsEnum.freq() returns 1 if doc indexed with IndexOptions.DOCS_ONLY so no way of knowing if value
                        // is really 1 or unrecorded when filtering like this
                        totalTermFreq += docsEnum.freq();
                    }
                } else {
                    for (int docId = docsEnum.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = docsEnum.nextDoc()) {
                        if (anEnum.bits != null && anEnum.bits.get(docId) == false) {
                            continue;
                        }
                        // docsEnum.freq() behaviour is undefined if docsEnumFlag==PostingsEnum.FLAG_NONE so don't bother with call
                        docFreq++;
                    }
                }
            }
        }
    }
    if (docFreq > 0) {
        currentDocFreq = docFreq;
        currentTotalTermFreq = totalTermFreq;
        current = text;
        return true;
    } else {
        currentDocFreq = NOT_FOUND;
        currentTotalTermFreq = NOT_FOUND;
        current = null;
        return false;
    }
}