Java Code Examples for org.apache.lucene.index.TermsEnum#postings()
The following examples show how to use org.apache.lucene.index.TermsEnum#postings().
Each example names the project and source file it comes from, so you can follow those references back to the original code.
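All of the examples share the same basic call pattern: obtain a TermsEnum from a field's Terms, ask it for a PostingsEnum with only the flags you need, and iterate the returned doc IDs. The minimal sketch below illustrates that pattern; the leafReader, the field name "body", and the handle(...) callback are placeholders for illustration, not part of any example on this page.

Terms terms = leafReader.terms("body");                  // field name is illustrative
if (terms != null) {
  TermsEnum termsEnum = terms.iterator();
  PostingsEnum postings = null;                          // reused across terms to avoid allocations
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // Request only the data you need: NONE, FREQS, POSITIONS, OFFSETS, PAYLOADS or ALL
    postings = termsEnum.postings(postings, PostingsEnum.FREQS);
    int doc;
    while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      handle(term.utf8ToString(), doc, postings.freq()); // placeholder callback
    }
  }
}

Passing a previously returned PostingsEnum back into postings() lets Lucene reuse it, which is why many of the examples below keep a postingsEnum variable alive outside the term loop.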
Example 1
Source File: LukeRequestHandler.java From lucene-solr with Apache License 2.0 | 6 votes |
private static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  PostingsEnum postingsEnum = null;
  TermsEnum termsEnum = terms.iterator();
  BytesRef text;
  // Deal with the chance that the first bunch of terms are in deleted documents. Is there a better way?
  for (int idx = 0; idx < 1000 && postingsEnum == null; ++idx) {
    text = termsEnum.next();
    if (text == null) {
      // Ran off the end of the terms enum without finding any live docs with that field in them.
      return null;
    }
    postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
    final Bits liveDocs = reader.getLiveDocs();
    if (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (liveDocs != null && liveDocs.get(postingsEnum.docID())) {
        continue;
      }
      return reader.document(postingsEnum.docID());
    }
  }
  return null;
}
Example 2
Source File: CodecCollector.java From mtas with Apache License 2.0 | 6 votes |
/**
 * Collect collection.
 *
 * @param reader the reader
 * @param docSet the doc set
 * @param collectionInfo the collection info
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static void collectCollection(IndexReader reader, List<Integer> docSet,
    ComponentCollection collectionInfo) throws IOException {
  if (collectionInfo.action().equals(ComponentCollection.ACTION_CHECK)) {
    // can't do anything in lucene for check
  } else if (collectionInfo.action().equals(ComponentCollection.ACTION_LIST)) {
    // can't do anything in lucene for list
  } else if (collectionInfo.action().equals(ComponentCollection.ACTION_CREATE)) {
    BytesRef term = null;
    PostingsEnum postingsEnum = null;
    Integer docId;
    Integer termDocId = -1;
    Terms terms;
    LeafReaderContext lrc;
    LeafReader r;
    ListIterator<LeafReaderContext> iterator = reader.leaves().listIterator();
    while (iterator.hasNext()) {
      lrc = iterator.next();
      r = lrc.reader();
      for (String field : collectionInfo.fields()) {
        if ((terms = r.terms(field)) != null) {
          TermsEnum termsEnum = terms.iterator();
          while ((term = termsEnum.next()) != null) {
            Iterator<Integer> docIterator = docSet.iterator();
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            termDocId = -1;
            while (docIterator.hasNext()) {
              docId = docIterator.next() - lrc.docBase;
              if ((docId >= termDocId) && ((docId.equals(termDocId))
                  || ((termDocId = postingsEnum.advance(docId)).equals(docId)))) {
                collectionInfo.addValue(term.utf8ToString());
                break;
              }
              if (termDocId.equals(PostingsEnum.NO_MORE_DOCS)) {
                break;
              }
            }
          }
        }
      }
    }
  }
}
Example 3
Source File: TermVectorsResponse.java From Elasticsearch with Apache License 2.0 | 6 votes |
private void buildTerm(XContentBuilder builder, final CharsRefBuilder spare, Terms curTerms,
    TermsEnum termIter, BoostAttribute boostAtt) throws IOException {
  // start term, optimized writing
  BytesRef term = termIter.next();
  spare.copyUTF8Bytes(term);
  builder.startObject(spare.toString());
  buildTermStatistics(builder, termIter);
  // finally write the term vectors
  PostingsEnum posEnum = termIter.postings(null, PostingsEnum.ALL);
  int termFreq = posEnum.freq();
  builder.field(FieldStrings.TERM_FREQ, termFreq);
  initMemory(curTerms, termFreq);
  initValues(curTerms, posEnum, termFreq);
  buildValues(builder, curTerms, termFreq);
  buildScore(builder, boostAtt);
  builder.endObject();
}
Example 4
Source File: TestUtil.java From lucene-solr with Apache License 2.0 | 6 votes |
public static PostingsEnum docs(Random random, TermsEnum termsEnum, PostingsEnum reuse, int flags) throws IOException {
  // TODO: simplify this method? it would be easier to randomly either use the flags passed,
  // or do the random selection; FREQS should be part of the random selection instead of outside on its own?
  if (random.nextBoolean()) {
    if (random.nextBoolean()) {
      final int posFlags;
      switch (random.nextInt(4)) {
        case 0: posFlags = PostingsEnum.POSITIONS; break;
        case 1: posFlags = PostingsEnum.OFFSETS; break;
        case 2: posFlags = PostingsEnum.PAYLOADS; break;
        default: posFlags = PostingsEnum.ALL; break;
      }
      return termsEnum.postings(null, posFlags);
    }
    flags |= PostingsEnum.FREQS;
  }
  return termsEnum.postings(reuse, flags);
}
Example 5
Source File: ShardSplittingQuery.java From crate with Apache License 2.0 | 6 votes |
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard,
    LeafReader leafReader, IntConsumer consumer) throws IOException {
  Terms terms = leafReader.terms(idField);
  TermsEnum iterator = terms.iterator();
  BytesRef idTerm;
  PostingsEnum postingsEnum = null;
  while ((idTerm = iterator.next()) != null) {
    if (includeInShard.test(idTerm) == false) {
      postingsEnum = iterator.postings(postingsEnum);
      int doc;
      while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        consumer.accept(doc);
      }
    }
  }
}
Example 6
Source File: TermsIncludingScoreQuery.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        // I prefer this:
        /*if (scores[doc] < score) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }*/
        // But this behaves the same as MVInnerScorer and only then the tests will pass:
        if (!matchingDocs.get(doc)) {
          scores[doc] = score;
          matchingDocs.set(doc);
        }
      }
    }
  }
}
Example 7
Source File: TermsIncludingScoreQuery.java From lucene-solr with Apache License 2.0 | 6 votes |
protected void fillDocsAndScores(FixedBitSet matchingDocs, TermsEnum termsEnum) throws IOException {
  BytesRef spare = new BytesRef();
  PostingsEnum postingsEnum = null;
  for (int i = 0; i < terms.size(); i++) {
    if (termsEnum.seekExact(terms.get(ords[i], spare))) {
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      float score = TermsIncludingScoreQuery.this.scores[ords[i]];
      for (int doc = postingsEnum.nextDoc(); doc != DocIdSetIterator.NO_MORE_DOCS; doc = postingsEnum.nextDoc()) {
        matchingDocs.set(doc);
        // In the case the same doc is also related to another doc, a score might be overwritten.
        // I think this can only happen in a many-to-many relation
        scores[doc] = score;
      }
    }
  }
}
Example 8
Source File: CodecCollector.java From mtas with Apache License 2.0 | 5 votes |
/**
 * Compute termvector number basic.
 *
 * @param docSet the doc set
 * @param termDocId the term doc id
 * @param termsEnum the terms enum
 * @param r the r
 * @param lrc the lrc
 * @param postingsEnum the postings enum
 * @return the termvector number basic
 * @throws IOException Signals that an I/O exception has occurred.
 */
private static TermvectorNumberBasic computeTermvectorNumberBasic(
    List<Integer> docSet, int termDocId, TermsEnum termsEnum, LeafReader r,
    LeafReaderContext lrc, PostingsEnum postingsEnum) throws IOException {
  TermvectorNumberBasic result = new TermvectorNumberBasic();
  boolean hasDeletedDocuments = (r.getLiveDocs() != null);
  if ((docSet.size() == r.numDocs()) && !hasDeletedDocuments) {
    try {
      return computeTermvectorNumberBasic(termsEnum, r);
    } catch (IOException e) {
      log.debug("problem", e);
      // problem
    }
  }
  result.docNumber = 0;
  result.valueSum[0] = 0;
  int localTermDocId = termDocId;
  Iterator<Integer> docIterator = docSet.iterator();
  postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.FREQS);
  int docId;
  while (docIterator.hasNext()) {
    docId = docIterator.next() - lrc.docBase;
    if (docId >= localTermDocId && ((docId == localTermDocId)
        || ((localTermDocId = postingsEnum.advance(docId)) == docId))) {
      result.docNumber++;
      result.valueSum[0] += postingsEnum.freq();
    }
    if (localTermDocId == DocIdSetIterator.NO_MORE_DOCS) {
      break;
    }
  }
  return result;
}
Example 9
Source File: FrequencyCtxSentenceBasedFBWorker.java From jate with GNU Lesser General Public License v3.0 | 5 votes |
private List<MWESentenceContext> collectTermOffsets(Terms termVectorLookup) throws IOException {
  List<MWESentenceContext> result = new ArrayList<>();

  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }

    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

    int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        int sentenceId = -1;
        if (payload != null) {
          sentenceId = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString())).getSentenceId();
        }
        result.add(new MWESentenceContext(tString, sentenceId, start, end));
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}
Example 10
Source File: DocSetBuilder.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Returns the number of terms visited */
public int add(TermsEnum te, int base) throws IOException {
  PostingsEnum postings = null;
  int termCount = 0;
  for (;;) {
    BytesRef term = te.next();
    if (term == null) break;
    termCount++;
    postings = te.postings(postings, PostingsEnum.NONE);
    add(postings, base);
  }
  return termCount;
}
Example 11
Source File: FieldOffsetStrategy.java From lucene-solr with Apache License 2.0 | 5 votes |
protected void createOffsetsEnumsForTerms(BytesRef[] sourceTerms, Terms termsIndex, int doc,
    List<OffsetsEnum> results) throws IOException {
  TermsEnum termsEnum = termsIndex.iterator(); //does not return null
  for (BytesRef term : sourceTerms) {
    if (termsEnum.seekExact(term)) {
      PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.OFFSETS);
      if (postingsEnum == null) {
        // no offsets or positions available
        throw new IllegalArgumentException("field '" + getField() + "' was indexed without offsets, cannot highlight");
      }
      if (doc == postingsEnum.advance(doc)) { // now it's positioned, although may be exhausted
        results.add(new OffsetsEnum.OfPostings(term, postingsEnum));
      }
    }
  }
}
Example 12
Source File: AlfrescoLukeRequestHandler.java From SearchServices with GNU Lesser General Public License v3.0 | 5 votes |
protected static Document getFirstLiveDoc(Terms terms, LeafReader reader) throws IOException {
  TermsEnum termsEnum = terms.iterator();
  if (termsEnum.next() == null) {
    // Ran off the end of the terms enum without finding any live docs with that field in them.
    return null;
  }
  PostingsEnum postingsEnum = termsEnum.postings(null, PostingsEnum.NONE);
  final Bits liveDocs = reader.getLiveDocs();
  if (postingsEnum.nextDoc() == DocIdSetIterator.NO_MORE_DOCS
      || (liveDocs != null && liveDocs.get(postingsEnum.docID()))) {
    return null;
  }
  return reader.document(postingsEnum.docID());
}
Example 13
Source File: FeatureDoubleValuesSource.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public DoubleValues getValues(LeafReaderContext ctx, DoubleValues scores) throws IOException {
  Terms terms = ctx.reader().terms(field);
  if (terms == null) {
    return DoubleValues.EMPTY;
  } else {
    TermsEnum termsEnum = terms.iterator();
    if (termsEnum.seekExact(featureName) == false) {
      return DoubleValues.EMPTY;
    } else {
      PostingsEnum currentReaderPostingsValues = termsEnum.postings(null, PostingsEnum.FREQS);
      return new FeatureDoubleValues(currentReaderPostingsValues);
    }
  }
}
Example 14
Source File: FieldCacheImpl.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final int termCountHardLimit = maxDoc;

  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);

  int startBPV;

  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }

  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);

  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());

  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }

      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }

  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }

    @Override
    public int length() {
      return maxDoc;
    }
  };

  wrapper.setDocsWithField(reader, key.field, docsWithField, null);

  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}
Example 15
Source File: IGainTermsQParserPlugin.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
public void finish() throws IOException {
  NamedList<Double> analytics = new NamedList<Double>();
  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> topFreq = new NamedList();

  @SuppressWarnings({"unchecked", "rawtypes"})
  NamedList<Integer> allFreq = new NamedList();

  rb.rsp.add("featuredTerms", analytics);
  rb.rsp.add("docFreq", topFreq);
  rb.rsp.add("numDocs", count);

  TreeSet<TermWithScore> topTerms = new TreeSet<>();

  double numDocs = count;
  double pc = numPositiveDocs / numDocs;
  double entropyC = binaryEntropy(pc);

  Terms terms = ((SolrIndexSearcher) searcher).getSlowAtomicReader().terms(field);
  TermsEnum termsEnum = terms == null ? TermsEnum.EMPTY : terms.iterator();
  BytesRef term;
  PostingsEnum postingsEnum = null;
  while ((term = termsEnum.next()) != null) {
    postingsEnum = termsEnum.postings(postingsEnum);
    int xc = 0;
    int nc = 0;
    while (postingsEnum.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
      if (positiveSet.get(postingsEnum.docID())) {
        xc++;
      } else if (negativeSet.get(postingsEnum.docID())) {
        nc++;
      }
    }

    int docFreq = xc + nc;

    double entropyContainsTerm = binaryEntropy((double) xc / docFreq);
    double entropyNotContainsTerm = binaryEntropy((double) (numPositiveDocs - xc) / (numDocs - docFreq + 1));
    double score = entropyC - ((docFreq / numDocs) * entropyContainsTerm
        + (1.0 - docFreq / numDocs) * entropyNotContainsTerm);

    topFreq.add(term.utf8ToString(), docFreq);
    if (topTerms.size() < numTerms) {
      topTerms.add(new TermWithScore(term.utf8ToString(), score));
    } else {
      if (topTerms.first().score < score) {
        topTerms.pollFirst();
        topTerms.add(new TermWithScore(term.utf8ToString(), score));
      }
    }
  }

  for (TermWithScore topTerm : topTerms) {
    analytics.add(topTerm.term, topTerm.score);
    topFreq.add(topTerm.term, allFreq.get(topTerm.term));
  }

  if (this.delegate instanceof DelegatingCollector) {
    ((DelegatingCollector) this.delegate).finish();
  }
}
Example 16
Source File: TermVectorComponent.java From lucene-solr with Apache License 2.0 | 4 votes |
private void mapOneVector(NamedList<Object> docNL, FieldOptions fieldOptions, IndexReader reader,
    int docID, TermsEnum termsEnum, String field) throws IOException {
  NamedList<Object> fieldNL = new NamedList<>();
  docNL.add(field, fieldNL);

  BytesRef text;
  PostingsEnum dpEnum = null;
  while ((text = termsEnum.next()) != null) {
    String term = text.utf8ToString();
    NamedList<Object> termInfo = new NamedList<>();
    fieldNL.add(term, termInfo);
    final int freq = (int) termsEnum.totalTermFreq();
    if (fieldOptions.termFreq == true) {
      termInfo.add("tf", freq);
    }

    int dpEnumFlags = 0;
    dpEnumFlags |= fieldOptions.positions ? PostingsEnum.POSITIONS : 0;
    //payloads require offsets
    dpEnumFlags |= (fieldOptions.offsets || fieldOptions.payloads) ? PostingsEnum.OFFSETS : 0;
    dpEnumFlags |= fieldOptions.payloads ? PostingsEnum.PAYLOADS : 0;
    dpEnum = termsEnum.postings(dpEnum, dpEnumFlags);

    boolean atNextDoc = false;
    if (dpEnum != null) {
      dpEnum.nextDoc();
      atNextDoc = true;
    }

    if (atNextDoc && dpEnumFlags != 0) {
      NamedList<Integer> positionsNL = null;
      NamedList<Number> theOffsets = null;
      NamedList<String> thePayloads = null;

      for (int i = 0; i < freq; i++) {
        final int pos = dpEnum.nextPosition();
        if (fieldOptions.positions && pos >= 0) {
          if (positionsNL == null) {
            positionsNL = new NamedList<>();
            termInfo.add("positions", positionsNL);
          }
          positionsNL.add("position", pos);
        }

        int startOffset = fieldOptions.offsets ? dpEnum.startOffset() : -1;
        if (startOffset >= 0) {
          if (theOffsets == null) {
            theOffsets = new NamedList<>();
            termInfo.add("offsets", theOffsets);
          }
          theOffsets.add("start", dpEnum.startOffset());
          theOffsets.add("end", dpEnum.endOffset());
        }

        BytesRef payload = fieldOptions.payloads ? dpEnum.getPayload() : null;
        if (payload != null) {
          if (thePayloads == null) {
            thePayloads = new NamedList<>();
            termInfo.add("payloads", thePayloads);
          }
          thePayloads.add("payload", Base64.byteArrayToBase64(payload.bytes, payload.offset, payload.length));
        }
      }
    }

    int df = 0;
    if (fieldOptions.docFreq || fieldOptions.tfIdf) {
      df = reader.docFreq(new Term(field, text));
    }

    if (fieldOptions.docFreq) {
      termInfo.add("df", df);
    }

    // TODO: this is not TF/IDF by anyone's definition!
    if (fieldOptions.tfIdf) {
      double tfIdfVal = ((double) freq) / df;
      termInfo.add("tf-idf", tfIdfVal);
    }
  }
}
Example 17
Source File: DirectoryTaxonomyWriter.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Look up the given category in the cache and/or the on-disk storage,
 * returning the category's ordinal, or a negative number in case the
 * category does not yet exist in the taxonomy.
 */
protected synchronized int findCategory(FacetLabel categoryPath) throws IOException {
  // If we can find the category in the cache, or we know the cache is
  // complete, we can return the response directly from it
  int res = cache.get(categoryPath);
  if (res >= 0 || cacheIsComplete) {
    return res;
  }

  cacheMisses.incrementAndGet();
  // After a few cache misses, it makes sense to read all the categories
  // from disk and into the cache. The reason not to do this on the first
  // cache miss (or even when opening the writer) is that it will
  // significantly slow down the case when a taxonomy is opened just to
  // add one category. The idea of only spending a long time on reading
  // after enough time was spent on cache misses is known as an "online
  // algorithm".
  perhapsFillCache();
  res = cache.get(categoryPath);
  if (res >= 0 || cacheIsComplete) {
    // if after filling the cache from the info on disk, the category is in it
    // or the cache is complete, return whatever cache.get returned.
    return res;
  }

  // if we get here, it means the category is not in the cache, and it is not
  // complete, and therefore we must look for the category on disk.

  // We need to get an answer from the on-disk index.
  initReaderManager();

  int doc = -1;
  DirectoryReader reader = readerManager.acquire();
  try {
    final BytesRef catTerm = new BytesRef(FacetsConfig.pathToString(categoryPath.components, categoryPath.length));
    PostingsEnum docs = null; // reuse
    for (LeafReaderContext ctx : reader.leaves()) {
      Terms terms = ctx.reader().terms(Consts.FULL);
      if (terms != null) {
        // TODO: share per-segment TermsEnum here!
        TermsEnum termsEnum = terms.iterator();
        if (termsEnum.seekExact(catTerm)) {
          // liveDocs=null because the taxonomy has no deletes
          docs = termsEnum.postings(docs, 0 /* freqs not required */);
          // if the term was found, we know it has exactly one document.
          doc = docs.nextDoc() + ctx.docBase;
          break;
        }
      }
    }
  } finally {
    readerManager.release(reader);
  }
  if (doc > 0) {
    addToCache(categoryPath, doc);
  }
  return doc;
}
Example 18
Source File: FrequencyCtxWindowBasedFBWorker.java From jate with GNU Lesser General Public License v3.0 | 4 votes |
private List<MWEInSentence> collectTermSentenceContext(Terms termVectorLookup,
    Map<Integer, Integer> sentenceBoundaries) throws IOException {
  List<MWEInSentence> result = new ArrayList<>();

  TermsEnum tiRef = termVectorLookup.iterator();
  BytesRef luceneTerm = tiRef.next();
  while (luceneTerm != null) {
    if (luceneTerm.length == 0) {
      luceneTerm = tiRef.next();
      continue;
    }
    String tString = luceneTerm.utf8ToString();
    if (!allCandidates.contains(tString)) {
      luceneTerm = tiRef.next();
      continue;
    }

    PostingsEnum postingsEnum = tiRef.postings(null, PostingsEnum.ALL);
    //PostingsEnum postingsEnum = ti.postings(null, PostingsEnum.OFFSETS);

    int doc = postingsEnum.nextDoc(); //this should be just 1 doc, i.e., the constraint for getting this TV
    if (doc != PostingsEnum.NO_MORE_DOCS) {
      int totalOccurrence = postingsEnum.freq();
      for (int i = 0; i < totalOccurrence; i++) {
        postingsEnum.nextPosition();
        int start = postingsEnum.startOffset();
        int end = postingsEnum.endOffset();
        BytesRef payload = postingsEnum.getPayload();
        SentenceContext sentenceContextInfo = null;
        if (payload != null) {
          sentenceContextInfo = new SentenceContext(MWEMetadata.deserialize(payload.utf8ToString()));
        }
        if (sentenceContextInfo == null)
          result.add(new MWEInSentence(tString, start, end, 0, 0, 0));
        else {
          result.add(new MWEInSentence(tString, start, end,
              sentenceContextInfo.getFirstTokenIdx(),
              sentenceContextInfo.getLastTokenIdx(),
              sentenceContextInfo.getSentenceId()));

          Integer endBound = sentenceBoundaries.get(sentenceContextInfo.getSentenceId());
          if (endBound == null || endBound < sentenceContextInfo.getLastTokenIdx())
            sentenceBoundaries.put(sentenceContextInfo.getSentenceId(),
                sentenceContextInfo.getLastTokenIdx());
        }
      }
    }
    luceneTerm = tiRef.next();
  }
  Collections.sort(result);
  return result;
}
Example 19
Source File: FieldCacheImpl.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);

  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();

  final PagedBytes bytes = new PagedBytes(15);

  int startTermsBPV;

  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }

  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);

  int termOrd = 0;

  // TODO: use Uninvert?
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;

    while (true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }

      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1 + termOrd);
      }
      termOrd++;
    }
  }

  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
Example 20
Source File: VectorScoreQuery.java From solr-vector-scoring with Apache License 2.0 | 4 votes |
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
  return new CustomScoreProvider(context) {
    @Override
    public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
      float score = 0;
      double docVectorNorm = 0;
      LeafReader reader = context.reader();
      Terms terms = reader.getTermVector(docID, field);
      if (vector.size() != terms.size()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
      }
      TermsEnum iter = terms.iterator();
      BytesRef text;
      while ((text = iter.next()) != null) {
        String term = text.utf8ToString();
        float payloadValue = 0f;
        PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          while (freq-- > 0) postings.nextPosition();

          BytesRef payload = postings.getPayload();
          payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);

          if (cosine)
            docVectorNorm += Math.pow(payloadValue, 2.0);
        }

        score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
      }

      if (cosine) {
        if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
        return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
      }

      return score;
    }
  };
}