org.apache.lucene.search.TermStatistics Java Examples

The following examples show how to use org.apache.lucene.search.TermStatistics. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AggregatedDfs.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
@Override
public void writeTo(final StreamOutput out) throws IOException {
    out.writeVInt(termStatistics.size());
    
    for (ObjectObjectCursor<Term, TermStatistics> c : termStatistics()) {
        Term term = (Term) c.key;
        out.writeString(term.field());
        out.writeBytesRef(term.bytes());
        TermStatistics stats = (TermStatistics) c.value;
        out.writeBytesRef(stats.term());
        out.writeVLong(stats.docFreq());
        out.writeVLong(DfsSearchResult.addOne(stats.totalTermFreq()));
    }

    DfsSearchResult.writeFieldStats(out, fieldStatistics);
    out.writeVLong(maxDoc);
}
 
Example #2
Source File: SMARTBNNBNNSimilarity.java    From lucene4ir with Apache License 2.0 6 votes vote down vote up
@Override
   public final SimWeight computeWeight(CollectionStatistics collectionStats,
				 TermStatistics... termStats)
   {
float N, n, idf, adl;
idf = 1.0f;
N   = collectionStats.maxDoc();
adl = collectionStats.sumTotalTermFreq() / N;

if (termStats.length == 1) {
    n = termStats[0].docFreq();
    idf = log(N/n);
}
else {
    for (final TermStatistics stat : termStats) {
	n = stat.docFreq();
	idf += log(N/n);
    }
}

return new TFIDFWeight(collectionStats.field(), idf, adl);
   }
 
Example #3
Source File: DfsSearchResult.java    From Elasticsearch with Apache License 2.0 6 votes vote down vote up
public static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOException {
    int termsStatsSize = in.readVInt();
    final TermStatistics[] termStatistics;
    if (termsStatsSize == 0) {
        termStatistics = EMPTY_TERM_STATS;
    } else {
        termStatistics = new TermStatistics[termsStatsSize];
        assert terms.length == termsStatsSize;
        for (int i = 0; i < termStatistics.length; i++) {
            BytesRef term = terms[i].bytes();
            final long docFreq = in.readVLong();
            assert docFreq >= 0;
            final long totalTermFreq = subOne(in.readVLong());
            termStatistics[i] = new TermStatistics(term, docFreq, totalTermFreq);
        }
    }
    return termStatistics;
}
 
Example #4
Source File: SpanWeight.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException {
  if (termStates == null || termStates.size() == 0 || query.getField() == null)
    return null;
  TermStatistics[] termStats = new TermStatistics[termStates.size()];
  int termUpTo = 0;
  for (Map.Entry<Term, TermStates> entry : termStates.entrySet()) {
    TermStates ts = entry.getValue();
    if (ts.docFreq() > 0) {
      termStats[termUpTo++] = searcher.termStatistics(entry.getKey(), ts.docFreq(), ts.totalTermFreq());
    }
  }
  CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
  if (termUpTo > 0) {
    return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
  } else {
    return null; // no terms at all exist, we won't use similarity
  }
}
 
Example #5
Source File: NormValueSource.java    From lucene-solr with Apache License 2.0 6 votes vote down vote up
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
  IndexSearcher searcher = (IndexSearcher)context.get("searcher");
  final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(), field);
  if (similarity == null) {
    throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
  }
  // Only works if the contribution of the tf is 1 when the freq is 1 and contribution of the idf
  // is 1 when docCount == docFreq == 1
  final SimScorer simScorer = similarity.scorer(1f,
      new CollectionStatistics(field, 1, 1, 1, 1),
      new TermStatistics(new BytesRef("bogus"), 1, 1));
  final LeafSimScorer leafSimScorer = new LeafSimScorer(simScorer, readerContext.reader(), field, true);
  
  return new FloatDocValues(this) {
    int lastDocID = -1;
    @Override
    public float floatVal(int docID) throws IOException {
      if (docID < lastDocID) {
        throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID);
      }
      lastDocID = docID;
      return leafSimScorer.score(docID, 1f);
    }
  };
}
 
Example #6
Source File: AssertingSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  assert boost >= 0;
  assert collectionStats != null;
  assert termStats.length > 0;
  for (TermStatistics term : termStats) {
    assert term != null;
  }
  // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc
  SimScorer scorer = delegate.scorer(boost, collectionStats, termStats);
  assert scorer != null;
  return new AssertingSimScorer(scorer, boost);
}
 
Example #7
Source File: TestMaxTermFrequency.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  return new SimScorer() {

    @Override
    public float score(float freq, long norm) {
      return 0;
    }

  };
}
 
Example #8
Source File: ClassicSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long docCount = collectionStats.docCount();
  final float idf = idf(df, docCount);
  return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      Explanation.match(df, "docFreq, number of documents containing term"),
      Explanation.match(docCount, "docCount, total number of documents with field"));
}
 
Example #9
Source File: LMSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/**
 * Computes the collection probability of the current term in addition to the
 * usual statistics.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  super.fillBasicStats(stats, collectionStats, termStats);
  LMStats lmStats = (LMStats) stats;
  lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
}
 
Example #10
Source File: MultiSimilarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  SimScorer subScorers[] = new SimScorer[sims.length];
  for (int i = 0; i < subScorers.length; i++) {
    subScorers[i] = sims[i].scorer(boost, collectionStats, termStats);
  }
  return new MultiSimScorer(subScorers);
}
 
Example #11
Source File: BM25Similarity.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);
  float avgdl = avgFieldLength(collectionStats);

  float[] cache = new float[256];
  for (int i = 0; i < cache.length; i++) {
    cache[i] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl));
  }
  return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
}
 
Example #12
Source File: SimilarityBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
/** Fills all member fields defined in {@code BasicStats} in {@code stats}. 
 *  Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  // TODO: validate this for real, somewhere else
  assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
  assert termStats.docFreq() <= collectionStats.sumDocFreq();
 
  // TODO: add sumDocFreq for field (numberOfFieldPostings)
  stats.setNumberOfDocuments(collectionStats.docCount());
  stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq());
  stats.setAvgFieldLength(collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
  stats.setDocFreq(termStats.docFreq());
  stats.setTotalTermFreq(termStats.totalTermFreq());
}
 
Example #13
Source File: SimilarityBase.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  SimScorer weights[] = new SimScorer[termStats.length];
  for (int i = 0; i < termStats.length; i++) {
    BasicStats stats = newStats(collectionStats.field(), boost);
    fillBasicStats(stats, collectionStats, termStats[i]);
    weights[i] = new BasicSimScorer(stats);
  }
  if (weights.length == 1) {
    return weights[0];
  } else {
    return new MultiSimilarity.MultiSimScorer(weights);
  }
}
 
Example #14
Source File: LRUStatsCache.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
    throws IOException {
  TermStats termStats = currentGlobalTermStats.get(term.toString());
  if (termStats == null) {
    log.debug("## Missing global termStats info: {}, using local", term);
    missingTermStats.add(term);
    metrics.missingGlobalTermStats.increment();
    return localSearcher != null ? localSearcher.localTermStatistics(term, docFreq, totalTermFreq) : null;
  } else {
    return termStats.toTermStatistics();
  }
}
 
Example #15
Source File: ExactStatsCache.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
    throws IOException {
  TermStats termStats = termStatsCache.get(term.toString());
  // TermStats == null is also true if term has no docFreq anyway,
  // see returnLocalStats, if docFreq == 0, they are not added anyway
  // Not sure we need a warning here
  if (termStats == null) {
    log.debug("Missing global termStats info for term={}, using local stats", term);
    metrics.missingGlobalTermStats.increment();
    return localSearcher != null ? localSearcher.localTermStatistics(term, docFreq, totalTermFreq) : null;
  } else {
    return termStats.toTermStatistics();
  }
}
 
Example #16
Source File: TestMemoryIndex.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Test
public void testSimilarities() throws IOException {

  MemoryIndex mi = new MemoryIndex();
  mi.addField("f1", "a long text field that contains many many terms", analyzer);

  IndexSearcher searcher = mi.createSearcher();
  LeafReader reader = (LeafReader) searcher.getIndexReader();
  NumericDocValues norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n1 = norms.longValue();

  // Norms are re-computed when we change the Similarity
  mi.setSimilarity(new Similarity() {

    @Override
    public long computeNorm(FieldInvertState state) {
      return 74;
    }

    @Override
    public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
      throw new UnsupportedOperationException();
    }

  });
  norms = reader.getNormValues("f1");
  assertEquals(0, norms.nextDoc());
  float n2 = norms.longValue();

  assertTrue(n1 != n2);
  TestUtil.checkReader(reader);
}
 
Example #17
Source File: AggregatedDfs.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
@Override
public void readFrom(StreamInput in) throws IOException {
    int size = in.readVInt();
    termStatistics = HppcMaps.newMap(size);
    for (int i = 0; i < size; i++) {
        Term term = new Term(in.readString(), in.readBytesRef());
        TermStatistics stats = new TermStatistics(in.readBytesRef(), 
                in.readVLong(), 
                DfsSearchResult.subOne(in.readVLong()));
        termStatistics.put(term, stats);
    }
    fieldStatistics = DfsSearchResult.readFieldStats(in);
    maxDoc = in.readVLong();
}
 
Example #18
Source File: LindenSimilarity.java    From linden with Apache License 2.0 5 votes vote down vote up
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long max = collectionStats.maxDoc();
  final float idf = idfManager.getIDF(termStats.term().utf8ToString());
  return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
 
Example #19
Source File: TermVectorsWriter.java    From Elasticsearch with Apache License 2.0 5 votes vote down vote up
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
    int docFreq = (int) termStatistics.docFreq();
    assert (docFreq >= -1);
    writePotentiallyNegativeVInt(docFreq);
    long ttf = termStatistics.totalTermFreq();
    assert (ttf >= -1);
    writePotentiallyNegativeVLong(ttf);
}
 
Example #20
Source File: StatsCache.java    From lucene-solr with Apache License 2.0 5 votes vote down vote up
@Override
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
  if (statsSource.termStatistics(null, term, docFreq, totalTermFreq) == null) {
    missingTermStats.accept(term);
    missingTermsCount++;
  }
  return super.termStatistics(term, docFreq, totalTermFreq);
}
 
Example #21
Source File: OKAPIBM25Similarity.java    From lucene4ir with Apache License 2.0 5 votes vote down vote up
@Override
   public final SimWeight computeWeight(CollectionStatistics collectionStats,
				 TermStatistics... termStats)
   {
long  N, n;
float idf_, avdl;

idf_ = 1.0f;

N    = collectionStats.docCount();
if (N == -1)
    N = collectionStats.maxDoc();

avdl = collectionStats.sumTotalTermFreq() / N;

if (termStats.length == 1) {
    n    = termStats[0].docFreq();
    idf_ = idf(n, N);
}
else { /* computation for a phrase */
    for (final TermStatistics stat : termStats) {
	n     = stat.docFreq();
	idf_ += idf(n, N);
    }
}

return new TFIDFWeight(collectionStats.field(), idf_, avdl);
   }
 
Example #22
Source File: BM25Similarity.java    From lucene4ir with Apache License 2.0 5 votes vote down vote up
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats);

  float avgdl = avgFieldLength(collectionStats);

  // compute freq-independent part of bm25 equation across all norm values
  float cache[] = new float[256];
  for (int i = 0; i < cache.length; i++) {
    cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl);
  }
  return new BM25Stats(collectionStats.field(), idf, avgdl, cache);
}
 
Example #23
Source File: TFSimilarity.java    From lumongo with Apache License 2.0 4 votes vote down vote up
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
	return new TFSimilarity.BooleanWeight(boost);
}
 
Example #24
Source File: StatsSource.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
public abstract TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
throws IOException;
 
Example #25
Source File: TestFieldInvertState.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  throw new UnsupportedOperationException();
}
 
Example #26
Source File: TestNorms.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  throw new UnsupportedOperationException();
}
 
Example #27
Source File: TestUniqueTermCount.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  throw new UnsupportedOperationException();
}
 
Example #28
Source File: TestOmitTf.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
  return Explanation.match(1.0f, "Inexplicable");
}
 
Example #29
Source File: TestIndexSorting.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  return in.scorer(boost, collectionStats, termStats);
}
 
Example #30
Source File: LocalStatsSource.java    From lucene-solr with Apache License 2.0 4 votes vote down vote up
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq)
    throws IOException {
  metrics.missingGlobalTermStats.increment();
  return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
}