org.apache.lucene.search.CollectionStatistics#docCount

Source File: TermVectorsWriter.java From Elasticsearch with Apache License 2.0

5 votes

private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    long sttf = fieldStats.sumTotalTermFreq();
    assert (sttf >= -1);
    writePotentiallyNegativeVLong(sttf);
    long sdf = fieldStats.sumDocFreq();
    assert (sdf >= -1);
    writePotentiallyNegativeVLong(sdf);
    int dc = (int) fieldStats.docCount();
    assert (dc >= -1);
    writePotentiallyNegativeVInt(dc);
}

Source File: ClassicSimilarity.java From lucene-solr with Apache License 2.0

5 votes

@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long docCount = collectionStats.docCount();
  final float idf = idf(df, docCount);
  return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      Explanation.match(df, "docFreq, number of documents containing term"),
      Explanation.match(docCount, "docCount, total number of documents with field"));
}

Source File: CollectionStats.java From lucene-solr with Apache License 2.0

5 votes

public CollectionStats(CollectionStatistics stats) {
  this.field = stats.field();
  this.maxDoc = stats.maxDoc();
  this.docCount = stats.docCount();
  this.sumTotalTermFreq = stats.sumTotalTermFreq();
  this.sumDocFreq = stats.sumDocFreq();
}

Source File: BM25Similarity.java From lucene4ir with Apache License 2.0

5 votes

/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
 * or returns <code>1</code> if the index does not store sumTotalTermFreq:
 * any field that omits frequency information). */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
  if (sumTotalTermFreq <= 0) {
    return 1f;       // field does not exist, or stat is unsupported
  } else {
    final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
    return (float) (sumTotalTermFreq / (double) docCount);
  }
}

Source File: OKAPIBM25Similarity.java From lucene4ir with Apache License 2.0

5 votes

@Override
   public final SimWeight computeWeight(CollectionStatistics collectionStats,
				 TermStatistics... termStats)
   {
long  N, n;
float idf_, avdl;

idf_ = 1.0f;

N    = collectionStats.docCount();
if (N == -1)
    N = collectionStats.maxDoc();

avdl = collectionStats.sumTotalTermFreq() / N;

if (termStats.length == 1) {
    n    = termStats[0].docFreq();
    idf_ = idf(n, N);
}
else { /* computation for a phrase */
    for (final TermStatistics stat : termStats) {
	n     = stat.docFreq();
	idf_ += idf(n, N);
    }
}

return new TFIDFWeight(collectionStats.field(), idf_, avdl);
   }

Source File: DumpTermsApp.java From lucene4ir with Apache License 2.0

5 votes

public void reportCollectionStatistics()throws IOException {

        IndexSearcher searcher = new IndexSearcher(reader);

        CollectionStatistics collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_ALL);
        long token_count = collectionStats.sumTotalTermFreq();
        long doc_count = collectionStats.docCount();
        long sum_doc_count = collectionStats.sumDocFreq();
        long avg_doc_length = token_count / doc_count;

        System.out.println("ALL: Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

        collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_TITLE);
        token_count = collectionStats.sumTotalTermFreq();
        doc_count = collectionStats.docCount();
        sum_doc_count = collectionStats.sumDocFreq();
        avg_doc_length = token_count / doc_count;

        System.out.println("TITLE: Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);


        collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
        token_count = collectionStats.sumTotalTermFreq();
        doc_count = collectionStats.docCount();
        sum_doc_count = collectionStats.sumDocFreq();
        avg_doc_length = token_count / doc_count;

        System.out.println("CONTENT: Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    }

Source File: ExampleStatsApp.java From lucene4ir with Apache License 2.0

5 votes

public void reportCollectionStatistics()throws IOException {

        IndexSearcher searcher = new IndexSearcher(reader);

        CollectionStatistics collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_ALL);
        long token_count = collectionStats.sumTotalTermFreq();
        long doc_count = collectionStats.docCount();
        long sum_doc_count = collectionStats.sumDocFreq();
        long avg_doc_length = token_count / doc_count;

        System.out.println("ALL: Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

        collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_TITLE);
        token_count = collectionStats.sumTotalTermFreq();
        doc_count = collectionStats.docCount();
        sum_doc_count = collectionStats.sumDocFreq();
        avg_doc_length = token_count / doc_count;

        System.out.println("TITLE: Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);


        collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_CONTENT);
        token_count = collectionStats.sumTotalTermFreq();
        doc_count = collectionStats.docCount();
        sum_doc_count = collectionStats.sumDocFreq();
        avg_doc_length = token_count / doc_count;

        System.out.println("CONTENT: Token count: " + token_count+ " Doc Count: " + doc_count + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    }

Source File: BaseSimilarityTestCase.java From lucene-solr with Apache License 2.0

4 votes

/**
 * returns new random term, that fits within the bounds of the corpus
 */
static TermStatistics newTerm(Random random, CollectionStatistics corpus) {
  final long docFreq;
  switch (random.nextInt(3)) {
    case 0:
      // rare term
      docFreq = 1;
      break;
    case 1:
      // common term
      docFreq = corpus.docCount();
      break;
    default:
      // random specificity
      docFreq = TestUtil.nextLong(random, 1, corpus.docCount());
      break;
  }
  final long totalTermFreq;
  // can't require docs to have > 2B tokens
  long upperBound;
  try {
    upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE));
  } catch (ArithmeticException overflow) {
    upperBound = corpus.sumTotalTermFreq();
  }
  if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) {
    // omitTF
    totalTermFreq = docFreq;
  } else {
    switch (random.nextInt(3)) {
      case 0:
        // no repetition
        totalTermFreq = docFreq;
        break;
      case 1:
        // maximum repetition
        totalTermFreq = upperBound;
        break;
      default:
        // random repetition
        totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound);
        break;
    }
  }
  return new TermStatistics(TERM, docFreq, totalTermFreq);
}

Source File: BM25Similarity.java From lucene-solr with Apache License 2.0

4 votes

/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code> */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  return (float) (collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
}

Source File: BM25Similarity.java From lucene-solr with Apache License 2.0

3 votes

/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor.
 * 
 * <p>
 * The default implementation uses:
 * 
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 * 
 * Note that {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
 * {@link TermStatistics#docFreq()} is used, and when the latter 
 * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 *   
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor 
           and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long docCount = collectionStats.docCount();
  final float idf = idf(df, docCount);
  return Explanation.match(idf, "idf, computed as log(1 + (N - n + 0.5) / (n + 0.5)) from:",
      Explanation.match(df, "n, number of documents containing term"),
      Explanation.match(docCount, "N, total number of documents with field"));
}

Source File: TFIDFSimilarity.java From lucene-solr with Apache License 2.0

3 votes

/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor.
 * 
 * <p>
 * The default implementation uses:
 * 
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 * 
 * Note that {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
 * {@link TermStatistics#docFreq()} is used, and when the latter 
 * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 *   
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor 
           and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long docCount = collectionStats.docCount();
  final float idf = idf(df, docCount);
  return Explanation.match(idf, "idf(docFreq, docCount)", 
      Explanation.match(df, "docFreq, number of documents containing term"),
      Explanation.match(docCount, "docCount, total number of documents with field"));
}

Source File: BM25Similarity.java From lucene4ir with Apache License 2.0

3 votes

/**
 * Computes a score factor for a simple term and returns an explanation
 * for that score factor.
 * 
 * <p>
 * The default implementation uses:
 * 
 * <pre class="prettyprint">
 * idf(docFreq, docCount);
 * </pre>
 * 
 * Note that {@link CollectionStatistics#docCount()} is used instead of
 * {@link org.apache.lucene.index.IndexReader#numDocs() IndexReader#numDocs()} because also 
 * {@link TermStatistics#docFreq()} is used, and when the latter 
 * is inaccurate, so is {@link CollectionStatistics#docCount()}, and in the same direction.
 * In addition, {@link CollectionStatistics#docCount()} does not skew when fields are sparse.
 *   
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the term
 * @return an Explain object that includes both an idf score factor 
           and an explanation for the term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long df = termStats.docFreq();
  final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
  final float idf = idf(df, docCount);
  return Explanation.match(idf, "idf(docFreq=" + df + ", docCount=" + docCount + ")");
}

Source File: BM25Similarity.java From lucene4ir with Apache License 2.0

3 votes

/**
 * Computes a score factor for a phrase.
 * 
 * <p>
 * The default implementation sums the idf factor for
 * each term in the phrase.
 * 
 * @param collectionStats collection-level statistics
 * @param termStats term-level statistics for the terms in the phrase
 * @return an Explain object that includes both an idf 
 *         score factor for the phrase and an explanation 
 *         for each term.
 */
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats[]) {
  final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
  float idf = 0.0f;
  List<Explanation> details = new ArrayList<>();
  for (final TermStatistics stat : termStats ) {
    final long df = stat.docFreq();
    final float termIdf = idf(df, docCount);
    details.add(Explanation.match(termIdf, "idf(docFreq=" + df + ", docCount=" + docCount + ")"));
    idf += termIdf;
  }
  return Explanation.match(idf, "idf(), sum of:", details);
}

Java Code Examples for org.apache.lucene.search.CollectionStatistics#docCount()