Java Code Examples for org.apache.lucene.search.CollectionStatistics#sumTotalTermFreq()
The following examples show how to use
org.apache.lucene.search.CollectionStatistics#sumTotalTermFreq().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: SMARTBNNBNNSimilarity.java From lucene4ir with Apache License 2.0 | 6 votes |
/**
 * Builds the weight for a query made of one or more terms.
 *
 * <p>The collection size is taken from {@code maxDoc()}, and the average document length is
 * {@code sumTotalTermFreq / maxDoc}. For a single term the idf is {@code log(N / docFreq)};
 * for any other number of terms the per-term values are added onto a starting value of
 * {@code 1.0f}.
 *
 * @param collectionStats statistics of the field being searched
 * @param termStats statistics of each query term
 * @return a {@code TFIDFWeight} carrying the field name, the idf and the average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  // Held as floats so that every division below is a floating-point division.
  final float numDocs = collectionStats.maxDoc();
  final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

  float idf;
  if (termStats.length == 1) {
    final float docFreq = termStats[0].docFreq();
    idf = log(numDocs / docFreq);
  } else {
    // Multi-term case: the accumulator deliberately starts at 1.0f, not 0.
    idf = 1.0f;
    for (final TermStatistics stat : termStats) {
      final float docFreq = stat.docFreq();
      idf += log(numDocs / docFreq);
    }
  }
  return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
Example 2
Source File: TermVectorsWriter.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Writes the three field-level statistics in a fixed order: sum of total term frequencies,
 * sum of document frequencies, then document count (narrowed to an {@code int}).
 *
 * <p>Each value is asserted to be {@code >= -1} before it is written.
 *
 * @param fieldStats statistics of the field being serialized
 * @throws IOException declared for the underlying write helpers
 */
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
  final long sumTotalTermFreq = fieldStats.sumTotalTermFreq();
  assert sumTotalTermFreq >= -1;
  writePotentiallyNegativeVLong(sumTotalTermFreq);

  final long sumDocFreq = fieldStats.sumDocFreq();
  assert sumDocFreq >= -1;
  writePotentiallyNegativeVLong(sumDocFreq);

  // The document count is written as an int, so it is narrowed before the range check.
  final int docCount = (int) fieldStats.docCount();
  assert docCount >= -1;
  writePotentiallyNegativeVInt(docCount);
}
Example 3
Source File: CollectionStats.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Copies every value exposed by the given Lucene statistics object into this instance.
 *
 * @param stats source of the field name, maxDoc, docCount, sumTotalTermFreq and sumDocFreq
 */
public CollectionStats(CollectionStatistics stats) {
  // The parameter name does not shadow any field, so the assignments need no "this." prefix.
  field = stats.field();
  maxDoc = stats.maxDoc();
  docCount = stats.docCount();
  sumTotalTermFreq = stats.sumTotalTermFreq();
  sumDocFreq = stats.sumDocFreq();
}
Example 4
Source File: BM25Similarity.java From lucene4ir with Apache License 2.0 | 5 votes |
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>, * or returns <code>1</code> if the index does not store sumTotalTermFreq: * any field that omits frequency information). */ protected float avgFieldLength(CollectionStatistics collectionStats) { final long sumTotalTermFreq = collectionStats.sumTotalTermFreq(); if (sumTotalTermFreq <= 0) { return 1f; // field does not exist, or stat is unsupported } else { final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount(); return (float) (sumTotalTermFreq / (double) docCount); } }
Example 5
Source File: OKAPIBM25Similarity.java From lucene4ir with Apache License 2.0 | 5 votes |
/**
 * Builds the BM25 weight for a query made of one or more terms.
 *
 * <p>The collection size {@code N} is {@code docCount()}, falling back to {@code maxDoc()} when
 * the document count is reported as {@code -1}. The average document length is
 * {@code sumTotalTermFreq / N}, computed in floating point. If {@code sumTotalTermFreq} or
 * {@code N} is not positive, the average falls back to {@code 1}, so it is never zero and the
 * division cannot throw.
 *
 * @param collectionStats statistics of the field being searched
 * @param termStats statistics of each query term
 * @return a {@code TFIDFWeight} carrying the field name, the idf and the average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
  long N = collectionStats.docCount();
  if (N == -1) {
    N = collectionStats.maxDoc();
  }

  // Previously this was a long / long division: the fractional part of the average was lost,
  // an unsupported stat (-1) produced 0, and N == 0 threw ArithmeticException.
  final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
  final float avdl;
  if (sumTotalTermFreq <= 0 || N <= 0) {
    avdl = 1f; // field does not exist, or stat is unsupported
  } else {
    avdl = (float) (sumTotalTermFreq / (double) N);
  }

  float idf_;
  if (termStats.length == 1) {
    idf_ = idf(termStats[0].docFreq(), N);
  } else {
    /* computation for a phrase */
    // NOTE(review): the sum starts at 1.0f rather than 0; kept as in the original — confirm intent.
    idf_ = 1.0f;
    for (final TermStatistics stat : termStats) {
      idf_ += idf(stat.docFreq(), N);
    }
  }
  return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
Example 6
Source File: DumpTermsApp.java From lucene4ir with Apache License 2.0 | 5 votes |
/**
 * Prints token count, document count, summed document frequency and average document length
 * for the ALL, TITLE and CONTENT fields, one line per field, to standard output.
 *
 * @throws IOException if the index statistics cannot be read
 */
public void reportCollectionStatistics() throws IOException {
  IndexSearcher searcher = new IndexSearcher(reader);
  reportFieldStatistics(searcher, "ALL", lucene4ir.Lucene4IRConstants.FIELD_ALL);
  reportFieldStatistics(searcher, "TITLE", lucene4ir.Lucene4IRConstants.FIELD_TITLE);
  reportFieldStatistics(searcher, "CONTENT", lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints the collection statistics of a single field, prefixed with the given label. */
private void reportFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
  CollectionStatistics collectionStats = searcher.collectionStatistics(field);

  long tokenCount = 0;
  long docCount = 0;
  long sumDocCount = 0;
  // Some Lucene versions return null for a field without indexed terms; report zeros then.
  if (collectionStats != null) {
    tokenCount = collectionStats.sumTotalTermFreq();
    docCount = collectionStats.docCount();
    sumDocCount = collectionStats.sumDocFreq();
  }

  // Whole-number average, as before; the guard avoids ArithmeticException when no
  // document has the field (docCount == 0) or the count is unsupported (-1).
  long avgDocLength = docCount > 0 ? tokenCount / docCount : 0;

  System.out.println(label + ": Token count: " + tokenCount + " Doc Count: " + docCount
      + " sum doc: " + sumDocCount + " avg doc len: " + avgDocLength);
}
Example 7
Source File: ExampleStatsApp.java From lucene4ir with Apache License 2.0 | 5 votes |
/**
 * Prints token count, document count, summed document frequency and average document length
 * for the ALL, TITLE and CONTENT fields, one line per field, to standard output.
 *
 * @throws IOException if the index statistics cannot be read
 */
public void reportCollectionStatistics() throws IOException {
  IndexSearcher searcher = new IndexSearcher(reader);
  reportFieldStatistics(searcher, "ALL", Lucene4IRConstants.FIELD_ALL);
  reportFieldStatistics(searcher, "TITLE", Lucene4IRConstants.FIELD_TITLE);
  reportFieldStatistics(searcher, "CONTENT", Lucene4IRConstants.FIELD_CONTENT);
}

/** Prints the collection statistics of a single field, prefixed with the given label. */
private void reportFieldStatistics(IndexSearcher searcher, String label, String field) throws IOException {
  CollectionStatistics collectionStats = searcher.collectionStatistics(field);

  long tokenCount = 0;
  long docCount = 0;
  long sumDocCount = 0;
  // Some Lucene versions return null for a field without indexed terms; report zeros then.
  if (collectionStats != null) {
    tokenCount = collectionStats.sumTotalTermFreq();
    docCount = collectionStats.docCount();
    sumDocCount = collectionStats.sumDocFreq();
  }

  // Whole-number average, as before; the guard avoids ArithmeticException when no
  // document has the field (docCount == 0) or the count is unsupported (-1).
  long avgDocLength = docCount > 0 ? tokenCount / docCount : 0;

  System.out.println(label + ": Token count: " + tokenCount + " Doc Count: " + docCount
      + " sum doc: " + sumDocCount + " avg doc len: " + avgDocLength);
}
Example 8
Source File: BaseSimilarityTestCase.java From lucene-solr with Apache License 2.0 | 4 votes |
/** * returns new random term, that fits within the bounds of the corpus */ static TermStatistics newTerm(Random random, CollectionStatistics corpus) { final long docFreq; switch (random.nextInt(3)) { case 0: // rare term docFreq = 1; break; case 1: // common term docFreq = corpus.docCount(); break; default: // random specificity docFreq = TestUtil.nextLong(random, 1, corpus.docCount()); break; } final long totalTermFreq; // can't require docs to have > 2B tokens long upperBound; try { upperBound = Math.min(corpus.sumTotalTermFreq(), Math.multiplyExact(docFreq, Integer.MAX_VALUE)); } catch (ArithmeticException overflow) { upperBound = corpus.sumTotalTermFreq(); } if (corpus.sumTotalTermFreq() == corpus.sumDocFreq()) { // omitTF totalTermFreq = docFreq; } else { switch (random.nextInt(3)) { case 0: // no repetition totalTermFreq = docFreq; break; case 1: // maximum repetition totalTermFreq = upperBound; break; default: // random repetition totalTermFreq = TestUtil.nextLong(random, docFreq, upperBound); break; } } return new TermStatistics(TERM, docFreq, totalTermFreq); }
Example 9
Source File: BM25Similarity.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Computes the average field length as <code>sumTotalTermFreq / docCount</code>.
 *
 * @param collectionStats statistics of the field
 * @return the quotient, computed in double precision and narrowed to {@code float}
 */
protected float avgFieldLength(CollectionStatistics collectionStats) {
  final long totalTokens = collectionStats.sumTotalTermFreq();
  final double numDocs = (double) collectionStats.docCount();
  final double average = totalTokens / numDocs;
  return (float) average;
}