org.apache.lucene.search.CollectionStatistics Java Examples
The following examples show how to use
org.apache.lucene.search.CollectionStatistics.
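Before the examples, a brief orientation: a CollectionStatistics bundles per-field, collection-wide statistics (maxDoc, docCount, sumTotalTermFreq, sumDocFreq) and is usually obtained from IndexSearcher.collectionStatistics(field), as several of the examples below do. The following minimal sketch is not taken from any of the projects listed here; the index path, the field name "content", and the class name CollectionStatisticsDemo are placeholders, and it assumes a recent (8.x-era) Lucene where the method may return null for a field with no indexed terms.

import java.nio.file.Paths;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.search.CollectionStatistics;
import org.apache.lucene.search.IndexSearcher;
import org.apache.lucene.store.FSDirectory;

public class CollectionStatisticsDemo {
    public static void main(String[] args) throws Exception {
        // Open an existing index; the path and field name are placeholders.
        try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("/path/to/index")))) {
            IndexSearcher searcher = new IndexSearcher(reader);
            CollectionStatistics stats = searcher.collectionStatistics("content");
            if (stats == null) {
                // Recent Lucene versions return null when the field has no indexed terms.
                System.out.println("No statistics for field 'content'");
                return;
            }
            System.out.println("field=" + stats.field()
                + " maxDoc=" + stats.maxDoc()
                + " docCount=" + stats.docCount()
                + " sumTotalTermFreq=" + stats.sumTotalTermFreq()
                + " sumDocFreq=" + stats.sumDocFreq());
        }
    }
}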
Example #1
Source File: DfsSearchResult.java From Elasticsearch with Apache License 2.0 | 6 votes |
public static ObjectObjectHashMap<String, CollectionStatistics> readFieldStats(StreamInput in,
        ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    final int numFieldStatistics = in.readVInt();
    if (fieldStatistics == null) {
        fieldStatistics = HppcMaps.newNoNullKeysMap(numFieldStatistics);
    }
    for (int i = 0; i < numFieldStatistics; i++) {
        final String field = in.readString();
        assert field != null;
        final long maxDoc = in.readVLong();
        final long docCount = subOne(in.readVLong());
        final long sumTotalTermFreq = subOne(in.readVLong());
        final long sumDocFreq = subOne(in.readVLong());
        CollectionStatistics stats = new CollectionStatistics(field, maxDoc, docCount, sumTotalTermFreq, sumDocFreq);
        fieldStatistics.put(field, stats);
    }
    return fieldStatistics;
}
Example #2
Source File: NormValueSource.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
    IndexSearcher searcher = (IndexSearcher) context.get("searcher");
    final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(), field);
    if (similarity == null) {
        throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)");
    }
    // Only works if the contribution of the tf is 1 when the freq is 1 and contribution of the idf
    // is 1 when docCount == docFreq == 1
    final SimScorer simScorer = similarity.scorer(1f,
        new CollectionStatistics(field, 1, 1, 1, 1),
        new TermStatistics(new BytesRef("bogus"), 1, 1));
    final LeafSimScorer leafSimScorer = new LeafSimScorer(simScorer, readerContext.reader(), field, true);

    return new FloatDocValues(this) {
        int lastDocID = -1;

        @Override
        public float floatVal(int docID) throws IOException {
            if (docID < lastDocID) {
                throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID);
            }
            lastDocID = docID;
            return leafSimScorer.score(docID, 1f);
        }
    };
}
Example #3
Source File: SpanWeight.java From lucene-solr with Apache License 2.0 | 6 votes |
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher,
        Map<Term, TermStates> termStates, float boost) throws IOException {
    if (termStates == null || termStates.size() == 0 || query.getField() == null)
        return null;
    TermStatistics[] termStats = new TermStatistics[termStates.size()];
    int termUpTo = 0;
    for (Map.Entry<Term, TermStates> entry : termStates.entrySet()) {
        TermStates ts = entry.getValue();
        if (ts.docFreq() > 0) {
            termStats[termUpTo++] = searcher.termStatistics(entry.getKey(), ts.docFreq(), ts.totalTermFreq());
        }
    }
    CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField());
    if (termUpTo > 0) {
        return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(termStats, 0, termUpTo));
    } else {
        return null; // no terms at all exist, we won't use similarity
    }
}
Example #4
Source File: SMARTBNNBNNSimilarity.java From lucene4ir with Apache License 2.0 | 6 votes |
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    float N, n, idf, adl;
    idf = 1.0f;
    N = collectionStats.maxDoc();
    adl = collectionStats.sumTotalTermFreq() / N;

    if (termStats.length == 1) {
        n = termStats[0].docFreq();
        idf = log(N / n);
    } else {
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf += log(N / n);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf, adl);
}
Example #5
Source File: AssertingSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    assert boost >= 0;
    assert collectionStats != null;
    assert termStats.length > 0;
    for (TermStatistics term : termStats) {
        assert term != null;
    }
    // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc
    SimScorer scorer = delegate.scorer(boost, collectionStats, termStats);
    assert scorer != null;
    return new AssertingSimScorer(scorer, boost);
}
Example #6
Source File: MultiSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    SimScorer subScorers[] = new SimScorer[sims.length];
    for (int i = 0; i < subScorers.length; i++) {
        subScorers[i] = sims[i].scorer(boost, collectionStats, termStats);
    }
    return new MultiSimScorer(subScorers);
}
Example #7
Source File: LRUStatsCache.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
        throws IOException {
    CollectionStats colStats = currentGlobalColStats.get(field);
    if (colStats == null) {
        log.debug("## Missing global colStats info: {}, using local", field);
        missingColStats.add(field);
        metrics.missingGlobalFieldStats.increment();
        return localSearcher != null ? localSearcher.localCollectionStatistics(field) : null;
    } else {
        return colStats.toCollectionStatistics();
    }
}
Example #8
Source File: ExactStatsCache.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public CollectionStatistics collectionStatistics(SolrIndexSearcher localSearcher, String field)
        throws IOException {
    CollectionStats colStats = colStatsCache.get(field);
    if (colStats == null) {
        log.debug("Missing global colStats info for field={}, using local", field);
        metrics.missingGlobalFieldStats.increment();
        return localSearcher != null ? localSearcher.localCollectionStatistics(field) : null;
    } else {
        return colStats.toCollectionStatistics();
    }
}
Example #9
Source File: TestMaxTermFrequency.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return new SimScorer() {
        @Override
        public float score(float freq, long norm) {
            return 0;
        }
    };
}
Example #10
Source File: BM25Similarity.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
        ? idfExplain(collectionStats, termStats[0])
        : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);

    float[] cache = new float[256];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[i] / avgdl));
    }
    return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
}
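A note on the cache above (not part of the source): cache[i] stores 1 / (k1 * ((1 - b) + b * dl / avgdl)) for each of the 256 encodable document lengths dl in LENGTH_TABLE, so at scoring time the frequency-dependent BM25 factor freq / (freq + k1 * ((1 - b) + b * dl / avgdl)) can be recovered from the cached inverse without recomputing the length normalization for every document.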
Example #11
Source File: StatsCache.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public CollectionStatistics collectionStatistics(String field) throws IOException {
    if (statsSource.collectionStatistics(null, field) == null) {
        missingFieldStats.accept(field);
        missingFieldsCount++;
    }
    return super.collectionStatistics(field);
}
Example #12
Source File: LMSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Computes the collection probability of the current term in addition to the
 * usual statistics.
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    super.fillBasicStats(stats, collectionStats, termStats);
    LMStats lmStats = (LMStats) stats;
    lmStats.setCollectionProbability(collectionModel.computeProbability(stats));
}
Example #13
Source File: SimilarityBase.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Fills all member fields defined in {@code BasicStats} in {@code stats}.
 *  Subclasses can override this method to fill additional stats. */
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
    // TODO: validate this for real, somewhere else
    assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq();
    assert termStats.docFreq() <= collectionStats.sumDocFreq();

    // TODO: add sumDocFreq for field (numberOfFieldPostings)
    stats.setNumberOfDocuments(collectionStats.docCount());
    stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq());
    stats.setAvgFieldLength(collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount());
    stats.setDocFreq(termStats.docFreq());
    stats.setTotalTermFreq(termStats.totalTermFreq());
}
Example #14
Source File: SimilarityBase.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    SimScorer weights[] = new SimScorer[termStats.length];
    for (int i = 0; i < termStats.length; i++) {
        BasicStats stats = newStats(collectionStats.field(), boost);
        fillBasicStats(stats, collectionStats, termStats[i]);
        weights[i] = new BasicSimScorer(stats);
    }
    if (weights.length == 1) {
        return weights[0];
    } else {
        return new MultiSimilarity.MultiSimScorer(weights);
    }
}
Example #15
Source File: CollectionStats.java From lucene-solr with Apache License 2.0 | 5 votes |
public CollectionStats(CollectionStatistics stats) {
    this.field = stats.field();
    this.maxDoc = stats.maxDoc();
    this.docCount = stats.docCount();
    this.sumTotalTermFreq = stats.sumTotalTermFreq();
    this.sumDocFreq = stats.sumDocFreq();
}
Example #16
Source File: TestSimilarityBase.java From lucene-solr with Apache License 2.0 | 5 votes |
private CollectionStatistics toCollectionStats(BasicStats stats) {
    long sumTtf = stats.getNumberOfFieldTokens();
    long sumDf;
    if (sumTtf == -1) {
        sumDf = TestUtil.nextLong(random(), stats.getNumberOfDocuments(), 2L * stats.getNumberOfDocuments());
    } else {
        sumDf = TestUtil.nextLong(random(), Math.min(stats.getNumberOfDocuments(), sumTtf), sumTtf);
    }
    int docCount = Math.toIntExact(Math.min(sumDf, stats.getNumberOfDocuments()));
    int maxDoc = TestUtil.nextInt(random(), docCount, docCount + 10);
    return new CollectionStatistics(stats.field, maxDoc, docCount, sumTtf, sumDf);
}
Example #17
Source File: BM25Similarity.java From lucene4ir with Apache License 2.0 | 5 votes |
/** The default implementation computes the average as <code>sumTotalTermFreq / docCount</code>,
 *  or returns <code>1</code> if the index does not store sumTotalTermFreq
 *  (any field that omits frequency information). */
protected float avgFieldLength(CollectionStatistics collectionStats) {
    final long sumTotalTermFreq = collectionStats.sumTotalTermFreq();
    if (sumTotalTermFreq <= 0) {
        return 1f; // field does not exist, or stat is unsupported
    } else {
        final long docCount = collectionStats.docCount() == -1 ? collectionStats.maxDoc() : collectionStats.docCount();
        return (float) (sumTotalTermFreq / (double) docCount);
    }
}
Example #18
Source File: BM25Similarity.java From lucene4ir with Apache License 2.0 | 5 votes |
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    Explanation idf = termStats.length == 1
        ? idfExplain(collectionStats, termStats[0])
        : idfExplain(collectionStats, termStats);
    float avgdl = avgFieldLength(collectionStats);

    // compute freq-independent part of bm25 equation across all norm values
    float cache[] = new float[256];
    for (int i = 0; i < cache.length; i++) {
        cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte) i) / avgdl);
    }
    return new BM25Stats(collectionStats.field(), idf, avgdl, cache);
}
Example #19
Source File: OKAPIBM25Similarity.java From lucene4ir with Apache License 2.0 | 5 votes |
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    long N, n;
    float idf_, avdl;

    idf_ = 1.0f;
    N = collectionStats.docCount();
    if (N == -1)
        N = collectionStats.maxDoc();
    avdl = collectionStats.sumTotalTermFreq() / N;

    if (termStats.length == 1) {
        n = termStats[0].docFreq();
        idf_ = idf(n, N);
    } else { /* computation for a phrase */
        for (final TermStatistics stat : termStats) {
            n = stat.docFreq();
            idf_ += idf(n, N);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
Example #20
Source File: TestMemoryIndex.java From lucene-solr with Apache License 2.0 | 5 votes |
@Test
public void testSimilarities() throws IOException {
    MemoryIndex mi = new MemoryIndex();
    mi.addField("f1", "a long text field that contains many many terms", analyzer);
    IndexSearcher searcher = mi.createSearcher();
    LeafReader reader = (LeafReader) searcher.getIndexReader();
    NumericDocValues norms = reader.getNormValues("f1");
    assertEquals(0, norms.nextDoc());
    float n1 = norms.longValue();

    // Norms are re-computed when we change the Similarity
    mi.setSimilarity(new Similarity() {
        @Override
        public long computeNorm(FieldInvertState state) {
            return 74;
        }

        @Override
        public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
            throw new UnsupportedOperationException();
        }
    });
    norms = reader.getNormValues("f1");
    assertEquals(0, norms.nextDoc());
    float n2 = norms.longValue();
    assertTrue(n1 != n2);

    TestUtil.checkReader(reader);
}
Example #21
Source File: LindenSimilarity.java From linden with Apache License 2.0 | 5 votes |
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long max = collectionStats.maxDoc();
    final float idf = idfManager.getIDF(termStats.term().utf8ToString());
    return new Explanation(idf, "idf(docFreq=" + df + ", maxDocs=" + max + ")");
}
Example #22
Source File: TermVectorsWriter.java From Elasticsearch with Apache License 2.0 | 5 votes |
private void writeFieldStatistics(CollectionStatistics fieldStats) throws IOException {
    long sttf = fieldStats.sumTotalTermFreq();
    assert (sttf >= -1);
    writePotentiallyNegativeVLong(sttf);

    long sdf = fieldStats.sumDocFreq();
    assert (sdf >= -1);
    writePotentiallyNegativeVLong(sdf);

    int dc = (int) fieldStats.docCount();
    assert (dc >= -1);
    writePotentiallyNegativeVInt(dc);
}
Example #23
Source File: DumpTermsApp.java From lucene4ir with Apache License 2.0 | 5 votes |
public void reportCollectionStatistics() throws IOException {

    IndexSearcher searcher = new IndexSearcher(reader);

    CollectionStatistics collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_ALL);
    long token_count = collectionStats.sumTotalTermFreq();
    long doc_count = collectionStats.docCount();
    long sum_doc_count = collectionStats.sumDocFreq();
    long avg_doc_length = token_count / doc_count;
    System.out.println("ALL: Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_TITLE);
    token_count = collectionStats.sumTotalTermFreq();
    doc_count = collectionStats.docCount();
    sum_doc_count = collectionStats.sumDocFreq();
    avg_doc_length = token_count / doc_count;
    System.out.println("TITLE: Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    collectionStats = searcher.collectionStatistics(lucene4ir.Lucene4IRConstants.FIELD_CONTENT);
    token_count = collectionStats.sumTotalTermFreq();
    doc_count = collectionStats.docCount();
    sum_doc_count = collectionStats.sumDocFreq();
    avg_doc_length = token_count / doc_count;
    System.out.println("CONTENT: Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
Example #24
Source File: DfsSearchResult.java From Elasticsearch with Apache License 2.0 | 5 votes |
public static void writeFieldStats(StreamOutput out,
        ObjectObjectHashMap<String, CollectionStatistics> fieldStatistics) throws IOException {
    out.writeVInt(fieldStatistics.size());

    for (ObjectObjectCursor<String, CollectionStatistics> c : fieldStatistics) {
        out.writeString(c.key);
        CollectionStatistics statistics = c.value;
        assert statistics.maxDoc() >= 0;
        out.writeVLong(statistics.maxDoc());
        out.writeVLong(addOne(statistics.docCount()));
        out.writeVLong(addOne(statistics.sumTotalTermFreq()));
        out.writeVLong(addOne(statistics.sumDocFreq()));
    }
}
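The addOne/subOne helpers paired between writeFieldStats above and readFieldStats in Example #1 are not shown on this page. The sketch below is only an assumption about how such a pair is typically implemented: shifting by one so that the -1 sentinel ("statistic not available") still encodes as a non-negative variable-length long.

// Hypothetical sketch of the helpers referenced above; the real implementations
// live in DfsSearchResult and may differ in detail.
static long addOne(long value) {
    assert value + 1 >= 0;  // values are expected to be >= -1
    return value + 1;       // -1 (not available) becomes 0, which a VLong can encode
}

static long subOne(long value) {
    assert value >= 0;      // the encoded form is always non-negative
    return value - 1;       // 0 decodes back to the -1 sentinel
}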
Example #25
Source File: ExampleStatsApp.java From lucene4ir with Apache License 2.0 | 5 votes |
public void reportCollectionStatistics() throws IOException {

    IndexSearcher searcher = new IndexSearcher(reader);

    CollectionStatistics collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_ALL);
    long token_count = collectionStats.sumTotalTermFreq();
    long doc_count = collectionStats.docCount();
    long sum_doc_count = collectionStats.sumDocFreq();
    long avg_doc_length = token_count / doc_count;
    System.out.println("ALL: Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_TITLE);
    token_count = collectionStats.sumTotalTermFreq();
    doc_count = collectionStats.docCount();
    sum_doc_count = collectionStats.sumDocFreq();
    avg_doc_length = token_count / doc_count;
    System.out.println("TITLE: Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);

    collectionStats = searcher.collectionStatistics(Lucene4IRConstants.FIELD_CONTENT);
    token_count = collectionStats.sumTotalTermFreq();
    doc_count = collectionStats.docCount();
    sum_doc_count = collectionStats.sumDocFreq();
    avg_doc_length = token_count / doc_count;
    System.out.println("CONTENT: Token count: " + token_count + " Doc Count: " + doc_count
        + " sum doc: " + sum_doc_count + " avg doc len: " + avg_doc_length);
}
Example #26
Source File: ClassicSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long df = termStats.docFreq();
    final long docCount = collectionStats.docCount();
    final float idf = idf(df, docCount);
    return Explanation.match(idf, "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
        Explanation.match(df, "docFreq, number of documents containing term"),
        Explanation.match(docCount, "docCount, total number of documents with field"));
}
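A quick worked example (not from the source): with docCount = 100 and docFreq = 10, this idf evaluates to log((100 + 1) / (10 + 1)) + 1 = ln(101 / 11) + 1 ≈ 3.22.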
Example #27
Source File: TestOmitTf.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
    return Explanation.match(1.0f, "Inexplicable");
}
Example #28
Source File: TestIndexSorting.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return in.scorer(boost, collectionStats, termStats);
}
Example #29
Source File: TestUniqueTermCount.java From lucene-solr with Apache License 2.0 | 4 votes |
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    throw new UnsupportedOperationException();
}
Example #30
Source File: TFSimilarity.java From lumongo with Apache License 2.0 | 4 votes |
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    return new TFSimilarity.BooleanWeight(boost);
}