org.apache.lucene.search.TermStatistics Java Examples
The following examples show how to use
org.apache.lucene.search.TermStatistics.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: AggregatedDfs.java From Elasticsearch with Apache License 2.0 | 6 votes |
/**
 * Writes this object's state to {@code out}.
 *
 * <p>Layout: the number of term entries, then for each entry the term's field
 * and bytes followed by the statistics' term bytes, document frequency and
 * (via {@code DfsSearchResult.addOne}) total term frequency; then the field
 * statistics and finally {@code maxDoc}.
 *
 * @param out the stream to write to
 * @throws IOException if writing to the stream fails
 */
@Override
public void writeTo(final StreamOutput out) throws IOException {
    out.writeVInt(termStatistics.size());
    for (ObjectObjectCursor<Term, TermStatistics> cursor : termStatistics()) {
        final Term key = cursor.key;
        final TermStatistics value = cursor.value;

        // Term part of the entry.
        out.writeString(key.field());
        out.writeBytesRef(key.bytes());

        // Statistics part of the entry.
        out.writeBytesRef(value.term());
        out.writeVLong(value.docFreq());
        out.writeVLong(DfsSearchResult.addOne(value.totalTermFreq()));
    }
    DfsSearchResult.writeFieldStats(out, fieldStatistics);
    out.writeVLong(maxDoc);
}
Example #2
Source File: SMARTBNNBNNSimilarity.java From lucene4ir with Apache License 2.0 | 6 votes |
/**
 * Builds a {@code TFIDFWeight} for the given field from an idf value and the
 * average document length.
 *
 * <p>For a single term the idf is {@code log(N/n)}; otherwise the idf starts
 * at {@code 1.0} and {@code log(N/n)} is added for every term, where
 * {@code N} is {@code maxDoc} and {@code n} is the term's document frequency.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats statistics of the term(s) being scored
 * @return the weight carrying the field name, idf and average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    final float numDocs = collectionStats.maxDoc();
    final float avgDocLength = collectionStats.sumTotalTermFreq() / numDocs;

    float idf;
    if (termStats.length == 1) {
        final float docFreq = termStats[0].docFreq();
        idf = log(numDocs / docFreq);
    } else {
        idf = 1.0f;
        for (final TermStatistics stat : termStats) {
            final float docFreq = stat.docFreq();
            idf += log(numDocs / docFreq);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf, avgDocLength);
}
Example #3
Source File: DfsSearchResult.java From Elasticsearch with Apache License 2.0 | 6 votes |
/**
 * Reads an array of term statistics from {@code in}.
 *
 * <p>The stream holds a count followed by, per entry, a document frequency
 * and a total term frequency (decoded with {@code subOne}). The term bytes
 * are not read from the stream; they are taken from {@code terms} at the same
 * index.
 *
 * @param in the stream to read from
 * @param terms the terms the statistics belong to, in stream order
 * @return the statistics read, or {@code EMPTY_TERM_STATS} when the count is zero
 * @throws IOException if reading from the stream fails
 */
public static TermStatistics[] readTermStats(StreamInput in, Term[] terms) throws IOException {
    final int count = in.readVInt();
    if (count == 0) {
        return EMPTY_TERM_STATS;
    }

    final TermStatistics[] result = new TermStatistics[count];
    assert terms.length == count;
    for (int idx = 0; idx < count; idx++) {
        final BytesRef termBytes = terms[idx].bytes();
        final long docFreq = in.readVLong();
        assert docFreq >= 0;
        final long totalTermFreq = subOne(in.readVLong());
        result[idx] = new TermStatistics(termBytes, docFreq, totalTermFreq);
    }
    return result;
}
Example #4
Source File: SpanWeight.java From lucene-solr with Apache License 2.0 | 6 votes |
private Similarity.SimScorer buildSimWeight(SpanQuery query, IndexSearcher searcher, Map<Term, TermStates> termStates, float boost) throws IOException { if (termStates == null || termStates.size() == 0 || query.getField() == null) return null; TermStatistics[] termStats = new TermStatistics[termStates.size()]; int termUpTo = 0; for (Map.Entry<Term, TermStates> entry : termStates.entrySet()) { TermStates ts = entry.getValue(); if (ts.docFreq() > 0) { termStats[termUpTo++] = searcher.termStatistics(entry.getKey(), ts.docFreq(), ts.totalTermFreq()); } } CollectionStatistics collectionStats = searcher.collectionStatistics(query.getField()); if (termUpTo > 0) { return similarity.scorer(boost, collectionStats, ArrayUtil.copyOfSubArray(termStats, 0, termUpTo)); } else { return null; // no terms at all exist, we won't use similarity } }
Example #5
Source File: NormValueSource.java From lucene-solr with Apache License 2.0 | 6 votes |
@Override public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException { IndexSearcher searcher = (IndexSearcher)context.get("searcher"); final TFIDFSimilarity similarity = IDFValueSource.asTFIDF(searcher.getSimilarity(), field); if (similarity == null) { throw new UnsupportedOperationException("requires a TFIDFSimilarity (such as ClassicSimilarity)"); } // Only works if the contribution of the tf is 1 when the freq is 1 and contribution of the idf // is 1 when docCount == docFreq == 1 final SimScorer simScorer = similarity.scorer(1f, new CollectionStatistics(field, 1, 1, 1, 1), new TermStatistics(new BytesRef("bogus"), 1, 1)); final LeafSimScorer leafSimScorer = new LeafSimScorer(simScorer, readerContext.reader(), field, true); return new FloatDocValues(this) { int lastDocID = -1; @Override public float floatVal(int docID) throws IOException { if (docID < lastDocID) { throw new AssertionError("docs out of order: lastDocID=" + lastDocID + " docID=" + docID); } lastDocID = docID; return leafSimScorer.score(docID, 1f); } }; }
Example #6
Source File: AssertingSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
@Override public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { assert boost >= 0; assert collectionStats != null; assert termStats.length > 0; for (TermStatistics term : termStats) { assert term != null; } // TODO: check that TermStats is in bounds with respect to collection? e.g. docFreq <= maxDoc SimScorer scorer = delegate.scorer(boost, collectionStats, termStats); assert scorer != null; return new AssertingSimScorer(scorer, boost); }
Example #7
Source File: TestMaxTermFrequency.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Returns a scorer whose score is always zero; all arguments are ignored.
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final SimScorer constantZero = new SimScorer() {
    @Override
    public float score(float freq, long norm) {
      return 0;
    }
  };
  return constantZero;
}
Example #8
Source File: ClassicSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Explains the idf of a single term: the idf value computed from the term's
 * document frequency and the collection's document count, with both inputs
 * attached as detail explanations.
 *
 * @param collectionStats supplies {@code docCount}
 * @param termStats supplies {@code docFreq}
 * @return the idf explanation
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
  final long docFreq = termStats.docFreq();
  final long docCount = collectionStats.docCount();
  final float idfValue = idf(docFreq, docCount);

  final Explanation docFreqDetail =
      Explanation.match(docFreq, "docFreq, number of documents containing term");
  final Explanation docCountDetail =
      Explanation.match(docCount, "docCount, total number of documents with field");

  return Explanation.match(
      idfValue,
      "idf, computed as log((docCount+1)/(docFreq+1)) + 1 from:",
      docFreqDetail,
      docCountDetail);
}
Example #9
Source File: LMSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Fills the usual statistics through the superclass and then stores the
 * collection probability of the current term, as computed by
 * {@code collectionModel}, on the stats object.
 *
 * @param stats the stats to fill; cast to {@code LMStats}
 * @param collectionStats statistics of the field
 * @param termStats statistics of the term
 */
@Override
protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) {
  super.fillBasicStats(stats, collectionStats, termStats);
  // The probability is derived from the stats the superclass just filled.
  ((LMStats) stats).setCollectionProbability(collectionModel.computeProbability(stats));
}
Example #10
Source File: MultiSimilarity.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Asks every similarity in {@code sims} for a scorer with the same arguments
 * and combines the results in a {@code MultiSimScorer}.
 */
@Override
public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final int count = sims.length;
  final SimScorer[] delegates = new SimScorer[count];
  for (int idx = 0; idx < count; ++idx) {
    delegates[idx] = sims[idx].scorer(boost, collectionStats, termStats);
  }
  return new MultiSimScorer(delegates);
}
Example #11
Source File: BM25Similarity.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates a {@code BM25Scorer} from the idf explanation, the average field
 * length and a 256-entry table precomputed from {@code LENGTH_TABLE}.
 *
 * <p>The single-term and multi-term overloads of {@code idfExplain} are
 * selected by the number of term statistics passed in.
 */
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final Explanation idf;
  if (termStats.length == 1) {
    idf = idfExplain(collectionStats, termStats[0]);
  } else {
    idf = idfExplain(collectionStats, termStats);
  }

  final float avgdl = avgFieldLength(collectionStats);

  // One cached factor per entry of LENGTH_TABLE.
  final float[] cache = new float[256];
  for (int slot = 0; slot < cache.length; slot++) {
    cache[slot] = 1f / (k1 * ((1 - b) + b * LENGTH_TABLE[slot] / avgdl));
  }
  return new BM25Scorer(boost, k1, b, idf, avgdl, cache);
}
Example #12
Source File: SimilarityBase.java From lucene-solr with Apache License 2.0 | 5 votes |
/** Fills all member fields defined in {@code BasicStats} in {@code stats}. * Subclasses can override this method to fill additional stats. */ protected void fillBasicStats(BasicStats stats, CollectionStatistics collectionStats, TermStatistics termStats) { // TODO: validate this for real, somewhere else assert termStats.totalTermFreq() <= collectionStats.sumTotalTermFreq(); assert termStats.docFreq() <= collectionStats.sumDocFreq(); // TODO: add sumDocFreq for field (numberOfFieldPostings) stats.setNumberOfDocuments(collectionStats.docCount()); stats.setNumberOfFieldTokens(collectionStats.sumTotalTermFreq()); stats.setAvgFieldLength(collectionStats.sumTotalTermFreq() / (double) collectionStats.docCount()); stats.setDocFreq(termStats.docFreq()); stats.setTotalTermFreq(termStats.totalTermFreq()); }
Example #13
Source File: SimilarityBase.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Builds one {@code BasicSimScorer} per term. A single term yields that scorer
 * directly; any other number of terms yields a {@code MultiSimScorer} over all
 * of them.
 */
@Override
public final SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final int termCount = termStats.length;
  final SimScorer[] perTerm = new SimScorer[termCount];
  for (int idx = 0; idx < termCount; idx++) {
    final BasicStats stats = newStats(collectionStats.field(), boost);
    fillBasicStats(stats, collectionStats, termStats[idx]);
    perTerm[idx] = new BasicSimScorer(stats);
  }
  return termCount == 1 ? perTerm[0] : new MultiSimilarity.MultiSimScorer(perTerm);
}
Example #14
Source File: LRUStatsCache.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Looks up statistics for {@code term} in {@code currentGlobalTermStats},
 * keyed by {@code term.toString()}.
 *
 * <p>On a miss the term is recorded in {@code missingTermStats}, the
 * missing-stats metric is incremented, and the local searcher's statistics
 * are used instead.
 *
 * @return the global statistics if present; otherwise the local statistics,
 *         or {@code null} when {@code localSearcher} is {@code null}
 * @throws IOException if the local lookup fails
 */
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq) throws IOException {
  final TermStats global = currentGlobalTermStats.get(term.toString());
  if (global != null) {
    return global.toTermStatistics();
  }

  log.debug("## Missing global termStats info: {}, using local", term);
  missingTermStats.add(term);
  metrics.missingGlobalTermStats.increment();
  if (localSearcher == null) {
    return null;
  }
  return localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
}
Example #15
Source File: ExactStatsCache.java From lucene-solr with Apache License 2.0 | 5 votes |
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq) throws IOException { TermStats termStats = termStatsCache.get(term.toString()); // TermStats == null is also true if term has no docFreq anyway, // see returnLocalStats, if docFreq == 0, they are not added anyway // Not sure we need a warning here if (termStats == null) { log.debug("Missing global termStats info for term={}, using local stats", term); metrics.missingGlobalTermStats.increment(); return localSearcher != null ? localSearcher.localTermStatistics(term, docFreq, totalTermFreq) : null; } else { return termStats.toTermStatistics(); } }
Example #16
Source File: TestMemoryIndex.java From lucene-solr with Apache License 2.0 | 5 votes |
@Test public void testSimilarities() throws IOException { MemoryIndex mi = new MemoryIndex(); mi.addField("f1", "a long text field that contains many many terms", analyzer); IndexSearcher searcher = mi.createSearcher(); LeafReader reader = (LeafReader) searcher.getIndexReader(); NumericDocValues norms = reader.getNormValues("f1"); assertEquals(0, norms.nextDoc()); float n1 = norms.longValue(); // Norms are re-computed when we change the Similarity mi.setSimilarity(new Similarity() { @Override public long computeNorm(FieldInvertState state) { return 74; } @Override public SimScorer scorer(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) { throw new UnsupportedOperationException(); } }); norms = reader.getNormValues("f1"); assertEquals(0, norms.nextDoc()); float n2 = norms.longValue(); assertTrue(n1 != n2); TestUtil.checkReader(reader); }
Example #17
Source File: AggregatedDfs.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Restores this object's state from {@code in}.
 *
 * <p>Reads an entry count, then per entry a term (field string and bytes) and
 * its statistics (term bytes, document frequency, and a total term frequency
 * decoded with {@code DfsSearchResult.subOne}); then the field statistics and
 * {@code maxDoc}.
 *
 * @param in the stream to read from
 * @throws IOException if reading from the stream fails
 */
@Override
public void readFrom(StreamInput in) throws IOException {
    final int size = in.readVInt();
    termStatistics = HppcMaps.newMap(size);

    // Values are consumed strictly in stream order; do not reorder the reads.
    for (int remaining = size; remaining > 0; remaining--) {
        final Term term = new Term(in.readString(), in.readBytesRef());
        final TermStatistics stats = new TermStatistics(
            in.readBytesRef(),
            in.readVLong(),
            DfsSearchResult.subOne(in.readVLong()));
        termStatistics.put(term, stats);
    }

    fieldStatistics = DfsSearchResult.readFieldStats(in);
    maxDoc = in.readVLong();
}
Example #18
Source File: LindenSimilarity.java From linden with Apache License 2.0 | 5 votes |
/**
 * Explains the idf of a term. The idf value comes from {@code idfManager},
 * looked up by the term's UTF-8 text; the document frequency and
 * {@code maxDoc} appear only in the description.
 *
 * @param collectionStats supplies {@code maxDoc} for the description
 * @param termStats supplies the term text and {@code docFreq}
 * @return the idf explanation
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics termStats) {
    final long docFreq = termStats.docFreq();
    final long maxDocs = collectionStats.maxDoc();
    final String termText = termStats.term().utf8ToString();
    final float idfValue = idfManager.getIDF(termText);
    final String description = "idf(docFreq=" + docFreq + ", maxDocs=" + maxDocs + ")";
    return new Explanation(idfValue, description);
}
Example #19
Source File: TermVectorsWriter.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Writes a term's document frequency and total term frequency using the
 * "potentially negative" variable-length writers. Both values are asserted to
 * be at least -1.
 *
 * @param termStatistics the statistics to write
 * @throws IOException if writing fails
 */
private void writeTermStatistics(TermStatistics termStatistics) throws IOException {
    // docFreq is narrowed to int before being written.
    final int documentFrequency = (int) termStatistics.docFreq();
    assert (documentFrequency >= -1);
    writePotentiallyNegativeVInt(documentFrequency);

    final long totalTermFrequency = termStatistics.totalTermFreq();
    assert (totalTermFrequency >= -1);
    writePotentiallyNegativeVLong(totalTermFrequency);
}
Example #20
Source File: StatsCache.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Returns the superclass's term statistics, after first probing
 * {@code statsSource} (with a {@code null} local searcher) and recording the
 * term as missing when the probe returns {@code null}.
 *
 * @throws IOException if either lookup fails
 */
@Override
public TermStatistics termStatistics(Term term, int docFreq, long totalTermFreq) throws IOException {
  final boolean missingFromSource =
      statsSource.termStatistics(null, term, docFreq, totalTermFreq) == null;
  if (missingFromSource) {
    missingTermStats.accept(term);
    missingTermsCount++;
  }
  return super.termStatistics(term, docFreq, totalTermFreq);
}
Example #21
Source File: OKAPIBM25Similarity.java From lucene4ir with Apache License 2.0 | 5 votes |
/**
 * Builds a {@code TFIDFWeight} for the given field from an idf value and the
 * average document length.
 *
 * <p>{@code N} is the collection's {@code docCount}, falling back to
 * {@code maxDoc} when {@code docCount} is {@code -1}. For a single term the
 * idf is {@code idf(n, N)}; otherwise the idf starts at {@code 1.0} and
 * {@code idf(n, N)} is added for every term, where {@code n} is the term's
 * document frequency.
 *
 * @param collectionStats statistics of the field being scored
 * @param termStats statistics of the term(s) being scored
 * @return the weight carrying the field name, idf and average document length
 */
@Override
public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) {
    long N = collectionStats.docCount();
    if (N == -1) {
        N = collectionStats.maxDoc();
    }

    // Divide in floating point: both operands are long, so a plain '/' would
    // truncate the average document length to a whole number before it is
    // stored in the float. As a consequence, N == 0 now yields NaN/Infinity
    // instead of an ArithmeticException.
    final float avdl = (float) (collectionStats.sumTotalTermFreq() / (double) N);

    float idf_ = 1.0f;
    if (termStats.length == 1) {
        final long n = termStats[0].docFreq();
        idf_ = idf(n, N);
    } else {
        /* computation for a phrase */
        for (final TermStatistics stat : termStats) {
            final long n = stat.docFreq();
            idf_ += idf(n, N);
        }
    }
    return new TFIDFWeight(collectionStats.field(), idf_, avdl);
}
Example #22
Source File: BM25Similarity.java From lucene4ir with Apache License 2.0 | 5 votes |
@Override public final SimWeight computeWeight(CollectionStatistics collectionStats, TermStatistics... termStats) { Explanation idf = termStats.length == 1 ? idfExplain(collectionStats, termStats[0]) : idfExplain(collectionStats, termStats); float avgdl = avgFieldLength(collectionStats); // compute freq-independent part of bm25 equation across all norm values float cache[] = new float[256]; for (int i = 0; i < cache.length; i++) { cache[i] = k1 * ((1 - b) + b * decodeNormValue((byte)i) / avgdl); } return new BM25Stats(collectionStats.field(), idf, avgdl, cache); }
Example #23
Source File: TFSimilarity.java From lumongo with Apache License 2.0 | 4 votes |
/**
 * Returns a {@code TFSimilarity.BooleanWeight} built from {@code boost} only;
 * the collection and term statistics are not used.
 */
@Override
public SimWeight computeWeight(float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
    final SimWeight weight = new TFSimilarity.BooleanWeight(boost);
    return weight;
}
Example #24
Source File: StatsSource.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Supplies the {@code TermStatistics} for a term; implemented by subclasses.
 *
 * <p>NOTE(review): the visible implementations of this signature fall back to
 * {@code localSearcher.localTermStatistics(term, docFreq, totalTermFreq)} and
 * may return {@code null} when {@code localSearcher} is {@code null} — confirm
 * whether that is part of the intended contract.
 *
 * @param localSearcher the local searcher; presumably may be {@code null} — verify against callers
 * @param term the term to look up
 * @param docFreq document frequency passed through to the lookup
 * @param totalTermFreq total term frequency passed through to the lookup
 * @return the statistics for {@code term}
 * @throws IOException if the lookup fails
 */
public abstract TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq) throws IOException;
Example #25
Source File: TestFieldInvertState.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Not supported by this similarity.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public SimScorer scorer(
    float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  throw new UnsupportedOperationException();
}
Example #26
Source File: TestNorms.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Not supported by this similarity.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public SimScorer scorer(
    float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  throw new UnsupportedOperationException();
}
Example #27
Source File: TestUniqueTermCount.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Not supported by this similarity.
 *
 * @throws UnsupportedOperationException always
 */
@Override
public SimScorer scorer(
    float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  throw new UnsupportedOperationException();
}
Example #28
Source File: TestOmitTf.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Returns a fixed explanation with value {@code 1.0} for any set of terms;
 * both arguments are ignored.
 */
@Override
public Explanation idfExplain(CollectionStatistics collectionStats, TermStatistics[] termStats) {
  final Explanation fixed = Explanation.match(1.0f, "Inexplicable");
  return fixed;
}
Example #29
Source File: TestIndexSorting.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Forwards the call unchanged to the wrapped similarity {@code in}.
 */
@Override
public SimScorer scorer(
    float boost, CollectionStatistics collectionStats, TermStatistics... termStats) {
  final SimScorer delegated = in.scorer(boost, collectionStats, termStats);
  return delegated;
}
Example #30
Source File: LocalStatsSource.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Always answers from the local searcher, incrementing the
 * missing-global-term-stats metric on every call.
 *
 * @return {@code localSearcher.localTermStatistics(term, docFreq, totalTermFreq)}
 * @throws IOException if the local lookup fails
 */
@Override
public TermStatistics termStatistics(SolrIndexSearcher localSearcher, Term term, int docFreq, long totalTermFreq) throws IOException {
  // Count the call first so the metric is updated even if the lookup throws.
  metrics.missingGlobalTermStats.increment();
  final TermStatistics local = localSearcher.localTermStatistics(term, docFreq, totalTermFreq);
  return local;
}