org.apache.lucene.util.PriorityQueue Java Examples
The following examples show how to use
org.apache.lucene.util.PriorityQueue.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: CandidateScorer.java From Elasticsearch with Apache License 2.0 | 6 votes |
/**
 * Offers a scored candidate path to the queue of corrections.
 *
 * <p>The incoming score is passed through {@link Math#exp(double)} first. A path whose
 * resulting score is not greater than {@code cutoffScore} is ignored. Otherwise the path is
 * added as a new {@code Correction} while the queue holds fewer than
 * {@code maxNumCorrections} entries; once that many are held, the queue's top entry is
 * overwritten in place when it compares lower than the new score/path.
 *
 * @param candidates  the candidate sets, one per position
 * @param path        the currently selected candidate per position
 * @param corrections the queue collecting corrections
 * @param cutoffScore scores must be greater than this value to be considered
 * @param score       the score of {@code path} before exponentiation
 */
private void updateTop(CandidateSet[] candidates, Candidate[] path, PriorityQueue<Correction> corrections,
        double cutoffScore, double score) throws IOException {
    score = Math.exp(score);
    assert Math.abs(score - score(path, candidates)) < 0.00001;

    // Written as a negation on purpose so a NaN score is rejected exactly like before.
    if (!(score > cutoffScore)) {
        return;
    }

    if (corrections.size() < maxNumCorrections) {
        // The path array is reused by the caller, so store a private copy.
        Candidate[] snapshot = new Candidate[candidates.length];
        System.arraycopy(path, 0, snapshot, 0, path.length);
        corrections.add(new Correction(score, snapshot));
        return;
    }

    if (corrections.top().compareTo(score, path) < 0) {
        // Recycle the top entry instead of allocating a new Correction.
        Correction head = corrections.top();
        System.arraycopy(path, 0, head.candidates, 0, path.length);
        head.score = score;
        corrections.updateTop();
    }
}
Example #2
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 6 votes |
/**
 * Drains the given queue into the boolean query, adding one SHOULD term clause per entry.
 *
 * <p>When {@code boost} is enabled each clause is boosted by
 * {@code boostFactor * score / bestScore}, where {@code bestScore} is the score of the
 * first entry popped. Draining stops early if the query rejects a clause with
 * {@code TooManyClauses}.
 */
private void addToQuery(PriorityQueue<ScoreTerm> q, BooleanQuery query) {
    float bestScore = -1;
    for (ScoreTerm entry = q.pop(); entry != null; entry = q.pop()) {
        TermQuery clause = new TermQuery(new Term(entry.topField, entry.word));

        if (boost) {
            if (bestScore == -1) {
                // -1 marks "not set yet": remember the score of the first popped entry
                bestScore = entry.score;
            }
            final float entryScore = entry.score;
            clause.setBoost(boostFactor * entryScore / bestScore);
        }

        try {
            query.add(clause, BooleanClause.Occur.SHOULD);
        } catch (BooleanQuery.TooManyClauses ignore) {
            // the query is full; stop adding further terms
            break;
        }
    }
}
Example #3
Source File: MinShouldMatchSumScorer.java From lucene-solr with Apache License 2.0 | 6 votes |
static long cost(LongStream costs, int numScorers, int minShouldMatch) { // the idea here is the following: a boolean query c1,c2,...cn with minShouldMatch=m // could be rewritten to: // (c1 AND (c2..cn|msm=m-1)) OR (!c1 AND (c2..cn|msm=m)) // if we assume that clauses come in ascending cost, then // the cost of the first part is the cost of c1 (because the cost of a conjunction is // the cost of the least costly clause) // the cost of the second part is the cost of finding m matches among the c2...cn // remaining clauses // since it is a disjunction overall, the total cost is the sum of the costs of these // two parts // If we recurse infinitely, we find out that the cost of a msm query is the sum of the // costs of the num_scorers - minShouldMatch + 1 least costly scorers final PriorityQueue<Long> pq = new PriorityQueue<Long>(numScorers - minShouldMatch + 1) { @Override protected boolean lessThan(Long a, Long b) { return a > b; } }; costs.forEach(pq::insertWithOverflow); return StreamSupport.stream(pq.spliterator(), false).mapToLong(Number::longValue).sum(); }
Example #4
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Builds the queue of scored terms from raw field values.
 *
 * <p>For every name in {@code fieldNames}, each non-null value found in the given map is
 * converted with {@link String#valueOf(Object)} and fed to {@code addTermFrequencies}; the
 * accumulated frequencies are then handed to {@code createQueue}.
 *
 * @param field2fieldValues field name to the values of that field; fields without an entry are skipped
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Map<String, Collection<Object>> field2fieldValues) throws IOException {
    final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();
    for (String fieldName : fieldNames) {
        final Collection<Object> values = field2fieldValues.get(fieldName);
        if (values != null) {
            for (Object value : values) {
                if (value == null) {
                    continue;
                }
                addTermFrequencies(new StringReader(String.valueOf(value)), termFreqsByField, fieldName);
            }
        }
    }
    return createQueue(termFreqsByField);
}
Example #5
Source File: MinimumShouldMatchIntervalsSource.java From lucene-solr with Apache License 2.0 | 6 votes |
/**
 * Wraps every sub-iterator into the DISI queue, sums their match costs, and sets up the
 * two interval queues.
 *
 * @param subs           the sub-iterators to combine
 * @param minShouldMatch capacity of the proximity queue; also stored on this iterator
 */
MinimumShouldMatchIntervalIterator(Collection<IntervalIterator> subs, int minShouldMatch) {
    this.disiQueue = new DisiPriorityQueue(subs.size());
    float totalMatchCost = 0;
    for (IntervalIterator sub : subs) {
        this.disiQueue.add(new DisiWrapper(sub));
        totalMatchCost += sub.matchCost();
    }
    this.approximation = new DisjunctionDISIApproximation(disiQueue);
    this.matchCost = totalMatchCost;
    this.minShouldMatch = minShouldMatch;

    // Ordered by start; on equal starts the interval with the greater-or-equal end is "less".
    this.proximityQueue = new PriorityQueue<IntervalIterator>(minShouldMatch) {
        @Override
        protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
            if (a.start() != b.start()) {
                return a.start() < b.start();
            }
            return a.end() >= b.end();
        }
    };

    // Ordered by end; on equal ends the interval with the greater-or-equal start is "less".
    this.backgroundQueue = new PriorityQueue<IntervalIterator>(subs.size()) {
        @Override
        protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
            if (a.end() != b.end()) {
                return a.end() < b.end();
            }
            return a.start() >= b.start();
        }
    };
}
Example #6
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms */ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Map<String, Int>> field2termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField[] fields = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new StringReader(stringValue), field2termFreqMap, fieldName); } } } else { addTermFrequencies(field2termFreqMap, vector, fieldName); } } return createQueue(field2termFreqMap); }
Example #7
Source File: DisjunctionMatchesIterator.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates the queue of sub-iterators and adds every iterator whose first call to
 * {@code next()} returns {@code true}.
 *
 * @param matches the sub-iterators to merge
 */
private DisjunctionMatchesIterator(List<MatchesIterator> matches) throws IOException {
    queue = new PriorityQueue<MatchesIterator>(matches.size()) {
        @Override
        protected boolean lessThan(MatchesIterator a, MatchesIterator b) {
            // order by start position, then by end position; full ties count as "less"
            if (a.startPosition() != b.startPosition()) {
                return a.startPosition() < b.startPosition();
            }
            return a.endPosition() <= b.endPosition();
        }
    };
    for (MatchesIterator candidate : matches) {
        final boolean hasMatch = candidate.next();
        if (hasMatch) {
            queue.add(candidate);
        }
    }
}
Example #8
Source File: MultiPhraseQuery.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Sets up a position-ordered queue sized for all sub-enums and wraps each sub-enum in a
 * {@code PostingsAndPosition}.
 *
 * @param subs the postings enums to union
 */
UnionFullPostingsEnum(List<PostingsEnum> subs) {
    super(subs);
    this.posQueue = new PriorityQueue<PostingsAndPosition>(subs.size()) {
        @Override
        protected boolean lessThan(PostingsAndPosition a, PostingsAndPosition b) {
            return a.pos < b.pos;
        }
    };
    this.subs = new ArrayList<>();
    subs.forEach(postings -> this.subs.add(new PostingsAndPosition(postings)));
}
Example #9
Source File: DisjunctionScorer.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates the two-phase view over the given approximation.
 *
 * @param approximation the approximation handed to the superclass
 * @param matchCost     the match cost stored on this instance
 */
private TwoPhase(DocIdSetIterator approximation, float matchCost) {
    super(approximation);
    this.matchCost = matchCost;

    // One slot per sub-scorer of the enclosing scorer; lowest match cost on top.
    final int capacity = DisjunctionScorer.this.subScorers.size();
    unverifiedMatches = new PriorityQueue<DisiWrapper>(capacity) {
        @Override
        protected boolean lessThan(DisiWrapper a, DisiWrapper b) {
            return a.matchCost < b.matchCost;
        }
    };
}
Example #10
Source File: CommonTermsQueryTest.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Empties the queue into a list, in the order the queue pops its elements.
 *
 * @param queue the queue to drain; it is empty afterwards
 * @return the popped elements
 */
private static List<TermAndFreq> queueToList(PriorityQueue<TermAndFreq> queue) {
    final List<TermAndFreq> drained = new ArrayList<>();
    for (int remaining = queue.size(); remaining > 0; remaining = queue.size()) {
        drained.add(queue.pop());
    }
    return drained;
}
Example #11
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * Convenience routine to make it easy to return the most interesting words in a document. * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. * * @param r the source document * @param fieldName field passed to analyzer to use when analyzing the content * @return the most interesting words in the document * @see #retrieveTerms(java.io.Reader, String) * @see #setMaxQueryTerms */ public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { ArrayList<String> al = new ArrayList<>(maxQueryTerms); PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName); ScoreTerm scoreTerm; int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... // we just want to return the top words while (((scoreTerm = pq.pop()) != null) && lim-- > 0) { al.add(scoreTerm.word); // the 1st entry is the interesting word } String[] res = new String[al.size()]; return al.toArray(res); }
Example #12
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0 | 5 votes |
/** * @see #retrieveInterestingTerms(java.io.Reader, String) */ public String[] retrieveInterestingTerms(int docNum) throws IOException { ArrayList<String> al = new ArrayList<>(maxQueryTerms); PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum); ScoreTerm scoreTerm; int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... // we just want to return the top words while (((scoreTerm = pq.pop()) != null) && lim-- > 0) { al.add(scoreTerm.word); // the 1st entry is the interesting word } String[] res = new String[al.size()]; return al.toArray(res); }
Example #13
Source File: CandidateScorer.java From Elasticsearch with Apache License 2.0 | 5 votes |
/**
 * Recursively enumerates candidate paths from position {@code ord} onwards and offers every
 * complete path to {@code updateTop}.
 *
 * <p>At each position the original term is tried first. While {@code numMissspellingsLeft}
 * is positive, every alternative candidate of the position is tried as well, each one
 * consuming one misspelling for the positions that follow.
 *
 * @param candidates           the candidate sets, one per position
 * @param path                 scratch array holding the current selection per position
 * @param ord                  the position being filled in
 * @param numMissspellingsLeft how many non-original candidates may still be chosen
 * @param corrections          the queue collecting corrections
 * @param cutoffScore          forwarded to {@code updateTop}
 * @param pathScore            accumulated score of the positions before {@code ord}
 */
public void findCandidates(CandidateSet[] candidates, Candidate[] path, int ord, int numMissspellingsLeft,
        PriorityQueue<Correction> corrections, double cutoffScore, final double pathScore) throws IOException {
    final CandidateSet current = candidates[ord];
    final boolean lastPosition = (ord == candidates.length - 1);
    final boolean mayMisspell = numMissspellingsLeft > 0;

    // The original term is always explored, whatever the remaining budget.
    path[ord] = current.originalTerm;
    if (lastPosition) {
        updateTop(candidates, path, corrections, cutoffScore,
                pathScore + scorer.score(path, candidates, ord, gramSize));
    } else {
        // Choosing the original term does not consume a misspelling.
        final int budget = mayMisspell ? numMissspellingsLeft : 0;
        findCandidates(candidates, path, ord + 1, budget, corrections, cutoffScore,
                pathScore + scorer.score(path, candidates, ord, gramSize));
    }

    if (!mayMisspell) {
        return;
    }

    for (int i = 0; i < current.candidates.length; i++) {
        path[ord] = current.candidates[i];
        if (lastPosition) {
            updateTop(candidates, path, corrections, cutoffScore,
                    pathScore + scorer.score(path, candidates, ord, gramSize));
        } else {
            findCandidates(candidates, path, ord + 1, numMissspellingsLeft - 1, corrections, cutoffScore,
                    pathScore + scorer.score(path, candidates, ord, gramSize));
        }
    }
}
Example #14
Source File: UnorderedIntervalsSource.java From lucene-solr with Apache License 2.0 | 5 votes |
/**
 * Creates the interval queue sized for all sub-iterators and copies the list into an array.
 *
 * @param subIterators the iterators to combine
 */
UnorderedIntervalIterator(List<IntervalIterator> subIterators) {
    super(subIterators);
    // Ordered by start; on equal starts the interval with the greater-or-equal end is "less".
    this.queue = new PriorityQueue<IntervalIterator>(subIterators.size()) {
        @Override
        protected boolean lessThan(IntervalIterator a, IntervalIterator b) {
            if (a.start() != b.start()) {
                return a.start() < b.start();
            }
            return a.end() >= b.end();
        }
    };
    this.subIterators = subIterators.toArray(new IntervalIterator[subIterators.size()]);
}
Example #15
Source File: QualityQueriesFinder.java From lucene-solr with Apache License 2.0 | 5 votes |
private String [] bestTerms(String field,int numTerms) throws IOException { PriorityQueue<TermDf> pq = new TermsDfQueue(numTerms); IndexReader ir = DirectoryReader.open(dir); try { int threshold = ir.maxDoc() / 10; // ignore words too common. Terms terms = MultiTerms.getTerms(ir, field); if (terms != null) { TermsEnum termsEnum = terms.iterator(); while (termsEnum.next() != null) { int df = termsEnum.docFreq(); if (df<threshold) { String ttxt = termsEnum.term().utf8ToString(); pq.insertWithOverflow(new TermDf(ttxt,df)); } } } } finally { ir.close(); } String res[] = new String[pq.size()]; int i = 0; while (pq.size()>0) { TermDf tdf = pq.pop(); res[i++] = tdf.word; System.out.println(i+". word: "+tdf.df+" "+tdf.word); } return res; }
Example #16
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 5 votes |
/** * Convenience routine to make it easy to return the most interesting words in a document. * More advanced users will call {@link #retrieveTerms(Reader, String) retrieveTerms()} directly. * * @param r the source document * @param fieldName field passed to analyzer to use when analyzing the content * @return the most interesting words in the document * @see #retrieveTerms(java.io.Reader, String) * @see #setMaxQueryTerms */ public String[] retrieveInterestingTerms(Reader r, String fieldName) throws IOException { ArrayList<Object> al = new ArrayList<>(maxQueryTerms); PriorityQueue<ScoreTerm> pq = retrieveTerms(r, fieldName); ScoreTerm scoreTerm; int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... // we just want to return the top words while (((scoreTerm = pq.pop()) != null) && lim-- > 0) { al.add(scoreTerm.word); // the 1st entry is the interesting word } String[] res = new String[al.size()]; return al.toArray(res); }
Example #17
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 5 votes |
/** * @see #retrieveInterestingTerms(java.io.Reader, String) */ public String[] retrieveInterestingTerms(int docNum) throws IOException { ArrayList<Object> al = new ArrayList<>(maxQueryTerms); PriorityQueue<ScoreTerm> pq = retrieveTerms(docNum); ScoreTerm scoreTerm; int lim = maxQueryTerms; // have to be careful, retrieveTerms returns all words but that's probably not useful to our caller... // we just want to return the top words while (((scoreTerm = pq.pop()) != null) && lim-- > 0) { al.add(scoreTerm.word); // the 1st entry is the interesting word } String[] res = new String[al.size()]; return al.toArray(res); }
Example #18
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 5 votes |
/** * Find words for a more-like-this query former. * * @param docNum the id of the lucene document from which to find terms */ private PriorityQueue<ScoreTerm> retrieveTerms(int docNum) throws IOException { Map<String, Int> termFreqMap = new HashMap<>(); for (String fieldName : fieldNames) { final Fields vectors = ir.getTermVectors(docNum); final Terms vector; if (vectors != null) { vector = vectors.terms(fieldName); } else { vector = null; } // field does not store term vector info if (vector == null) { Document d = ir.document(docNum); IndexableField fields[] = d.getFields(fieldName); for (IndexableField field : fields) { final String stringValue = field.stringValue(); if (stringValue != null) { addTermFrequencies(new FastStringReader(stringValue), termFreqMap, fieldName); } } } else { addTermFrequencies(termFreqMap, vector, fieldName); } } return createQueue(termFreqMap); }
Example #19
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0 | 4 votes |
/** * Create a PriorityQueue from a word->tf map. * * @param perFieldTermFrequencies a per field map of words keyed on the word(String) with Int objects as the values. */ private PriorityQueue<ScoreTerm> createQueue(Map<String, Map<String, Int>> perFieldTermFrequencies) throws IOException { // have collected all words in doc and their freqs final int limit = Math.min(maxQueryTerms, this.getTermsCount(perFieldTermFrequencies)); FreqQ queue = new FreqQ(limit); // will order words by score for (Map.Entry<String, Map<String, Int>> entry : perFieldTermFrequencies.entrySet()) { Map<String, Int> perWordTermFrequencies = entry.getValue(); String fieldName = entry.getKey(); long numDocs = ir.getDocCount(fieldName); if(numDocs == -1) { numDocs = ir.numDocs(); } for (Map.Entry<String, Int> tfEntry : perWordTermFrequencies.entrySet()) { // for every word String word = tfEntry.getKey(); int tf = tfEntry.getValue().x; // term freq in the source doc if (minTermFreq > 0 && tf < minTermFreq) { continue; // filter out words that don't occur enough times in the source } int docFreq = ir.docFreq(new Term(fieldName, word)); if (minDocFreq > 0 && docFreq < minDocFreq) { continue; // filter out words that don't occur in enough docs } if (docFreq > maxDocFreq) { continue; // filter out words that occur in too many docs } if (docFreq == 0) { continue; // index update problem? } float idf = similarity.idf(docFreq, numDocs); float score = tf * idf; if (queue.size() < limit) { // there is still space in the queue queue.add(new ScoreTerm(word, fieldName, score, idf, docFreq, tf)); } else { ScoreTerm term = queue.top(); if (term.score < score) { // update the smallest in the queue in place and update the queue. term.update(word, fieldName, score, idf, docFreq, tf); queue.updateTop(); } } } } return queue; }
Example #20
Source File: TopDocsCollector.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Creates a collector backed by the given queue.
 *
 * @param pq the queue stored on this collector
 */
protected TopDocsCollector(PriorityQueue<T> pq) {
    this.pq = pq;
}
Example #21
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 4 votes |
/** * Create a PriorityQueue from a word->tf map. * * @param words a map of words keyed on the word(String) with Int objects as the values. * @param fieldNames an array of field names to override defaults. */ private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words, String... fieldNames) throws IOException { // have collected all words in doc and their freqs int numDocs = ir.numDocs(); final int limit = Math.min(maxQueryTerms, words.size()); FreqQ queue = new FreqQ(limit); // will order words by score for (String word : words.keySet()) { // for every word int tf = words.get(word).x; // term freq in the source doc if (minTermFreq > 0 && tf < minTermFreq) { continue; // filter out words that don't occur enough times in the source } // go through all the fields and find the largest document frequency String topField = fieldNames[0]; int docFreq = 0; for (String fieldName : fieldNames) { int freq = ir.docFreq(new Term(fieldName, word)); topField = (freq > docFreq) ? fieldName : topField; docFreq = (freq > docFreq) ? freq : docFreq; } if (minDocFreq > 0 && docFreq < minDocFreq) { continue; // filter out words that don't occur in enough docs } if (docFreq > maxDocFreq) { continue; // filter out words that occur in too many docs } if (docFreq == 0) { continue; // index update problem? } float idf = similarity.idf(docFreq, numDocs); float score = tf * idf; if (queue.size() < limit) { // there is still space in the queue queue.add(new ScoreTerm(word, topField, score, idf, docFreq, tf)); } else { ScoreTerm term = queue.top(); if (term.score < score) { // update the smallest in the queue in place and update the queue. term.update(word, topField, score, idf, docFreq, tf); queue.updateTop(); } } } return queue; }
Example #22
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 4 votes |
/**
 * Create the More like query from a PriorityQueue.
 *
 * @param q the queue of scored terms, handed to {@code addToQuery}
 * @return a new boolean query populated by {@code addToQuery}
 */
private Query createQuery(PriorityQueue<ScoreTerm> q) {
    final BooleanQuery moreLikeThis = new BooleanQuery();
    addToQuery(q, moreLikeThis);
    return moreLikeThis;
}
Example #23
Source File: JustCompileSearch.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Passes the given queue straight to the superclass constructor.
 *
 * @param pq the queue handed to the superclass
 */
protected JustCompileTopDocsCollector(PriorityQueue<ScoreDoc> pq) {
    super(pq);
}
Example #24
Source File: RankQueryTestPlugin.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Passes the given raw-typed queue straight to the superclass constructor.
 *
 * @param pq the queue handed to the superclass
 */
@SuppressWarnings({"unchecked"})
public TestCollector(@SuppressWarnings({"rawtypes"}) PriorityQueue pq) {
    super(pq);
}
Example #25
Source File: RankQueryTestPlugin.java From lucene-solr with Apache License 2.0 | 4 votes |
/**
 * Passes the given raw-typed queue straight to the superclass constructor.
 *
 * @param pq the queue handed to the superclass
 */
@SuppressWarnings({"unchecked"})
public TestCollector1(@SuppressWarnings({"rawtypes"}) PriorityQueue pq) {
    super(pq);
}
Example #26
Source File: MoreLikeThis.java From lucene-solr with Apache License 2.0 | 2 votes |
/**
 * Find words for a more-like-this query former.
 *
 * <p>Term frequencies are gathered from the reader's content for the given field via
 * {@code addTermFrequencies}, and the resulting per-field map is turned into a queue of
 * {@code ScoreTerm} entries by {@code createQueue}.
 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the queue of scored terms built by {@code createQueue}
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
    final Map<String, Map<String, Int>> termFreqsByField = new HashMap<>();
    addTermFrequencies(r, termFreqsByField, fieldName);
    return createQueue(termFreqsByField);
}
Example #27
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 2 votes |
/**
 * Find words for a more-like-this query former.
 *
 * <p>Term frequencies are gathered from the reader's content for the given field via
 * {@code addTermFrequencies}, and the resulting word map is turned into a queue of
 * {@code ScoreTerm} entries by {@code createQueue}.
 * For an easier method to call see {@link #retrieveInterestingTerms retrieveInterestingTerms()}.
 *
 * @param r the reader that has the content of the document
 * @param fieldName field passed to the analyzer to use when analyzing the content
 * @return the queue of scored terms built by {@code createQueue}
 * @see #retrieveInterestingTerms
 */
private PriorityQueue<ScoreTerm> retrieveTerms(Reader r, String fieldName) throws IOException {
    final Map<String, Int> wordFreqs = new HashMap<>();
    addTermFrequencies(r, wordFreqs, fieldName);
    return createQueue(wordFreqs);
}
Example #28
Source File: XMoreLikeThis.java From Elasticsearch with Apache License 2.0 | 2 votes |
/**
 * Create a PriorityQueue from a word->tf map, using this instance's {@code fieldNames}.
 *
 * @param words a map of words keyed on the word(String) with Int objects as the values.
 * @return the queue produced by the overload that takes explicit field names
 */
private PriorityQueue<ScoreTerm> createQueue(Map<String, Int> words) throws IOException {
    // delegate to the field-aware overload with the instance's configured fields
    return createQueue(words, this.fieldNames);
}