Java Code Examples for org.apache.lucene.index.Terms#size()
The following examples show how to use org.apache.lucene.index.Terms#size().
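A note on the contract before the examples: Terms#size() returns the number of unique terms in a field, or -1 when that count is not available (typically on merged, multi-segment views where computing it would be costly), so callers must guard against both a null Terms instance and a -1 size. Below is a minimal sketch of that defensive pattern; the Directory dir and the field name "text" are assumptions, not taken from any example:

import java.io.IOException;

import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.IndexReader;
import org.apache.lucene.index.MultiTerms;
import org.apache.lucene.index.Terms;
import org.apache.lucene.store.Directory;

// Minimal sketch: count the unique terms of a field, treating -1 as "unknown".
static long countUniqueTerms(Directory dir, String field) throws IOException {
  try (IndexReader reader = DirectoryReader.open(dir)) {
    Terms terms = MultiTerms.getTerms(reader, field); // null if the field has no indexed terms
    if (terms == null) {
      return 0;
    }
    return terms.size(); // unique term count, or -1 when the codec cannot compute it
  }
}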
Example 1
Source File: DocToDoubleVectorUtils.java (from lucene-solr, Apache License 2.0)
/**
 * create a sparse <code>Double</code> vector given doc and field term vectors
 * using local frequency of the terms in the doc
 *
 * @param docTerms term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  Double[] freqVector = null;
  if (docTerms != null && fieldTerms.size() > -1) {
    freqVector = new Double[(int) fieldTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    BytesRef term;
    while ((term = fieldTermsEnum.next()) != null) {
      TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
      if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
        docTermsEnum = docTerms.iterator();
      }
      if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
        long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
        freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      } else {
        freqVector[i] = 0d;
      }
      i++;
    }
  }
  return freqVector;
}
Example 2
Source File: DocToDoubleVectorUtils.java (from lucene-solr, Apache License 2.0)
/**
 * create a dense <code>Double</code> vector given doc and field term vectors
 * using local frequency of the terms in the doc
 *
 * @param docTerms term vectors for a given document
 * @return a dense vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toDenseLocalFreqDoubleArray(Terms docTerms) throws IOException {
  Double[] freqVector = null;
  if (docTerms != null) {
    freqVector = new Double[(int) docTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    while (docTermsEnum.next() != null) {
      long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
      freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      i++;
    }
  }
  return freqVector;
}
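A hedged usage sketch tying the two utilities above together; reader is an assumed open IndexReader over an index whose "text" field was indexed with term vectors (Example 6 below shows the same pattern as a test):

// Sketch only: "reader" and the "text" field are assumptions.
Terms fieldTerms = MultiTerms.getTerms(reader, "text"); // inverted-index terms for the field
for (int docId = 0; docId < reader.maxDoc(); docId++) {
  Terms docTerms = reader.getTermVector(docId, "text"); // null if the doc has no term vector
  Double[] sparse = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
  Double[] dense = DocToDoubleVectorUtils.toDenseLocalFreqDoubleArray(docTerms);
  // Both methods return null when docTerms is null; the sparse variant additionally
  // requires fieldTerms.size() != -1.
}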
Example 3
Source File: GroupByOptimizedIterator.java (from crate, Apache License 2.0)
static boolean hasHighCardinalityRatio(Supplier<Engine.Searcher> acquireSearcher, String fieldName) {
  // acquire separate searcher:
  // Can't use sharedShardContexts() yet, if we bail out the "getOrCreateContext"
  // causes issues later on in the fallback logic
  try (Engine.Searcher searcher = acquireSearcher.get()) {
    for (LeafReaderContext leaf : searcher.reader().leaves()) {
      Terms terms = leaf.reader().terms(fieldName);
      if (terms == null) {
        return true;
      }
      double cardinalityRatio = terms.size() / (double) leaf.reader().numDocs();
      if (cardinalityRatio > CARDINALITY_RATIO_THRESHOLD) {
        return true;
      }
    }
  } catch (IOException e) {
    return true;
  }
  return false;
}
Example 4
Source File: WordScorer.java (from Elasticsearch, Apache License 2.0)
public WordScorer(IndexReader reader, Terms terms, String field, double realWordLikelyHood, BytesRef separator) throws IOException {
  this.field = field;
  if (terms == null) {
    throw new IllegalArgumentException("Field: [" + field + "] does not exist");
  }
  this.terms = terms;
  final long vocSize = terms.getSumTotalTermFreq();
  this.vocabluarySize = vocSize == -1 ? reader.maxDoc() : vocSize;
  this.useTotalTermFreq = vocSize != -1;
  this.numTerms = terms.size();
  this.termsEnum = new FreqTermsEnum(reader, field, !useTotalTermFreq, useTotalTermFreq, null, BigArrays.NON_RECYCLING_INSTANCE); // non recycling for now
  this.reader = reader;
  this.realWordLikelyhood = realWordLikelyHood;
  this.separator = separator;
}
Example 5
Source File: TermVectorsResponse.java (from Elasticsearch, Apache License 2.0)
private void buildField(XContentBuilder builder, final CharsRefBuilder spare, Fields theFields, Iterator<String> fieldIter) throws IOException {
  String fieldName = fieldIter.next();
  builder.startObject(fieldName);
  Terms curTerms = theFields.terms(fieldName);
  // write field statistics
  buildFieldStatistics(builder, curTerms);
  builder.startObject(FieldStrings.TERMS);
  TermsEnum termIter = curTerms.iterator();
  BoostAttribute boostAtt = termIter.attributes().addAttribute(BoostAttribute.class);
  for (int i = 0; i < curTerms.size(); i++) {
    buildTerm(builder, spare, curTerms, termIter, boostAtt);
  }
  builder.endObject();
  builder.endObject();
}
Example 6
Source File: DocToDoubleVectorUtilsTest.java (from lucene-solr, Apache License 2.0)
@Test
public void testSparseFreqDoubleArrayConversion() throws Exception {
  Terms fieldTerms = MultiTerms.getTerms(index, "text");
  if (fieldTerms != null && fieldTerms.size() != -1) {
    IndexSearcher indexSearcher = new IndexSearcher(index);
    for (ScoreDoc scoreDoc : indexSearcher.search(new MatchAllDocsQuery(), Integer.MAX_VALUE).scoreDocs) {
      Terms docTerms = index.getTermVector(scoreDoc.doc, "text");
      Double[] vector = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
      assertNotNull(vector);
      assertTrue(vector.length > 0);
    }
  }
}
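This test assumes the index was built with term vectors enabled on the "text" field; without them, getTermVector(...) returns null. A minimal indexing sketch under that assumption (writer is a hypothetical IndexWriter):

import org.apache.lucene.document.Document;
import org.apache.lucene.document.Field;
import org.apache.lucene.document.FieldType;
import org.apache.lucene.document.TextField;

// Sketch: "writer" is an assumed IndexWriter.
FieldType withVectors = new FieldType(TextField.TYPE_NOT_STORED);
withVectors.setStoreTermVectors(true); // required for IndexReader#getTermVector to return data
withVectors.freeze();

Document doc = new Document();
doc.add(new Field("text", "the quick brown fox", withVectors));
writer.addDocument(doc);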
Example 7
Source File: TestBlockPostingsFormat3.java (from lucene-solr, Apache License 2.0)
/**
 * checks collection-level statistics on Terms
 */
public void assertTermsStatistics(Terms leftTerms, Terms rightTerms) throws Exception {
  assertEquals(leftTerms.getDocCount(), rightTerms.getDocCount());
  assertEquals(leftTerms.getSumDocFreq(), rightTerms.getSumDocFreq());
  if (leftTerms.hasFreqs() && rightTerms.hasFreqs()) {
    assertEquals(leftTerms.getSumTotalTermFreq(), rightTerms.getSumTotalTermFreq());
  }
  if (leftTerms.size() != -1 && rightTerms.size() != -1) {
    assertEquals(leftTerms.size(), rightTerms.size());
  }
}
Example 8
Source File: TermsSet.java (from lucene4ir, Apache License 2.0)
private Set<String> getTerms(IndexReader ir) {
  Set<String> t = new HashSet<>();
  for (int i = 0; i < ir.leaves().size(); i++) {
    Terms termsList;
    try {
      // Get all the terms at this level of the tree.
      termsList = ir.leaves().get(i).reader().terms(Lucene4IRConstants.FIELD_ALL);
      if (termsList != null && termsList.size() > 0) {
        TermsEnum te = termsList.iterator();
        BytesRef termBytes;
        while ((termBytes = te.next()) != null) {
          t.add(termBytes.utf8ToString());
        }
      }
      // Get all the terms at the next level of the tree.
      if (ir.leaves().get(i).children() != null && ir.leaves().get(i).children().size() > 0) {
        for (IndexReaderContext c : ir.leaves().get(i).children()) {
          t.addAll(getTerms(c.reader()));
        }
      }
    } catch (IOException e) {
      e.printStackTrace();
    }
  }
  return t;
}
Example 9
Source File: LookupBuilderReducer.java (from incubator-retired-blur, Apache License 2.0)
private long getTotalNumberOfRowIds(DirectoryReader reader) throws IOException {
  long total = 0;
  List<AtomicReaderContext> leaves = reader.leaves();
  for (AtomicReaderContext context : leaves) {
    AtomicReader atomicReader = context.reader();
    Terms terms = atomicReader.terms(BlurConstants.ROW_ID);
    long expectedInsertions = terms.size();
    if (expectedInsertions < 0) {
      return -1;
    }
    total += expectedInsertions;
  }
  return total;
}
Example 10
Source File: VectorScoreQuery.java (from solr-vector-scoring, Apache License 2.0)
@Override
protected CustomScoreProvider getCustomScoreProvider(LeafReaderContext context) throws IOException {
  return new CustomScoreProvider(context) {
    @Override
    public float customScore(int docID, float subQueryScore, float valSrcScore) throws IOException {
      float score = 0;
      double docVectorNorm = 0;
      LeafReader reader = context.reader();
      Terms terms = reader.getTermVector(docID, field);
      if (vector.size() != terms.size()) {
        throw new SolrException(SolrException.ErrorCode.BAD_REQUEST, "indexed and input vector array must have same length");
      }
      TermsEnum iter = terms.iterator();
      BytesRef text;
      while ((text = iter.next()) != null) {
        String term = text.utf8ToString();
        float payloadValue = 0f;
        PostingsEnum postings = iter.postings(null, PostingsEnum.ALL);
        while (postings.nextDoc() != DocIdSetIterator.NO_MORE_DOCS) {
          int freq = postings.freq();
          while (freq-- > 0) postings.nextPosition();
          BytesRef payload = postings.getPayload();
          payloadValue = PayloadHelper.decodeFloat(payload.bytes, payload.offset);
          if (cosine) docVectorNorm += Math.pow(payloadValue, 2.0);
        }
        score = (float) (score + payloadValue * (vector.get(Integer.parseInt(term))));
      }
      if (cosine) {
        if ((docVectorNorm == 0) || (queryVectorNorm == 0)) return 0f;
        return (float) (score / (Math.sqrt(docVectorNorm) * Math.sqrt(queryVectorNorm)));
      }
      return score;
    }
  };
}
Example 11
Source File: PhraseWildcardQuery.java (from lucene-solr, Apache License 2.0)
private long getTermsSize(LeafReaderContext leafReaderContext) throws IOException {
  Terms terms = leafReaderContext.reader().terms(field);
  return terms == null ? 0 : terms.size();
}
Example 12
Source File: FieldCacheImpl.java (from lucene-solr, Apache License 2.0)
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final PagedBytes bytes = new PagedBytes(15);
  int startTermsBPV;
  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }
  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
  int termOrd = 0;
  // TODO: use Uninvert?
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1 + termOrd);
      }
      termOrd++;
    }
  }
  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(), docToTermOrd.getMutable(), termOrd);
}
Example 13
Source File: FieldCacheImpl.java (from lucene-solr, Apache License 2.0)
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final int termCountHardLimit = maxDoc;
  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);
  int startBPV;
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }
  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());
  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }
  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }

    @Override
    public int length() {
      return maxDoc;
    }
  };
  wrapper.setDocsWithField(reader, key.field, docsWithField, null);
  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}