Java Code Examples for org.apache.lucene.index.LeafReader#terms()
The following examples show how to use org.apache.lucene.index.LeafReader#terms(). Each example is taken from an open-source project; the source file, project, and license are noted above the code.
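Before the project examples, here is a minimal, self-contained sketch of the pattern they all share: ask a per-segment LeafReader for the Terms of a field, null-check the result (the field may not be indexed in that segment), and walk the terms with a TermsEnum. The index path "index" and the field name "title" are placeholder assumptions used only for illustration.

import java.nio.file.Paths;
import org.apache.lucene.index.DirectoryReader;
import org.apache.lucene.index.LeafReaderContext;
import org.apache.lucene.index.Terms;
import org.apache.lucene.index.TermsEnum;
import org.apache.lucene.store.FSDirectory;
import org.apache.lucene.util.BytesRef;

public class ListFieldTerms {
  public static void main(String[] args) throws Exception {
    // "index" and "title" are placeholder values for this sketch
    try (DirectoryReader reader = DirectoryReader.open(FSDirectory.open(Paths.get("index")))) {
      for (LeafReaderContext ctx : reader.leaves()) {
        Terms terms = ctx.reader().terms("title");
        if (terms == null) {
          continue; // field has no indexed terms in this segment
        }
        TermsEnum te = terms.iterator();
        BytesRef term;
        while ((term = te.next()) != null) {
          System.out.println(term.utf8ToString() + " (docFreq=" + te.docFreq() + ")");
        }
      }
    }
  }
}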
Example 1
Source File: GeoPointArrayIndexFieldData.java From Elasticsearch with Apache License 2.0
@Override
public AtomicGeoPointFieldData loadDirect(LeafReaderContext context) throws Exception {
  LeafReader reader = context.reader();
  Terms terms = reader.terms(getFieldNames().indexName());
  AtomicGeoPointFieldData data = null;
  // TODO: Use an actual estimator to estimate before loading.
  NonEstimatingEstimator estimator = new NonEstimatingEstimator(breakerService.getBreaker(CircuitBreaker.FIELDDATA));
  if (terms == null) {
    data = AbstractAtomicGeoPointFieldData.empty(reader.maxDoc());
    estimator.afterLoad(null, data.ramBytesUsed());
    return data;
  }
  return (Version.indexCreated(indexSettings).before(Version.V_2_2_0)) ?
      loadLegacyFieldData(reader, estimator, terms, data) :
      loadFieldData22(reader, estimator, terms, data);
}
Example 2
Source File: CodecCollector.java From mtas with Apache License 2.0
/**
 * Collect collection.
 *
 * @param reader the reader
 * @param docSet the doc set
 * @param collectionInfo the collection info
 * @throws IOException Signals that an I/O exception has occurred.
 */
public static void collectCollection(IndexReader reader, List<Integer> docSet,
    ComponentCollection collectionInfo) throws IOException {
  if (collectionInfo.action().equals(ComponentCollection.ACTION_CHECK)) {
    // can't do anything in lucene for check
  } else if (collectionInfo.action().equals(ComponentCollection.ACTION_LIST)) {
    // can't do anything in lucene for list
  } else if (collectionInfo.action().equals(ComponentCollection.ACTION_CREATE)) {
    BytesRef term = null;
    PostingsEnum postingsEnum = null;
    Integer docId;
    Integer termDocId = -1;
    Terms terms;
    LeafReaderContext lrc;
    LeafReader r;
    ListIterator<LeafReaderContext> iterator = reader.leaves().listIterator();
    while (iterator.hasNext()) {
      lrc = iterator.next();
      r = lrc.reader();
      for (String field : collectionInfo.fields()) {
        if ((terms = r.terms(field)) != null) {
          TermsEnum termsEnum = terms.iterator();
          while ((term = termsEnum.next()) != null) {
            Iterator<Integer> docIterator = docSet.iterator();
            postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
            termDocId = -1;
            while (docIterator.hasNext()) {
              docId = docIterator.next() - lrc.docBase;
              if ((docId >= termDocId)
                  && ((docId.equals(termDocId))
                      || ((termDocId = postingsEnum.advance(docId)).equals(docId)))) {
                collectionInfo.addValue(term.utf8ToString());
                break;
              }
              if (termDocId.equals(PostingsEnum.NO_MORE_DOCS)) {
                break;
              }
            }
          }
        }
      }
    }
  }
}
Example 3
Source File: ShardSplittingQuery.java From crate with Apache License 2.0
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard,
    LeafReader leafReader, IntConsumer consumer) throws IOException {
  Terms terms = leafReader.terms(idField);
  TermsEnum iterator = terms.iterator();
  BytesRef idTerm;
  PostingsEnum postingsEnum = null;
  while ((idTerm = iterator.next()) != null) {
    if (includeInShard.test(idTerm) == false) {
      postingsEnum = iterator.postings(postingsEnum);
      int doc;
      while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        consumer.accept(doc);
      }
    }
  }
}
Example 4
Source File: AbstractPrefixTreeQuery.java From lucene-solr with Apache License 2.0
public BaseTermsEnumTraverser(LeafReaderContext context) throws IOException {
  this.context = context;
  LeafReader reader = context.reader();
  this.maxDoc = reader.maxDoc();
  terms = reader.terms(fieldName);
  if (terms != null) {
    this.termsEnum = terms.iterator();
  } else {
    this.termsEnum = null;
  }
}
Example 5
Source File: CompletionWeight.java From lucene-solr with Apache License 2.0
@Override
public BulkScorer bulkScorer(final LeafReaderContext context) throws IOException {
  final LeafReader reader = context.reader();
  final Terms terms;
  final NRTSuggester suggester;
  if ((terms = reader.terms(completionQuery.getField())) == null) {
    return null;
  }
  if (terms instanceof CompletionTerms) {
    CompletionTerms completionTerms = (CompletionTerms) terms;
    if ((suggester = completionTerms.suggester()) == null) {
      // a segment can have a null suggester
      // i.e. no FST was built
      return null;
    }
  } else {
    throw new IllegalArgumentException(completionQuery.getField() + " is not a SuggestField");
  }
  BitsProducer filter = completionQuery.getFilter();
  Bits filteredDocs = null;
  if (filter != null) {
    filteredDocs = filter.getBits(context);
    if (filteredDocs.getClass() == Bits.MatchNoBits.class) {
      return null;
    }
  }
  return new CompletionScorer(this, suggester, reader, filteredDocs, filter != null, automaton);
}
Example 6
Source File: DirectoryTaxonomyWriter.java From lucene-solr with Apache License 2.0
/**
 * Takes the categories from the given taxonomy directory, and adds the
 * missing ones to this taxonomy. Additionally, it fills the given
 * {@link OrdinalMap} with a mapping from the original ordinal to the new
 * ordinal.
 */
public void addTaxonomy(Directory taxoDir, OrdinalMap map) throws IOException {
  ensureOpen();
  DirectoryReader r = DirectoryReader.open(taxoDir);
  try {
    final int size = r.numDocs();
    final OrdinalMap ordinalMap = map;
    ordinalMap.setSize(size);
    int base = 0;
    PostingsEnum docs = null;
    for (final LeafReaderContext ctx : r.leaves()) {
      final LeafReader ar = ctx.reader();
      final Terms terms = ar.terms(Consts.FULL);
      // TODO: share per-segment TermsEnum here!
      TermsEnum te = terms.iterator();
      while (te.next() != null) {
        FacetLabel cp = new FacetLabel(FacetsConfig.stringToPath(te.term().utf8ToString()));
        final int ordinal = addCategory(cp);
        docs = te.postings(docs, PostingsEnum.NONE);
        ordinalMap.addMapping(docs.nextDoc() + base, ordinal);
      }
      base += ar.maxDoc(); // no deletions, so we're ok
    }
    ordinalMap.addDone();
  } finally {
    r.close();
  }
}
Example 7
Source File: TermFilteredPresearcher.java From lucene-solr with Apache License 2.0
private Query buildFilterClause(LeafReader reader, String field) throws IOException {
  Terms terms = reader.terms(field);
  if (terms == null)
    return null;
  BooleanQuery.Builder bq = new BooleanQuery.Builder();
  int docsInBatch = reader.maxDoc();
  BytesRef term;
  TermsEnum te = terms.iterator();
  while ((term = te.next()) != null) {
    // we need to check that every document in the batch has the same field values, otherwise
    // this filtering will not work
    if (te.docFreq() != docsInBatch)
      throw new IllegalArgumentException("Some documents in this batch do not have a term value of "
          + field + ":" + Term.toString(term));
    bq.add(new TermQuery(new Term(field, BytesRef.deepCopyOf(term))), BooleanClause.Occur.SHOULD);
  }
  BooleanQuery built = bq.build();
  if (built.clauses().size() == 0)
    return null;
  return built;
}
Example 8
Source File: CompletionQuery.java From lucene-solr with Apache License 2.0
@Override
public Query rewrite(IndexReader reader) throws IOException {
  byte type = 0;
  boolean first = true;
  Terms terms;
  for (LeafReaderContext context : reader.leaves()) {
    LeafReader leafReader = context.reader();
    try {
      if ((terms = leafReader.terms(getField())) == null) {
        continue;
      }
    } catch (IOException e) {
      continue;
    }
    if (terms instanceof CompletionTerms) {
      CompletionTerms completionTerms = (CompletionTerms) terms;
      byte t = completionTerms.getType();
      if (first) {
        type = t;
        first = false;
      } else if (type != t) {
        throw new IllegalStateException(getField() + " has values of multiple types");
      }
    }
  }
  if (first == false) {
    if (this instanceof ContextQuery) {
      if (type == SuggestField.TYPE) {
        throw new IllegalStateException(this.getClass().getSimpleName()
            + " can not be executed against a non context-enabled SuggestField: " + getField());
      }
    } else {
      if (type == ContextSuggestField.TYPE) {
        return new ContextQuery(this);
      }
    }
  }
  return super.rewrite(reader);
}
Example 9
Source File: TermFilteredPresearcher.java From lucene-solr with Apache License 2.0
@Override
public final Query buildQuery(LeafReader reader, BiPredicate<String, BytesRef> termAcceptor) {
  try {
    DocumentQueryBuilder queryBuilder = getQueryBuilder();
    for (FieldInfo field : reader.getFieldInfos()) {
      Terms terms = reader.terms(field.name);
      if (terms == null) {
        continue;
      }
      TokenStream ts = new TermsEnumTokenStream(terms.iterator());
      for (CustomQueryHandler handler : queryHandlers) {
        ts = handler.wrapTermStream(field.name, ts);
      }
      ts = new FilteringTokenFilter(ts) {
        TermToBytesRefAttribute termAtt = addAttribute(TermToBytesRefAttribute.class);
        @Override
        protected boolean accept() {
          return filterFields.contains(field.name) == false
              && termAcceptor.test(field.name, termAtt.getBytesRef());
        }
      };
      TermToBytesRefAttribute termAtt = ts.addAttribute(TermToBytesRefAttribute.class);
      while (ts.incrementToken()) {
        queryBuilder.addTerm(field.name, BytesRef.deepCopyOf(termAtt.getBytesRef()));
      }
      ts.close();
    }
    Query presearcherQuery = queryBuilder.build();
    BooleanQuery.Builder bq = new BooleanQuery.Builder();
    bq.add(presearcherQuery, BooleanClause.Occur.SHOULD);
    bq.add(new TermQuery(new Term(ANYTOKEN_FIELD, ANYTOKEN)), BooleanClause.Occur.SHOULD);
    presearcherQuery = bq.build();
    if (filterFields.isEmpty() == false) {
      bq = new BooleanQuery.Builder();
      bq.add(presearcherQuery, BooleanClause.Occur.MUST);
      Query filterQuery = buildFilterFields(reader);
      if (filterQuery != null) {
        bq.add(filterQuery, BooleanClause.Occur.FILTER);
        presearcherQuery = bq.build();
      }
    }
    return presearcherQuery;
  } catch (IOException e) {
    // We're a MemoryIndex, so this shouldn't happen...
    throw new RuntimeException(e);
  }
}
Example 10
Source File: FieldCacheImpl.java From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final PagedBytes bytes = new PagedBytes(15);
  int startTermsBPV;
  // TODO: use Uninvert?
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field
            + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      startTermsBPV = PackedInts.bitsRequired(numUniqueTerms);
    } else {
      startTermsBPV = 1;
    }
  } else {
    startTermsBPV = 1;
  }
  PackedLongValues.Builder termOrdToBytesOffset = PackedLongValues.monotonicBuilder(PackedInts.COMPACT);
  final GrowableWriter docToTermOrd = new GrowableWriter(startTermsBPV, maxDoc, acceptableOverheadRatio);
  int termOrd = 0;
  // TODO: use Uninvert?
  if (terms != null) {
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      if (termOrd >= maxDoc) {
        throw new IllegalStateException("Type mismatch: " + key.field
            + " was indexed with multiple values per document, use SORTED_SET instead");
      }
      termOrdToBytesOffset.add(bytes.copyUsingLengthPrefix(term));
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        // Store 1+ ord into packed bits
        docToTermOrd.set(docID, 1 + termOrd);
      }
      termOrd++;
    }
  }
  // maybe an int-only impl?
  return new SortedDocValuesImpl(bytes.freeze(true), termOrdToBytesOffset.build(),
      docToTermOrd.getMutable(), termOrd);
}
Example 11
Source File: FieldCacheImpl.java From lucene-solr with Apache License 2.0
@Override
protected Accountable createValue(LeafReader reader, CacheKey key) throws IOException {
  // TODO: would be nice to first check if DocTermsIndex
  // was already cached for this field and then return
  // that instead, to avoid insanity
  final int maxDoc = reader.maxDoc();
  Terms terms = reader.terms(key.field);
  final float acceptableOverheadRatio = ((Float) key.custom).floatValue();
  final int termCountHardLimit = maxDoc;
  // Holds the actual term data, expanded.
  final PagedBytes bytes = new PagedBytes(15);
  int startBPV;
  if (terms != null) {
    // Try for coarse estimate for number of bits; this
    // should be an underestimate most of the time, which
    // is fine -- GrowableWriter will reallocate as needed
    long numUniqueTerms = terms.size();
    if (numUniqueTerms != -1L) {
      if (numUniqueTerms > termCountHardLimit) {
        numUniqueTerms = termCountHardLimit;
      }
      startBPV = PackedInts.bitsRequired(numUniqueTerms * 4);
    } else {
      startBPV = 1;
    }
  } else {
    startBPV = 1;
  }
  final GrowableWriter docToOffset = new GrowableWriter(startBPV, maxDoc, acceptableOverheadRatio);
  // pointer==0 means not set
  bytes.copyUsingLengthPrefix(new BytesRef());
  if (terms != null) {
    int termCount = 0;
    final TermsEnum termsEnum = terms.iterator();
    PostingsEnum docs = null;
    while (true) {
      if (termCount++ == termCountHardLimit) {
        // app is misusing the API (there is more than
        // one term per doc); in this case we make best
        // effort to load what we can (see LUCENE-2142)
        break;
      }
      final BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      final long pointer = bytes.copyUsingLengthPrefix(term);
      docs = termsEnum.postings(docs, PostingsEnum.NONE);
      while (true) {
        final int docID = docs.nextDoc();
        if (docID == DocIdSetIterator.NO_MORE_DOCS) {
          break;
        }
        docToOffset.set(docID, pointer);
      }
    }
  }
  final PackedInts.Reader offsetReader = docToOffset.getMutable();
  Bits docsWithField = new Bits() {
    @Override
    public boolean get(int index) {
      return offsetReader.get(index) != 0;
    }
    @Override
    public int length() {
      return maxDoc;
    }
  };
  wrapper.setDocsWithField(reader, key.field, docsWithField, null);
  // maybe an int-only impl?
  return new BinaryDocValuesImpl(bytes.freeze(true), offsetReader, docsWithField);
}
Example 12
Source File: LukeRequestHandler.java From lucene-solr with Apache License 2.0
private static SimpleOrderedMap<Object> getIndexedFieldsInfo(SolrQueryRequest req) throws Exception {
  SolrIndexSearcher searcher = req.getSearcher();
  SolrParams params = req.getParams();

  Set<String> fields = null;
  String fl = params.get(CommonParams.FL);
  if (fl != null) {
    fields = new TreeSet<>(Arrays.asList(fl.split("[,\\s]+")));
  }

  LeafReader reader = searcher.getSlowAtomicReader();
  IndexSchema schema = searcher.getSchema();

  // Don't be tempted to put this in the loop below, the whole point here is to alphabetize the fields!
  Set<String> fieldNames = new TreeSet<>();
  for (FieldInfo fieldInfo : reader.getFieldInfos()) {
    fieldNames.add(fieldInfo.name);
  }

  // Walk the term enum and keep a priority queue for each map in our set
  SimpleOrderedMap<Object> finfo = new SimpleOrderedMap<>();

  for (String fieldName : fieldNames) {
    if (fields != null && !fields.contains(fieldName) && !fields.contains("*")) {
      continue; // we're not interested in this field. Still an issue here
    }

    SimpleOrderedMap<Object> fieldMap = new SimpleOrderedMap<>();

    SchemaField sfield = schema.getFieldOrNull(fieldName);
    FieldType ftype = (sfield == null) ? null : sfield.getType();

    fieldMap.add("type", (ftype == null) ? null : ftype.getTypeName());
    fieldMap.add("schema", getFieldFlags(sfield));
    if (sfield != null && schema.isDynamicField(sfield.getName())
        && schema.getDynamicPattern(sfield.getName()) != null) {
      fieldMap.add("dynamicBase", schema.getDynamicPattern(sfield.getName()));
    }
    Terms terms = reader.terms(fieldName);
    if (terms == null) {
      // Not indexed, so we need to report what we can (it made it through the fl param if specified)
      finfo.add(fieldName, fieldMap);
      continue;
    }

    if (sfield != null && sfield.indexed()) {
      if (params.getBool(INCLUDE_INDEX_FIELD_FLAGS, true)) {
        Document doc = getFirstLiveDoc(terms, reader);
        if (doc != null) {
          // Found a document with this field
          try {
            IndexableField fld = doc.getField(fieldName);
            if (fld != null) {
              fieldMap.add("index", getFieldFlags(fld));
            } else {
              // it is a non-stored field...
              fieldMap.add("index", "(unstored field)");
            }
          } catch (Exception ex) {
            log.warn("error reading field: {}", fieldName);
          }
        }
      }
      fieldMap.add("docs", terms.getDocCount());
    }
    if (fields != null && (fields.contains(fieldName) || fields.contains("*"))) {
      getDetailedFieldInfo(req, fieldName, fieldMap);
    }
    // Add the field
    finfo.add(fieldName, fieldMap);
  }
  return finfo;
}
Example 13
Source File: DocSetInfoCommand.java From clue with Apache License 2.0
@Override
public void execute(Namespace args, PrintStream out) throws Exception {
  String field = args.getString("field");
  String termVal = null;
  int bucketSize = args.getInt("size");
  if (field != null) {
    String[] parts = field.split(":");
    if (parts.length > 1) {
      field = parts[0];
      termVal = parts[1];
    }
  }

  IndexReader reader = ctx.getIndexReader();
  List<LeafReaderContext> leaves = reader.leaves();

  PostingsEnum postingsEnum = null;
  for (LeafReaderContext leaf : leaves) {
    LeafReader atomicReader = leaf.reader();
    Terms terms = atomicReader.terms(field);
    if (terms == null) {
      continue;
    }
    if (terms != null && termVal != null) {
      TermsEnum te = terms.iterator();
      if (te.seekExact(new BytesRef(termVal))) {
        postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);

        int docFreq = te.docFreq();
        int minDocId = -1, maxDocId = -1;
        int doc, count = 0;

        int[] percentDocs = new int[PERCENTILES.length];
        int percentileIdx = 0;

        while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
          maxDocId = doc;
          if (minDocId == -1) {
            minDocId = doc;
          }
          count++;

          double perDocs = (double) count / (double) docFreq * 100.0;
          while (percentileIdx < percentDocs.length) {
            if (perDocs > PERCENTILES[percentileIdx]) {
              percentDocs[percentileIdx] = doc;
              percentileIdx++;
            } else {
              break;
            }
          }
        }

        // calculate histogram
        int[] buckets = null;
        if (maxDocId > 0) {
          buckets = new int[maxDocId / bucketSize + 1];
          postingsEnum = te.postings(postingsEnum, PostingsEnum.FREQS);
          while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
            int bucketIdx = doc / bucketSize;
            buckets[bucketIdx]++;
          }
        }

        double density = (double) docFreq / (double) (maxDocId - minDocId);

        out.println(String.format("min: %d, max: %d, count: %d, density: %.2f",
            minDocId, maxDocId, docFreq, density));
        out.println("percentiles: " + Arrays.toString(PERCENTILES) + " => " + Arrays.toString(percentDocs));
        out.println("histogram: (bucketsize=" + bucketSize + ")");
        out.println(Arrays.toString(buckets));
      }
    }
  }
}