org.apache.lucene.index.TermsEnum Java Examples
The following examples show how to use
org.apache.lucene.index.TermsEnum.
Each example is drawn from an open source project; the source file and license are noted above the code.
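Most of the examples below follow the same core idiom: obtain a Terms instance for a field, walk its terms with a TermsEnum, and pull a PostingsEnum for terms of interest. Here is a minimal sketch of that idiom, not taken from any of the projects; the indexReader and the "body" field are hypothetical placeholders.

// Minimal sketch; indexReader and the "body" field are hypothetical placeholders.
// Uses org.apache.lucene.index.*, org.apache.lucene.util.BytesRef and org.apache.lucene.search.DocIdSetIterator.
Terms terms = MultiTerms.getTerms(indexReader, "body");
if (terms != null) {
  TermsEnum termsEnum = terms.iterator();
  BytesRef term;
  while ((term = termsEnum.next()) != null) {
    // the enum is positioned on a term, so per-term stats such as docFreq() are valid here
    System.out.println(term.utf8ToString() + " appears in " + termsEnum.docFreq() + " docs");
  }
  if (termsEnum.seekExact(new BytesRef("lucene"))) { // reposition the enum by exact term
    PostingsEnum postings = termsEnum.postings(null, PostingsEnum.FREQS);
    int doc;
    while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
      System.out.println("doc=" + doc + " freq=" + postings.freq());
    }
  }
}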
Example #1
Source File: TestCompressingTermVectorsFormat.java From lucene-solr with Apache License 2.0
public void testNoOrds() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter iw = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  FieldType ft = new FieldType(TextField.TYPE_NOT_STORED);
  ft.setStoreTermVectors(true);
  doc.add(new Field("foo", "this is a test", ft));
  iw.addDocument(doc);
  LeafReader ir = getOnlyLeafReader(iw.getReader());
  Terms terms = ir.getTermVector(0, "foo");
  assertNotNull(terms);
  TermsEnum termsEnum = terms.iterator();
  assertEquals(SeekStatus.FOUND, termsEnum.seekCeil(new BytesRef("this")));

  expectThrows(UnsupportedOperationException.class, termsEnum::ord);
  expectThrows(UnsupportedOperationException.class, () -> termsEnum.seekExact(0));

  ir.close();
  iw.close();
  dir.close();
}
Example #2
Source File: TestUtil.java From lucene-solr with Apache License 2.0
public static PostingsEnum docs(Random random, TermsEnum termsEnum, PostingsEnum reuse, int flags) throws IOException {
  // TODO: simplify this method? it would be easier to randomly either use the flags passed, or do the random selection,
  // FREQS should be part of the random selection instead of outside on its own?
  if (random.nextBoolean()) {
    if (random.nextBoolean()) {
      final int posFlags;
      switch (random.nextInt(4)) {
        case 0: posFlags = PostingsEnum.POSITIONS; break;
        case 1: posFlags = PostingsEnum.OFFSETS; break;
        case 2: posFlags = PostingsEnum.PAYLOADS; break;
        default: posFlags = PostingsEnum.ALL; break;
      }
      return termsEnum.postings(null, posFlags);
    }
    flags |= PostingsEnum.FREQS;
  }
  return termsEnum.postings(reuse, flags);
}
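The flags argument to postings() controls how much per-document data the returned enum exposes (FREQS, POSITIONS, OFFSETS, PAYLOADS, ALL). As a hedged illustration of how a caller consumes a position-enabled enum (termsEnum is assumed to already be positioned on a term):

// Sketch: consuming positions from a PostingsEnum; termsEnum is assumed positioned on a term.
PostingsEnum postings = termsEnum.postings(null, PostingsEnum.POSITIONS);
int doc;
while ((doc = postings.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
  int freq = postings.freq();
  for (int i = 0; i < freq; i++) {
    int position = postings.nextPosition(); // position of the i-th occurrence within the doc
    System.out.println("doc=" + doc + " pos=" + position);
  }
}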
Example #3
Source File: DfsOnlyRequest.java From Elasticsearch with Apache License 2.0
public DfsOnlyRequest(Fields termVectorsFields, String[] indices, String[] types, Set<String> selectedFields) throws IOException {
  super(indices);

  // build a search request with a query of all the terms
  final BoolQueryBuilder boolBuilder = boolQuery();
  for (String fieldName : termVectorsFields) {
    if ((selectedFields != null) && (!selectedFields.contains(fieldName))) {
      continue;
    }
    Terms terms = termVectorsFields.terms(fieldName);
    TermsEnum iterator = terms.iterator();
    while (iterator.next() != null) {
      String text = iterator.term().utf8ToString();
      boolBuilder.should(QueryBuilders.termQuery(fieldName, text));
    }
  }
  // wrap a search request object
  this.searchRequest = new SearchRequest(indices).types(types).source(new SearchSourceBuilder().query(boolBuilder));
}
Example #4
Source File: BlockTermsWriter.java From lucene-solr with Apache License 2.0
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }
    TermsEnum termsEnum = terms.iterator();
    TermsWriter termsWriter = addField(fieldInfos.fieldInfo(field));
    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      termsWriter.write(term, termsEnum, norms);
    }
    termsWriter.finish();
  }
}
Example #5
Source File: SimpleNaiveBayesClassifier.java From lucene-solr with Apache License 2.0
/**
 * Calculate probabilities for all classes for a given input text
 * @param inputDocument the input text as a {@code String}
 * @return a {@code List} of {@code ClassificationResult}, one for each existing class
 * @throws IOException if assigning probabilities fails
 */
protected List<ClassificationResult<BytesRef>> assignClassNormalizedList(String inputDocument) throws IOException {
  List<ClassificationResult<BytesRef>> assignedClasses = new ArrayList<>();

  Terms classes = MultiTerms.getTerms(indexReader, classFieldName);
  if (classes != null) {
    TermsEnum classesEnum = classes.iterator();
    BytesRef next;
    String[] tokenizedText = tokenize(inputDocument);
    int docsWithClassSize = countDocsWithClass();
    while ((next = classesEnum.next()) != null) {
      if (next.length > 0) {
        Term term = new Term(this.classFieldName, next);
        double clVal = calculateLogPrior(term, docsWithClassSize)
            + calculateLogLikelihood(tokenizedText, term, docsWithClassSize);
        assignedClasses.add(new ClassificationResult<>(term.bytes(), clVal));
      }
    }
  }

  // normalization; the values transform to a 0-1 range
  return normClassificationResults(assignedClasses);
}
Example #6
Source File: ReconstructCommand.java From clue with Apache License 2.0
public String reconstructNoPositions(TermsEnum te, int docid, Bits liveDocs) throws IOException {
  List<String> textList = new ArrayList<String>();
  BytesRef text;
  PostingsEnum postings = null;
  while ((text = te.next()) != null) {
    postings = te.postings(postings, PostingsEnum.FREQS);
    int iterDoc = postings.advance(docid);
    if (iterDoc == docid) {
      textList.add(text.utf8ToString());
    }
  }
  StringBuilder buf = new StringBuilder();
  for (String s : textList) {
    buf.append(s + " ");
  }
  return buf.toString();
}
Example #7
Source File: TestRTGBase.java From lucene-solr with Apache License 2.0
protected int getFirstMatch(IndexReader r, Term t) throws IOException {
  Terms terms = MultiTerms.getTerms(r, t.field());
  if (terms == null) return -1;
  BytesRef termBytes = t.bytes();
  final TermsEnum termsEnum = terms.iterator();
  if (!termsEnum.seekExact(termBytes)) {
    return -1;
  }
  PostingsEnum docs = termsEnum.postings(null, PostingsEnum.NONE);
  docs = BitsFilteredPostingsEnum.wrap(docs, MultiBits.getLiveDocs(r));
  int id = docs.nextDoc();
  if (id != DocIdSetIterator.NO_MORE_DOCS) {
    int next = docs.nextDoc();
    assertEquals(DocIdSetIterator.NO_MORE_DOCS, next);
  }
  return id == DocIdSetIterator.NO_MORE_DOCS ? -1 : id;
}
Example #8
Source File: ShardSplittingQuery.java From crate with Apache License 2.0
private static void findSplitDocs(String idField, Predicate<BytesRef> includeInShard,
                                  LeafReader leafReader, IntConsumer consumer) throws IOException {
  Terms terms = leafReader.terms(idField);
  TermsEnum iterator = terms.iterator();
  BytesRef idTerm;
  PostingsEnum postingsEnum = null;
  while ((idTerm = iterator.next()) != null) {
    if (includeInShard.test(idTerm) == false) {
      postingsEnum = iterator.postings(postingsEnum);
      int doc;
      while ((doc = postingsEnum.nextDoc()) != DocIdSetIterator.NO_MORE_DOCS) {
        consumer.accept(doc);
      }
    }
  }
}
Example #9
Source File: ReadmeSimilarityCalculator.java From scava with Eclipse Public License 2.0
private DocVector[] getDocumentVectors() throws IOException {
  DocVector[] docVector = new DocVector[getTotalDocumentInIndex()];
  for (int docId = 0; docId < getTotalDocumentInIndex(); docId++) {
    Terms vector = getIndexReader().getTermVector(docId, FIELD_CONTENT);
    TermsEnum termsEnum = vector.iterator();
    BytesRef text;
    docVector[docId] = new DocVector(getAllTerms());
    while ((text = termsEnum.next()) != null) {
      String term = text.utf8ToString();
      int freq = (int) termsEnum.totalTermFreq();
      docVector[docId].setEntry(term, freq);
    }
    docVector[docId].normalize();
  }
  getIndexReader().close();
  return docVector;
}
Example #10
Source File: TermVectorsAdapter.java From lucene-solr with Apache License 2.0
/**
 * Returns the term vectors for the specified field in the specified document.
 * If no term vector is available for the field, an empty list is returned.
 *
 * @param docid - document id
 * @param field - field name
 * @return list of term vector elements
 * @throws IOException - if there is a low level IO error.
 */
List<TermVectorEntry> getTermVector(int docid, String field) throws IOException {
  Terms termVector = reader.getTermVector(docid, field);
  if (termVector == null) {
    // no term vector available
    log.warn("No term vector indexed for doc: #{} and field: {}", docid, field);
    return Collections.emptyList();
  }

  List<TermVectorEntry> res = new ArrayList<>();
  TermsEnum te = termVector.iterator();
  while (te.next() != null) {
    res.add(TermVectorEntry.of(te));
  }
  return res;
}
Example #11
Source File: IndexImporter.java From incubator-retired-blur with Apache License 2.0
private void runOldMergeSortRowIdCheckAndDelete(boolean emitDeletes, IndexReader currentIndexReader,
    BlurPartitioner blurPartitioner, Text key, int numberOfShards, int shardId, Action action,
    AtomicReader atomicReader) throws IOException {
  MergeSortRowIdLookup lookup = new MergeSortRowIdLookup(currentIndexReader);
  Fields fields = atomicReader.fields();
  Terms terms = fields.terms(BlurConstants.ROW_ID);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator(null);
    BytesRef ref = null;
    while ((ref = termsEnum.next()) != null) {
      key.set(ref.bytes, ref.offset, ref.length);
      int partition = blurPartitioner.getPartition(key, null, numberOfShards);
      if (shardId != partition) {
        throw new IOException("Index is corrupted, RowIds are found in wrong shard, partition [" + partition
            + "] does not match shard [" + shardId + "], this can happen when rows are not hashed correctly.");
      }
      if (emitDeletes) {
        lookup.lookup(ref, action);
      }
    }
  }
}
Example #12
Source File: LindenFieldCacheImpl.java From linden with Apache License 2.0
@Override
protected Accountable createValue(final AtomicReader reader, CacheKey key, boolean setDocsWithField)
    throws IOException {
  final Map<String, Integer> uidMap = new HashMap<>();

  Uninvert u = new Uninvert() {
    private String currentValue;

    @Override
    public void visitTerm(BytesRef term) {
      currentValue = term.utf8ToString();
    }

    @Override
    public void visitDoc(int docID) {
      uidMap.put(currentValue, docID);
    }

    @Override
    protected TermsEnum termsEnum(Terms terms) throws IOException {
      return terms.iterator(null);
    }
  };
  u.uninvert(reader, key.field, setDocsWithField);
  return new PerReaderUIDMaps(reader.getContext().ord, uidMap);
}
Example #13
Source File: SolrRangeQuery.java From lucene-solr with Apache License 2.0
public RangeTermsEnum(Terms terms) throws IOException {
  if (terms == null) {
    positioned = true;
  } else {
    te = terms.iterator();
    if (lower != null) {
      TermsEnum.SeekStatus status = te.seekCeil(lower);
      if (status == TermsEnum.SeekStatus.END) {
        positioned = true;
        curr = null;
      } else if (status == SeekStatus.FOUND) {
        positioned = includeLower();
        curr = te.term();
      } else {
        // lower bound not found, so includeLower is irrelevant
        positioned = true;
        curr = te.term();
      }
    }
  }
}
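Example #13 leans on the three-way result of seekCeil(). A short sketch of that contract, with a hypothetical termsEnum and lowerBound:

// Sketch of the TermsEnum.seekCeil contract; termsEnum and lowerBound are hypothetical.
TermsEnum.SeekStatus status = termsEnum.seekCeil(lowerBound);
if (status == TermsEnum.SeekStatus.END) {
  // no term is >= lowerBound; the enum is exhausted and term() must not be called
} else if (status == TermsEnum.SeekStatus.FOUND) {
  BytesRef exact = termsEnum.term();   // positioned exactly on lowerBound
} else { // NOT_FOUND
  BytesRef ceiling = termsEnum.term(); // positioned on the smallest term greater than lowerBound
}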
Example #14
Source File: HashTermStatistics.java From liresolr with GNU General Public License v2.0
public static void addToStatistics(SolrIndexSearcher searcher, String field) throws IOException {
  // check if this field is already in the stats.
  // synchronized (instance) {
  if (termstats.get(field) != null) return;
  // }
  // else add it to the stats.
  Terms terms = searcher.getSlowAtomicReader().terms(field);
  HashMap<String, Integer> term2docFreq = new HashMap<String, Integer>(1000);
  termstats.put(field, term2docFreq);
  if (terms != null) {
    TermsEnum termsEnum = terms.iterator();
    BytesRef term;
    while ((term = termsEnum.next()) != null) {
      term2docFreq.put(term.utf8ToString(), termsEnum.docFreq());
    }
  }
}
Example #15
Source File: UniformSplitTermsWriter.java From lucene-solr with Apache License 2.0
@Override
public void write(Fields fields, NormsProducer normsProducer) throws IOException {
  BlockWriter blockWriter = new BlockWriter(blockOutput, targetNumBlockLines, deltaNumLines, blockEncoder);
  ByteBuffersDataOutput fieldsOutput = new ByteBuffersDataOutput();
  int fieldsNumber = 0;
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms != null) {
      TermsEnum termsEnum = terms.iterator();
      FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
      fieldsNumber += writeFieldTerms(blockWriter, fieldsOutput, termsEnum, fieldInfo, normsProducer);
    }
  }
  writeFieldsMetadata(fieldsNumber, fieldsOutput);
  CodecUtil.writeFooter(dictionaryOutput);
}
Example #16
Source File: DisjunctionMatchesIterator.java From lucene-solr with Apache License 2.0
/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}
 *
 * Only terms that have at least one match in the given document will be included
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query,
                                     String field, BytesRefIterator terms) throws IOException {
  Objects.requireNonNull(field);
  Terms t = context.reader().terms(field);
  if (t == null)
    return null;
  TermsEnum te = t.iterator();
  PostingsEnum reuse = null;
  for (BytesRef term = terms.next(); term != null; term = terms.next()) {
    if (te.seekExact(term)) {
      PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
      if (pe.advance(doc) == doc) {
        return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
      } else {
        reuse = pe;
      }
    }
  }
  return null;
}
Example #17
Source File: DocToDoubleVectorUtils.java From lucene-solr with Apache License 2.0
/**
 * create a sparse <code>Double</code> vector given doc and field term vectors using
 * local frequency of the terms in the doc
 *
 * @param docTerms   term vectors for a given document
 * @param fieldTerms field term vectors
 * @return a sparse vector of <code>Double</code>s as an array
 * @throws IOException in case accessing the underlying index fails
 */
public static Double[] toSparseLocalFreqDoubleArray(Terms docTerms, Terms fieldTerms) throws IOException {
  TermsEnum fieldTermsEnum = fieldTerms.iterator();
  Double[] freqVector = null;
  if (docTerms != null && fieldTerms.size() > -1) {
    freqVector = new Double[(int) fieldTerms.size()];
    int i = 0;
    TermsEnum docTermsEnum = docTerms.iterator();
    BytesRef term;
    while ((term = fieldTermsEnum.next()) != null) {
      TermsEnum.SeekStatus seekStatus = docTermsEnum.seekCeil(term);
      if (seekStatus.equals(TermsEnum.SeekStatus.END)) {
        docTermsEnum = docTerms.iterator();
      }
      if (seekStatus.equals(TermsEnum.SeekStatus.FOUND)) {
        long termFreqLocal = docTermsEnum.totalTermFreq(); // the total number of occurrences of this term in the given document
        freqVector[i] = Long.valueOf(termFreqLocal).doubleValue();
      } else {
        freqVector[i] = 0d;
      }
      i++;
    }
  }
  return freqVector;
}
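A hedged usage sketch for Example #17, assuming the pattern used in Lucene's own tests: docTerms comes from a per-document term vector and fieldTerms from the field's full term dictionary. The indexReader, docId, and "text" field are placeholders.

// Hedged usage sketch; indexReader, docId and the "text" field are placeholders.
Terms fieldTerms = MultiTerms.getTerms(indexReader, "text");  // vocabulary of the whole field
Terms docTerms = indexReader.getTermVector(docId, "text");    // term vector of one document
Double[] vector = DocToDoubleVectorUtils.toSparseLocalFreqDoubleArray(docTerms, fieldTerms);
// vector[i] holds the in-document frequency of the i-th field term, or 0 if the doc lacks it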
Example #18
Source File: TermGroupFacetCollector.java From lucene-solr with Apache License 2.0
SegmentResult(int[] counts, int total, TermsEnum tenum, int startFacetOrd, int endFacetOrd) throws IOException {
  super(counts, total - counts[0], counts[0], endFacetOrd + 1);
  this.tenum = tenum;
  this.mergePos = startFacetOrd == -1 ? 1 : startFacetOrd + 1;
  if (mergePos < maxTermPos) {
    assert tenum != null;
    tenum.seekExact(startFacetOrd == -1 ? 0 : startFacetOrd);
    mergeTerm = tenum.term();
  }
}
Example #19
Source File: CompressingTermVectorsReader.java From lucene-solr with Apache License 2.0
@Override
public TermsEnum iterator() throws IOException {
  TVTermsEnum termsEnum = new TVTermsEnum();
  termsEnum.reset(numTerms, flags, prefixLengths, suffixLengths, termFreqs, positionIndex, positions,
      startOffsets, lengths, payloadIndex, payloadBytes,
      new ByteArrayDataInput(termBytes.bytes, termBytes.offset, termBytes.length));
  return termsEnum;
}
Example #20
Source File: SeekingTermSetTermsEnum.java From lucene-solr with Apache License 2.0
/**
 * Constructor
 */
public SeekingTermSetTermsEnum(TermsEnum tenum, BytesRefHash terms, int[] ords) {
  super(tenum);
  this.terms = terms;
  this.ords = ords;
  lastElement = terms.size() - 1;
  lastTerm = terms.get(ords[lastElement], new BytesRef());
  seekTerm = terms.get(ords[upto], spare);
}
Example #21
Source File: FSTTermsWriter.java From lucene-solr with Apache License 2.0
@Override
public void write(Fields fields, NormsProducer norms) throws IOException {
  for (String field : fields) {
    Terms terms = fields.terms(field);
    if (terms == null) {
      continue;
    }
    FieldInfo fieldInfo = fieldInfos.fieldInfo(field);
    boolean hasFreq = fieldInfo.getIndexOptions().compareTo(IndexOptions.DOCS_AND_FREQS) >= 0;
    TermsEnum termsEnum = terms.iterator();
    TermsWriter termsWriter = new TermsWriter(fieldInfo);

    long sumTotalTermFreq = 0;
    long sumDocFreq = 0;
    FixedBitSet docsSeen = new FixedBitSet(maxDoc);

    while (true) {
      BytesRef term = termsEnum.next();
      if (term == null) {
        break;
      }
      BlockTermState termState = postingsWriter.writeTerm(term, termsEnum, docsSeen, norms);
      if (termState != null) {
        termsWriter.finishTerm(term, termState);
        sumTotalTermFreq += termState.totalTermFreq;
        sumDocFreq += termState.docFreq;
      }
    }

    termsWriter.finish(hasFreq ? sumTotalTermFreq : -1, sumDocFreq, docsSeen.cardinality());
  }
}
Example #22
Source File: FieldReader.java From lucene-solr with Apache License 2.0
@Override
public TermsEnum intersect(CompiledAutomaton compiled, BytesRef startTerm) throws IOException {
  // if (DEBUG) System.out.println("  FieldReader.intersect startTerm=" + BlockTreeTermsWriter.brToString(startTerm));
  // System.out.println("intersect: " + compiled.type + " a=" + compiled.automaton);
  // TODO: we could push "it's a range" or "it's a prefix" down into IntersectTermsEnum?
  // can we optimize knowing that...?
  if (compiled.type != CompiledAutomaton.AUTOMATON_TYPE.NORMAL) {
    throw new IllegalArgumentException("please use CompiledAutomaton.getTermsEnum instead");
  }
  return new IntersectTermsEnum(this, compiled.automaton, compiled.runAutomaton, compiled.commonSuffixRef, startTerm);
}
Example #23
Source File: DisjunctionMatchesIterator.java From lucene-solr with Apache License 2.0
TermsEnumDisjunctionMatchesIterator(MatchesIterator first, BytesRefIterator terms, TermsEnum te, int doc, Query query) {
  this.first = first;
  this.terms = terms;
  this.te = te;
  this.doc = doc;
  this.query = query;
}
Example #24
Source File: QueryAutoFilteringComponent.java From query-autofiltering-component with Apache License 2.0
private void buildFieldMap(ResponseBuilder rb) throws IOException {
  Log.debug("buildFieldMap");
  SolrIndexSearcher searcher = rb.req.getSearcher();
  // build a synonym map from the SortedDocValues -
  // for each field value: lower case, stemmed, lookup synonyms from synonyms.txt - map to fieldValue
  SynonymMap.Builder fieldBuilder = new SynonymMap.Builder(true);
  SynonymMap.Builder termBuilder = new SynonymMap.Builder(true);

  ArrayList<String> searchFields = getStringFields(searcher);

  for (String searchField : searchFields) {
    Log.debug("adding searchField " + searchField);
    CharsRef fieldChars = new CharsRef(searchField);
    SortedSetDocValues sdv = FieldCache.DEFAULT.getDocTermOrds(searcher.getAtomicReader(), searchField);
    if (sdv == null) continue;
    Log.debug("got SortedSetDocValues for " + searchField);
    TermsEnum te = sdv.termsEnum();
    while (te.next() != null) {
      BytesRef term = te.term();
      String fieldValue = term.utf8ToString();
      addTerm(fieldChars, fieldValue, fieldBuilder, termBuilder);
    }
  }

  addDistributedTerms(rb, fieldBuilder, termBuilder, searchFields);

  fieldMap = fieldBuilder.build();
  termMap = termBuilder.build();
}
Example #25
Source File: JoinDocFreqValueSource.java From lucene-solr with Apache License 2.0
@Override
public FunctionValues getValues(Map<Object, Object> context, LeafReaderContext readerContext) throws IOException {
  final BinaryDocValues terms = DocValues.getBinary(readerContext.reader(), field);
  final IndexReader top = ReaderUtil.getTopLevelContext(readerContext).reader();
  Terms t = MultiTerms.getTerms(top, qfield);
  final TermsEnum termsEnum = t == null ? TermsEnum.EMPTY : t.iterator();

  return new IntDocValues(this) {
    int lastDocID = -1;

    @Override
    public int intVal(int doc) throws IOException {
      if (doc < lastDocID) {
        throw new IllegalArgumentException("docs were sent out-of-order: lastDocID=" + lastDocID + " vs docID=" + doc);
      }
      lastDocID = doc;
      int curDocID = terms.docID();
      if (doc > curDocID) {
        curDocID = terms.advance(doc);
      }
      if (doc == curDocID) {
        BytesRef term = terms.binaryValue();
        if (termsEnum.seekExact(term)) {
          return termsEnum.docFreq();
        }
      }
      return 0;
    }
  };
}
Example #26
Source File: TermPrefixCursor.java From SolrTextTagger with Apache License 2.0
/**
 * Seeks to prefixBuf or the next term that is prefixed by prefixBuf plus the separator char.
 * Sets docIds.
 */
private boolean seekPrefix() throws IOException {
  TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefixBuf);

  docIds = null; // invalidate
  switch (seekStatus) {
    case END:
      return false;

    case FOUND:
      postingsEnum = termsEnum.postings(postingsEnum, PostingsEnum.NONE);
      docIds = postingsEnumToIntsRef(postingsEnum, liveDocs);
      if (docIds.length > 0) {
        return true;
      }

      // Pretend we didn't find it; go to next term
      docIds = null;
      if (termsEnum.next() == null) { // case END
        return false;
      }
      // fall through to NOT_FOUND

    case NOT_FOUND:
      // termsEnum must start with prefixBuf to continue
      BytesRef teTerm = termsEnum.term();
      if (teTerm.length > prefixBuf.length) {
        for (int i = 0; i < prefixBuf.length; i++) {
          if (prefixBuf.bytes[prefixBuf.offset + i] != teTerm.bytes[teTerm.offset + i])
            return false;
        }
        if (teTerm.bytes[teTerm.offset + prefixBuf.length] != SEPARATOR_CHAR)
          return false;
        return true;
      }
      return false;
  }
  throw new IllegalStateException(seekStatus.toString());
}
Example #27
Source File: TestOrdsBlockTree.java From lucene-solr with Apache License 2.0
public void testBasic() throws Exception {
  Directory dir = newDirectory();
  RandomIndexWriter w = new RandomIndexWriter(random(), dir);
  Document doc = new Document();
  doc.add(newTextField("field", "a b c", Field.Store.NO));
  w.addDocument(doc);
  IndexReader r = w.getReader();

  TermsEnum te = MultiTerms.getTerms(r, "field").iterator();

  // Test next()
  assertEquals(new BytesRef("a"), te.next());
  assertEquals(0L, te.ord());
  assertEquals(new BytesRef("b"), te.next());
  assertEquals(1L, te.ord());
  assertEquals(new BytesRef("c"), te.next());
  assertEquals(2L, te.ord());
  assertNull(te.next());

  // Test seekExact by term
  assertTrue(te.seekExact(new BytesRef("b")));
  assertEquals(1, te.ord());
  assertTrue(te.seekExact(new BytesRef("a")));
  assertEquals(0, te.ord());
  assertTrue(te.seekExact(new BytesRef("c")));
  assertEquals(2, te.ord());

  // Test seekExact by ord
  te.seekExact(1);
  assertEquals(new BytesRef("b"), te.term());
  te.seekExact(0);
  assertEquals(new BytesRef("a"), te.term());
  te.seekExact(2);
  assertEquals(new BytesRef("c"), te.term());

  r.close();
  w.close();
  dir.close();
}
Example #28
Source File: BlockTreeTermsWriter.java From lucene-solr with Apache License 2.0
/** Writes one term's worth of postings. */
public void write(BytesRef text, TermsEnum termsEnum, NormsProducer norms) throws IOException {
  /*
  if (DEBUG) {
    int[] tmp = new int[lastTerm.length];
    System.arraycopy(prefixStarts, 0, tmp, 0, tmp.length);
    System.out.println("BTTW: write term=" + brToString(text) + " prefixStarts=" + Arrays.toString(tmp) + " pending.size()=" + pending.size());
  }
  */

  BlockTermState state = postingsWriter.writeTerm(text, termsEnum, docsSeen, norms);
  if (state != null) {

    assert state.docFreq != 0;
    assert fieldInfo.getIndexOptions() == IndexOptions.DOCS || state.totalTermFreq >= state.docFreq
        : "postingsWriter=" + postingsWriter;
    pushTerm(text);

    PendingTerm term = new PendingTerm(text, state);
    pending.add(term);
    // if (DEBUG) System.out.println("    add pending term = " + text + " pending.size()=" + pending.size());

    sumDocFreq += state.docFreq;
    sumTotalTermFreq += state.totalTermFreq;
    numTerms++;
    if (firstPendingTerm == null) {
      firstPendingTerm = term;
    }
    lastPendingTerm = term;
  }
}
Example #29
Source File: SimpleTextTermVectorsReader.java From lucene-solr with Apache License 2.0
@Override
public long getSumTotalTermFreq() throws IOException {
  // TODO: make it constant-time
  long ttf = 0;
  TermsEnum iterator = iterator();
  for (BytesRef b = iterator.next(); b != null; b = iterator.next()) {
    ttf += iterator.totalTermFreq();
  }
  return ttf;
}
Example #30
Source File: MultiPhrasePrefixQuery.java From crate with Apache License 2.0
private void getPrefixTerms(ObjectHashSet<Term> terms, final Term prefix, final IndexReader reader) throws IOException {
  // SlowCompositeReaderWrapper could be used... but this would merge all terms from each segment into one terms
  // instance, which is very expensive. Therefore I think it is better to iterate over each leaf individually.
  List<LeafReaderContext> leaves = reader.leaves();
  for (LeafReaderContext leaf : leaves) {
    Terms _terms = leaf.reader().terms(field);
    if (_terms == null) {
      continue;
    }

    TermsEnum termsEnum = _terms.iterator();
    TermsEnum.SeekStatus seekStatus = termsEnum.seekCeil(prefix.bytes());
    if (TermsEnum.SeekStatus.END == seekStatus) {
      continue;
    }

    for (BytesRef term = termsEnum.term(); term != null; term = termsEnum.next()) {
      if (!StringHelper.startsWith(term, prefix.bytes())) {
        break;
      }

      terms.add(new Term(field, BytesRef.deepCopyOf(term)));
      if (terms.size() >= maxExpansions) {
        return;
      }
    }
  }
}