org.apache.lucene.util.BytesRefIterator Java Examples

Source File: BytesReference.java From crate with Apache License 2.0

6 votes

/**
 * Returns a hash derived from every byte exposed by {@link #iterator()}.
 * The value is stored in {@code hash} after the first computation and reused afterwards.
 */
@Override
public int hashCode() {
    if (hash != null) {
        return hash.intValue();
    }
    int computed = 1;
    final BytesRefIterator chunks = iterator();
    try {
        for (BytesRef chunk = chunks.next(); chunk != null; chunk = chunks.next()) {
            final int end = chunk.offset + chunk.length;
            for (int pos = chunk.offset; pos < end; pos++) {
                computed = 31 * computed + chunk.bytes[pos];
            }
        }
    } catch (IOException ex) {
        throw new AssertionError("wont happen", ex);
    }
    // cache the result so later calls take the fast path above
    return hash = computed;
}

Source File: FSTCompletionBuilder.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Builds the final automaton from the entries supplied by {@code sorter}.
 * An entry that compares equal to the most recently added one is not added again.
 *
 * @param sorter source of the entries; its iterator is consumed completely
 * @return the compiled automaton, or {@code null} if the sorter yielded no entries
 * @throws IOException if iterating the sorter or compiling the automaton fails
 */
private FST<Object> buildAutomaton(BytesRefSorter sorter) throws IOException {
  final Outputs<Object> outputs = NoOutputs.getSingleton();
  final Object noOutput = outputs.getNoOutput();
  final FSTCompiler<Object> fstCompiler =
      new FSTCompiler.Builder<>(FST.INPUT_TYPE.BYTE1, outputs)
          .shareMaxTailLength(shareMaxTailLength)
          .build();

  // Holds a copy of the last entry handed to the compiler, used for duplicate detection.
  final BytesRefBuilder previous = new BytesRefBuilder();
  final IntsRefBuilder intsScratch = new IntsRefBuilder();
  final BytesRefIterator entries = sorter.iterator();
  int seen = 0;
  for (BytesRef entry = entries.next(); entry != null; entry = entries.next()) {
    seen++;
    if (previous.get().compareTo(entry) == 0) {
      continue;
    }
    fstCompiler.add(Util.toIntsRef(entry, intsScratch), noOutput);
    previous.copyBytes(entry);
  }

  if (seen == 0) {
    return null;
  }
  return fstCompiler.compile();
}

Source File: ExternalRefSorter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Returns an iterator that reads entries back from the sorted file.
 * On the first call (while {@code sortedFileName} is still {@code null}) the writer is closed,
 * the input file is sorted, and the unsorted input file is deleted.
 *
 * @throws IOException if sorting, deleting or opening a file fails
 */
@Override
public BytesRefIterator iterator() throws IOException {
  if (sortedFileName == null) {
    closeWriter();

    boolean sortSucceeded = false;
    try {
      sortedFileName = sorter.sort(input.getName());
      sortSucceeded = true;
    } finally {
      if (!sortSucceeded) {
        // best effort cleanup; must not mask the exception that is already propagating
        IOUtils.deleteFilesIgnoringExceptions(sorter.getDirectory(), input.getName());
      } else {
        sorter.getDirectory().deleteFile(input.getName());
      }
    }

    input = null;
  }

  final OfflineSorter.ByteSequencesReader reader = new OfflineSorter.ByteSequencesReader(
      sorter.getDirectory().openChecksumInput(sortedFileName, IOContext.READONCE), sortedFileName);
  return new ByteSequenceIterator(reader);
}

Source File: Netty4Utils.java From crate with Apache License 2.0

6 votes

/**
 * Converts the given BytesReference into a ByteBuf without copying. Note: the returned ByteBuf
 * references the internal pages of the BytesReference, so the bytes of the reference must not be
 * freed before the ByteBuf goes out of scope.
 *
 * @param reference the bytes to expose as a ByteBuf
 * @return {@code Unpooled.EMPTY_BUFFER} for an empty reference, the backing buffer of a
 *         {@code ByteBufBytesReference}, or otherwise a composite buffer wrapping each slice
 */
public static ByteBuf toByteBuf(final BytesReference reference) {
    if (reference.length() == 0) {
        return Unpooled.EMPTY_BUFFER;
    }
    if (reference instanceof ByteBufBytesReference) {
        return ((ByteBufBytesReference) reference).toByteBuf();
    }
    final BytesRefIterator pages = reference.iterator();
    // usually we have one, two, or three components from the header, the message, and a buffer
    final List<ByteBuf> wrapped = new ArrayList<>(3);
    try {
        for (BytesRef page = pages.next(); page != null; page = pages.next()) {
            wrapped.add(Unpooled.wrappedBuffer(page.bytes, page.offset, page.length));
        }
        final CompositeByteBuf result = Unpooled.compositeBuffer(wrapped.size());
        result.addComponents(true, wrapped);
        return result;
    } catch (IOException ex) {
        throw new AssertionError("no IO happens here", ex);
    }
}

Source File: DisjunctionMatchesIterator.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Create a {@link DisjunctionMatchesIterator} over a list of terms extracted from a {@link BytesRefIterator}
 *
 * Only terms that have at least one match in the given document will be included
 *
 * @return an iterator starting at the first term that matches {@code doc}, or {@code null} if the
 *         field has no terms or none of the supplied terms matches the document
 */
static MatchesIterator fromTermsEnum(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  Objects.requireNonNull(field);
  final Terms fieldTerms = context.reader().terms(field);
  if (fieldTerms == null) {
    return null;
  }
  final TermsEnum te = fieldTerms.iterator();
  PostingsEnum reuse = null;
  BytesRef term;
  while ((term = terms.next()) != null) {
    if (!te.seekExact(term)) {
      continue;
    }
    final PostingsEnum pe = te.postings(reuse, PostingsEnum.OFFSETS);
    if (pe.advance(doc) == doc) {
      // first matching term found; the remaining terms are handed over un-consumed
      return new TermsEnumDisjunctionMatchesIterator(new TermMatchesIterator(query, pe), terms, te, doc, query);
    }
    reuse = pe;
  }
  return null;
}

Source File: BytesRefSortersTest.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Adds 100 random byte sequences to {@code sorter}, then verifies that two iterators obtained
 * from it return equal entries step by step and that adding after iteration started is rejected.
 */
private void check(BytesRefSorter sorter) throws Exception {
  for (int added = 0; added < 100; added++) {
    final byte[] payload = new byte[random().nextInt(256)];
    random().nextBytes(payload);
    sorter.add(new BytesRef(payload));
  }

  // Two independent iterators over the same sorter must stay aligned with each other.
  final BytesRefIterator first = sorter.iterator();
  final BytesRefIterator second = sorter.iterator();

  // Sorter contract: no additions once an iterator has been requested.
  expectThrows(IllegalStateException.class, () -> sorter.add(new BytesRef(new byte[1])));

  BytesRef fromFirst;
  do {
    fromFirst = first.next();
    final BytesRef fromSecond = second.next();
    assertEquals(fromFirst, fromSecond);
  } while (fromFirst != null);
}

Source File: OrdinalsBuilder.java From Elasticsearch with Apache License 2.0

6 votes

/**
 * Iterates all terms of the given {@link TermsEnum} and associates each
 * term's ordinal with the documents of that term. The caller must exhaust
 * the returned {@link BytesRefIterator}; it returns every term, and the
 * first returned value is associated with the ordinal {@code 1}, and so on.
 * <p>
 * If the {@link TermsEnum} contains prefix coded numerical values the terms
 * enum should be wrapped with either {@link #wrapNumeric32Bit(TermsEnum)}
 * or {@link #wrapNumeric64Bit(TermsEnum)} depending on its precision. If
 * the {@link TermsEnum} is not wrapped the returned
 * {@link BytesRefIterator} will contain partial precision terms rather than
 * only full-precision terms.
 * </p>
 */
public BytesRefIterator buildFromTerms(final TermsEnum termsEnum) throws IOException {
    return new BytesRefIterator() {
        private PostingsEnum postings = null;

        @Override
        public BytesRef next() throws IOException {
            final BytesRef term = termsEnum.next();
            if (term == null) {
                return null;
            }
            postings = termsEnum.postings(postings, PostingsEnum.NONE);
            // start a new ordinal for this term, then record each of its documents
            nextOrdinal();
            for (int docId = postings.nextDoc(); docId != DocIdSetIterator.NO_MORE_DOCS; docId = postings.nextDoc()) {
                addDoc(docId);
            }
            return term;
        }
    };
}

Source File: TestTermsEnumTokenFilter.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Drains a {@link TermsEnumTokenStream} wrapped in a {@link LeapfrogTokenFilter} over a
 * synthetic term source. There are no explicit assertions: the test passes when the
 * token stream can be consumed to the end without an exception.
 */
public void testPosIncAttributeOverflow() throws IOException {

    final BytesRef foo = new BytesRef("foo");
    final BytesRef bar = new BytesRef("bar");

    // Synthetic term source: returns foo while the counter is above 100, then bar, then null.
    BytesRefIterator terms = new BytesRefIterator() {

      long count = 1000;

      @Override
      public BytesRef next() throws IOException {
        // Both checks post-decrement the counter, so once the first check fails the
        // counter drops by two per call.
        if (count-- > 100)
          return foo;
        if (count-- > 0)
          return bar;
        return null;
      }
    };

    try (TokenStream ts = new LeapfrogTokenFilter(new TermsEnumTokenStream(terms))) {
      while (ts.incrementToken()) {
        // This tight loop will throw an exception if clearAttributes() is not called
        // by TermsEnumTokenStream.  See issue #46
      }
    }
  }

Source File: TestHighFrequencyDictionary.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Verifies that a {@link HighFrequencyDictionary} built over an empty index for a
 * field that does not exist yields no entries.
 *
 * The directory, writer and reader are managed with try-with-resources so that all of
 * them are closed, in reverse order of creation, even when the assertion fails.
 */
public void testEmpty() throws Exception {
  try (Directory dir = newDirectory()) {
    // Create an empty, committed index.
    try (IndexWriter writer = new IndexWriter(dir, newIndexWriterConfig(new MockAnalyzer(random())))) {
      writer.commit();
    }
    // The reader must be closed before the directory; the original never closed it.
    try (IndexReader ir = DirectoryReader.open(dir)) {
      Dictionary dictionary = new HighFrequencyDictionary(ir, "bogus", 0.1f);
      BytesRefIterator tf = dictionary.getEntryIterator();
      assertNull(tf.next());
    }
  }
}

Source File: CompositeBytesReference.java From crate with Apache License 2.0

5 votes

/**
 * Concatenates every slice returned by {@link #iterator()} into a single {@link BytesRef}.
 */
@Override
public BytesRef toBytesRef() {
    final BytesRefBuilder merged = new BytesRefBuilder();
    // reserve room for the full length up front
    merged.grow(length());
    final BytesRefIterator parts = iterator();
    try {
        for (BytesRef part = parts.next(); part != null; part = parts.next()) {
            merged.append(part);
        }
    } catch (IOException ex) {
        // this is really an error since we don't do IO in our bytesreferences
        throw new AssertionError("won't happen", ex);
    }
    return merged.toBytesRef();
}

Source File: PagedBytesReference.java From crate with Apache License 2.0

5 votes

/**
 * Returns an iterator that walks this reference one fragment at a time, where fragment
 * boundaries are chosen to coincide with page boundaries of the backing byte array.
 * The same {@link BytesRef} instance is returned from every call to {@code next()}.
 */
@Override
public final BytesRefIterator iterator() {
    final int baseOffset = this.offset;
    final int totalLength = this.length;
    // The iteration is page aligned so that pages of the ByteArray are NOT materialized.
    // If this reference is a slice starting inside a page, the first fragment only covers
    // the rest of that page; afterwards full pages are returned until the final fragment,
    // which may again end inside a page.
    final int firstFragmentSize;
    if (baseOffset == 0) {
        firstFragmentSize = PAGE_SIZE;
    } else {
        firstFragmentSize = PAGE_SIZE - (baseOffset % PAGE_SIZE);
    }
    return new BytesRefIterator() {
        int consumed = 0;
        int pending = Math.min(totalLength, firstFragmentSize);
        // reused across the iteration on purpose - BytesRefIterator interface was designed for this
        final BytesRef shared = new BytesRef();

        @Override
        public BytesRef next() throws IOException {
            if (pending == 0) {
                assert pending == 0 : "fragmentSize expected [0] but was: [" + pending + "]";
                return null; // iteration finished
            }
            final boolean materialized = byteArray.get(baseOffset + consumed, pending, shared);
            assert materialized == false : "iteration should be page aligned but array got materialized";
            consumed += pending;
            pending = Math.min(totalLength - consumed, PAGE_SIZE);
            return shared;
        }
    };
}

Source File: BytesReference.java From crate with Apache License 2.0

5 votes

/**
 * Returns a BytesRefIterator for this BytesReference. This method allows
 * access to the internal pages of this reference without copying them. Use with care!
 * <p>
 * This implementation yields the result of {@link #toBytesRef()} exactly once, or nothing
 * at all when the reference is empty.
 * @see BytesRefIterator
 */
public BytesRefIterator iterator() {
    final BytesRef whole = length() == 0 ? null : toBytesRef();
    return new BytesRefIterator() {
        BytesRef pending = whole;

        @Override
        public BytesRef next() throws IOException {
            final BytesRef result = pending;
            pending = null; // hand it out a single time
            return result;
        }
    };
}

Source File: BytesReference.java From crate with Apache License 2.0

5 votes

/**
 * Writes every slice of this reference, in iteration order, directly to the output stream.
 *
 * @param os the stream to write to
 * @throws IOException if iterating or writing fails
 */
public void writeTo(OutputStream os) throws IOException {
    final BytesRefIterator chunks = iterator();
    for (BytesRef chunk = chunks.next(); chunk != null; chunk = chunks.next()) {
        os.write(chunk.bytes, chunk.offset, chunk.length);
    }
}

Source File: BytesReferenceStreamInput.java From crate with Apache License 2.0

5 votes

/**
 * Creates a stream input over the given iterator and eagerly fetches its first slice.
 *
 * @param iterator source of the slices to read from
 * @param length   value stored as this stream's length
 * @throws IOException if fetching the first slice fails
 */
BytesReferenceStreamInput(BytesRefIterator iterator, final int length) throws IOException {
    this.length = length;
    this.offset = 0;
    this.sliceIndex = 0;
    this.iterator = iterator;
    this.slice = iterator.next();
}

Source File: DisjunctionMatchesIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Stores the first matching iterator together with the state needed to continue
 * over the remaining terms later.
 */
TermsEnumDisjunctionMatchesIterator(MatchesIterator first, BytesRefIterator terms, TermsEnum te, int doc, Query query) {
  this.query = query;
  this.doc = doc;
  this.te = te;
  this.terms = terms;
  this.first = first;
}

Source File: DisjunctionMatchesIterator.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Adapts a list of terms to a {@link BytesRefIterator} that returns each term's bytes
 * in list order and {@code null} once the list is exhausted.
 */
private static BytesRefIterator asBytesRefIterator(List<Term> terms) {
  return new BytesRefIterator() {
    int cursor = 0;

    @Override
    public BytesRef next() {
      if (cursor < terms.size()) {
        final int at = cursor++;
        return terms.get(at).bytes();
      }
      return null;
    }
  };
}

Source File: SpellChecker.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Indexes the data from the given {@link Dictionary}.
 * <p>
 * Words shorter than three characters are skipped, as are words that already exist in the
 * current spell index. If anything fails before the writer has been closed, the writer is
 * rolled back so that it does not stay open.
 *
 * @param dict Dictionary to index
 * @param config {@link IndexWriterConfig} to use
 * @param fullMerge whether or not the spellcheck index should be fully merged
 * @throws AlreadyClosedException if the Spellchecker is already closed
 * @throws IOException If there is a low-level I/O error.
 */
public final void indexDictionary(Dictionary dict, IndexWriterConfig config, boolean fullMerge) throws IOException {
  synchronized (modifyCurrentIndexLock) {
    ensureOpen();
    final Directory dir = this.spellIndex;
    final IndexWriter writer = new IndexWriter(dir, config);
    try {
      final IndexSearcher indexSearcher = obtainSearcher();
      try {
        // Collect one TermsEnum per segment so already-indexed words can be detected.
        // Read through the searcher acquired above so the reader is guaranteed to stay
        // referenced until releaseSearcher() is called.
        final List<TermsEnum> termsEnums = new ArrayList<>();
        final IndexReader reader = indexSearcher.getIndexReader();
        if (reader.maxDoc() > 0) {
          for (final LeafReaderContext ctx : reader.leaves()) {
            Terms terms = ctx.reader().terms(F_WORD);
            if (terms != null) {
              termsEnums.add(terms.iterator());
            }
          }
        }

        final BytesRefIterator iter = dict.getEntryIterator();
        BytesRef currentTerm;

        terms: while ((currentTerm = iter.next()) != null) {

          String word = currentTerm.utf8ToString();
          int len = word.length();
          if (len < 3) {
            continue; // too short we bail but "too long" is fine...
          }

          // skip words that are already present in any segment of the spell index
          for (TermsEnum te : termsEnums) {
            if (te.seekExact(currentTerm)) {
              continue terms;
            }
          }

          // ok index the word
          Document doc = createDocument(word, getMin(len), getMax(len));
          writer.addDocument(doc);
        }
      } finally {
        releaseSearcher(indexSearcher);
      }
      if (fullMerge) {
        writer.forceMerge(1);
      }
      // close writer
      writer.close();
    } catch (Throwable t) {
      // Do not leave the writer open on failure; keep the original exception as the primary one.
      try {
        writer.rollback();
      } catch (Throwable rollbackFailure) {
        t.addSuppressed(rollbackFailure);
      }
      throw t;
    }
    // TODO: this isn't that great, maybe in the future SpellChecker should take
    // IWC in its ctor / keep its writer open?

    // also re-open the spell index to see our own changes when the next suggestion
    // is fetched:
    swapSearcher(dir);
  }
}

Source File: InMemorySorter.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Marks this sorter as closed and returns an iterator over the buffer
 * ordered by this sorter's comparator.
 */
@Override
public BytesRefIterator iterator() {
  // set before handing out the iterator
  closed = true;
  return buffer.iterator(comparator);
}

Source File: BytesRefIteratorTokenStream.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Sets the iterator this token stream reads from.
 *
 * @param iter the iterator to store
 * @return this instance, to allow call chaining
 */
public BytesRefIteratorTokenStream setBytesRefIterator(BytesRefIterator iter) {
  this.bytesIter = iter;
  return this;
}

Source File: BytesRefIteratorTokenStream.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Returns the iterator currently stored in this token stream.
 */
public BytesRefIterator getBytesRefIterator() {
  return bytesIter;
}

Source File: AbstractIndexGeoPointFieldData.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Wraps the given term iterator and allocates the reusable {@code GeoPoint} and
 * {@code CharsRefBuilder} instances held by this enum.
 *
 * @param termsEnum the term iterator passed on to the superclass
 */
protected GeoPointTermsEnumLegacy(BytesRefIterator termsEnum) {
    super(termsEnum);
    this.spare = new CharsRefBuilder();
    this.next = new GeoPoint();
}

Source File: AbstractIndexGeoPointFieldData.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Wraps the given term iterator and remembers the term encoding to use.
 *
 * @param termsEnum    the term iterator passed on to the superclass
 * @param termEncoding the encoding stored in this enum
 */
protected GeoPointTermsEnum(BytesRefIterator termsEnum, GeoPointField.TermEncoding termEncoding) {
    super(termsEnum);
    this.termEncoding = termEncoding;
}

Source File: AbstractIndexGeoPointFieldData.java From Elasticsearch with Apache License 2.0

4 votes

/**
 * Stores the term iterator this enum is based on.
 *
 * @param termsEnum the underlying term iterator
 */
protected BaseGeoPointTermsEnum(BytesRefIterator termsEnum) {
    this.termsEnum = termsEnum;
}

Source File: MatchesUtils.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Create a MatchesIterator that is a disjunction over a list of terms extracted from a {@link BytesRefIterator}.
 *
 * Only terms that have at least one match in the given document will be included
 *
 * This is a thin public entry point that delegates to
 * {@code DisjunctionMatchesIterator.fromTermsEnum} with the same arguments.
 */
public static MatchesIterator disjunction(LeafReaderContext context, int doc, Query query, String field, BytesRefIterator terms) throws IOException {
  return DisjunctionMatchesIterator.fromTermsEnum(context, doc, query, field, terms);
}

Source File: TermsEnumTokenStream.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Create a new TermsEnumTokenStream over the terms supplied by a {@link BytesRefIterator}
 *
 * @param termsEnum the term iterator to convert
 */
public TermsEnumTokenStream(BytesRefIterator termsEnum) {
  this.termsEnum = termsEnum;
}

Source File: InputIterator.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Creates a new wrapper, wrapping the specified iterator and
 * specifying a weight value of <code>1</code> for all terms
 * and nullifies associated payloads.
 *
 * @param wrapped the iterator to delegate to
 */
public InputIteratorWrapper(BytesRefIterator wrapped) {
  this.wrapped = wrapped;
}

Source File: BytesRefSorter.java From lucene-solr with Apache License 2.0

2 votes

/**
 * Sorts the entries added in {@link #add(BytesRef)} and returns
 * an iterator over all sorted entries.
 *
 * @return an iterator over the sorted entries
 * @throws IOException If an I/O exception occurs.
 */
BytesRefIterator iterator() throws IOException;

org.apache.lucene.util.BytesRefIterator Java Examples