org.apache.lucene.util.BytesRefBuilder#setLength

Source File: SimpleTextUtil.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Reads bytes from {@code in} into {@code scratch} until an unescaped {@code NEWLINE} is seen.
 * A byte equal to {@code ESCAPE} is dropped and the byte following it is stored verbatim.
 * The terminating newline is consumed but not stored.
 *
 * @param in source to read from
 * @param scratch receives the line; it is written from index 0 and its length is set to the
 *        number of bytes stored
 * @throws IOException if reading from {@code in} fails
 */
public static void readLine(DataInput in, BytesRefBuilder scratch) throws IOException {
  int length = 0;
  for (;;) {
    final byte current = in.readByte();
    // Make sure there is room for one more byte before deciding what to do with it.
    scratch.grow(length + 1);
    final byte stored;
    if (current == ESCAPE) {
      // The escaped byte is taken literally, whatever its value.
      stored = in.readByte();
    } else if (current == NEWLINE) {
      break;
    } else {
      stored = current;
    }
    scratch.setByteAt(length++, stored);
  }
  scratch.setLength(length);
}

Source File: LegacyNumericUtils.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes the prefix coded form of a {@code long} after dropping its lowest <code>shift</code> bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * After encoding, {@code bytes.offset} will always be 0.
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 * @throws IllegalArgumentException if {@code shift} is outside 0..63
 */
public static void longToPrefixCoded(final long val, final int shift, final BytesRefBuilder bytes) {
  // a 64-bit value only supports shifts of 0..63
  if (shift < 0 || shift > 63) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..63; got shift=" + shift);
  }
  // number of 7-bit groups needed for the remaining bits;
  // (i*37)>>8 is the same as i/7 for i in 0..63
  final int payloadBytes = (((63 - shift) * 37) >> 8) + 1;
  bytes.setLength(payloadBytes + 1);   // payload plus the leading byte that carries the shift
  bytes.grow(BUF_SIZE_LONG);
  bytes.setByteAt(0, (byte) (SHIFT_START_LONG + shift));
  // flip the sign bit, then discard the low-order bits being stripped
  long remaining = (val ^ Long.MIN_VALUE) >>> shift;
  // Fill from the last byte backwards, 7 bits per byte, for compatibility
  // with UTF-8 encoding of terms
  for (int pos = payloadBytes; pos > 0; pos--) {
    bytes.setByteAt(pos, (byte) (remaining & 0x7f));
    remaining >>>= 7;
  }
}

Source File: LegacyNumericUtils.java From lucene-solr with Apache License 2.0

6 votes

/**
 * Writes the prefix coded form of an {@code int} after dropping its lowest <code>shift</code> bits.
 * This method is used by {@link org.apache.solr.legacy.LegacyNumericTokenStream}.
 * After encoding, {@code bytes.offset} will always be 0.
 * @param val the numeric value
 * @param shift how many bits to strip from the right
 * @param bytes will contain the encoded value
 * @throws IllegalArgumentException if {@code shift} is outside 0..31
 */
public static void intToPrefixCoded(final int val, final int shift, final BytesRefBuilder bytes) {
  // a 32-bit value only supports shifts of 0..31
  if (shift < 0 || shift > 31) {
    throw new IllegalArgumentException("Illegal shift value, must be 0..31; got shift=" + shift);
  }
  // number of 7-bit groups needed for the remaining bits;
  // (i*37)>>8 is the same as i/7 for i in 0..63
  final int payloadBytes = (((31 - shift) * 37) >> 8) + 1;
  bytes.setLength(payloadBytes + 1);   // payload plus the leading byte that carries the shift
  bytes.grow(LegacyNumericUtils.BUF_SIZE_LONG);  // use the max
  bytes.setByteAt(0, (byte) (SHIFT_START_INT + shift));
  // flip the sign bit, then discard the low-order bits being stripped
  int remaining = (val ^ Integer.MIN_VALUE) >>> shift;
  // Fill from the last byte backwards, 7 bits per byte, for compatibility
  // with UTF-8 encoding of terms
  for (int pos = payloadBytes; pos > 0; pos--) {
    bytes.setByteAt(pos, (byte) (remaining & 0x7f));
    remaining >>>= 7;
  }
}

Source File: UTF8TaxonomyWriterCache.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Serializes a label into the builder obtained from {@code this.bytes}: each component is
 * written as UTF-8, with {@code DELIM_CHAR} placed between consecutive components.
 *
 * @param label the label whose components are encoded
 * @return the builder's current {@link BytesRef} view of the encoded label
 */
private BytesRef toBytes(FacetLabel label) {
  final BytesRefBuilder builder = this.bytes.get();
  builder.clear();
  for (int idx = 0; idx < label.length; idx++) {
    final String component = label.components[idx];
    if (idx != 0) {
      // separator goes before every component except the first
      builder.append(DELIM_CHAR);
    }
    // Reserve the worst-case UTF-8 size for this component past what is already written.
    final int charCount = component.length();
    builder.grow(builder.length() + UnicodeUtil.maxUTF8Length(charCount));
    // Encode directly into the backing array, starting at the current end.
    final int end = UnicodeUtil.UTF16toUTF8(component, 0, charCount, builder.bytes(), builder.length());
    builder.setLength(end);
  }
  return builder.get();
}

Source File: EnumFieldType.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Looks up the integer mapped to the given enum string and writes it into {@code result}
 * as {@code Integer.BYTES} sortable bytes starting at index 0. Does nothing when the
 * string form of {@code val} is {@code null}.
 *
 * @param val readable enum value
 * @param result receives the encoded integer
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final String readable = val.toString();
  if (readable != null) {
    result.grow(Integer.BYTES);
    result.setLength(Integer.BYTES);
    final Integer mapped = enumMapping.stringValueToIntValue(readable);
    NumericUtils.intToSortableBytes(mapped, result.bytes(), 0);
  }
}

Source File: EnumFieldType.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Encodes the field's numeric value as {@code Integer.BYTES} sortable bytes and returns
 * those bytes decoded via {@code utf8ToString()}.
 *
 * @param f the stored field
 * @return the encoded form, or {@code null} when the field has no numeric value
 */
@Override
public String storedToIndexed(IndexableField f) {
  final Number stored = f.numericValue();
  if (stored != null) {
    final BytesRefBuilder encoded = new BytesRefBuilder();
    encoded.grow(Integer.BYTES);
    encoded.setLength(Integer.BYTES);
    NumericUtils.intToSortableBytes(stored.intValue(), encoded.bytes(), 0);
    return encoded.get().utf8ToString();
  }
  return null;
}

Source File: DatePointField.java From lucene-solr with Apache License 2.0

5 votes

/**
 * Converts the readable value to a {@link Date} via {@code toNativeType} and writes its
 * epoch-millisecond time into {@code result} as a {@code Long.BYTES}-wide encoded dimension.
 *
 * @param val readable date value
 * @param result receives the encoded time
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final Date parsed = (Date) toNativeType(val.toString());
  result.grow(Long.BYTES);
  result.setLength(Long.BYTES);
  final long millis = parsed.getTime();
  LongPoint.encodeDimension(millis, result.bytes(), 0);
}

Source File: SimpleTextFieldsReader.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Scans the text lines starting at {@code termsStart} and builds the term FST, collecting
 * statistics along the way.
 *
 * <p>Each line is read into {@code scratch} and dispatched on its prefix ({@code END},
 * {@code FIELD}, {@code DOC}, {@code FREQ} or {@code TERM}). Every term is added to the FST
 * with the output {@code (docsStart, (docFreq, totalTermFreq))}. Besides {@code fst}, this
 * updates {@code sumTotalTermFreq}, {@code sumDocFreq}, {@code termCount} and
 * {@code docCount}, all of which are declared outside this method.
 *
 * @throws IOException if reading from the cloned input fails
 */
private void loadTerms() throws IOException {
  PositiveIntOutputs posIntOutputs = PositiveIntOutputs.getSingleton();
  final FSTCompiler<PairOutputs.Pair<Long,PairOutputs.Pair<Long,Long>>> fstCompiler;
  // inner pair: (docFreq, totalTermFreq); outer pair: (docsStart, inner pair)
  final PairOutputs<Long,Long> outputsInner = new PairOutputs<>(posIntOutputs, posIntOutputs);
  final PairOutputs<Long,PairOutputs.Pair<Long,Long>> outputs = new PairOutputs<>(posIntOutputs,
      outputsInner);
  fstCompiler = new FSTCompiler<>(FST.INPUT_TYPE.BYTE1, outputs);
  // work on a clone so the position of the shared input is left alone
  IndexInput in = SimpleTextFieldsReader.this.in.clone();
  in.seek(termsStart);
  // bytes of the term whose postings are currently being scanned
  final BytesRefBuilder lastTerm = new BytesRefBuilder();
  // file pointer just past the current term's TERM line; -1 until the first term is seen
  long lastDocsStart = -1;
  int docFreq = 0;
  long totalTermFreq = 0;
  FixedBitSet visitedDocs = new FixedBitSet(maxDoc);
  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  while(true) {
    SimpleTextUtil.readLine(in, scratch);
    if (scratch.get().equals(END) || StringHelper.startsWith(scratch.get(), FIELD)) {
      // end of this field's terms: flush the pending term, if any, and stop
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef),
            outputs.newPair(lastDocsStart,
                outputsInner.newPair((long) docFreq, totalTermFreq)));
        sumTotalTermFreq += totalTermFreq;
      }
      break;
    } else if (StringHelper.startsWith(scratch.get(), DOC)) {
      docFreq++;
      sumDocFreq++;
      // count one occurrence now; a following FREQ line adds the remaining (freq - 1)
      totalTermFreq++;
      // the text after the DOC prefix is parsed as the decimal doc id
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), DOC.length, scratch.length()-DOC.length);
      int docID = ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length());
      visitedDocs.set(docID);
    } else if (StringHelper.startsWith(scratch.get(), FREQ)) {
      scratchUTF16.copyUTF8Bytes(scratch.bytes(), FREQ.length, scratch.length()-FREQ.length);
      // minus 1 because the DOC line already contributed one occurrence
      totalTermFreq += ArrayUtil.parseInt(scratchUTF16.chars(), 0, scratchUTF16.length()) - 1;
    } else if (StringHelper.startsWith(scratch.get(), TERM)) {
      // a new term begins: flush the previous one before resetting the per-term counters
      if (lastDocsStart != -1) {
        fstCompiler.add(Util.toIntsRef(lastTerm.get(), scratchIntsRef), outputs.newPair(lastDocsStart,
            outputsInner.newPair((long) docFreq, totalTermFreq)));
      }
      lastDocsStart = in.getFilePointer();
      // copy the term bytes (everything after the TERM prefix) out of the shared scratch
      final int len = scratch.length() - TERM.length;
      lastTerm.grow(len);
      System.arraycopy(scratch.bytes(), TERM.length, lastTerm.bytes(), 0, len);
      lastTerm.setLength(len);
      docFreq = 0;
      // adds the previous term's total; this is 0 when no term has been seen yet
      sumTotalTermFreq += totalTermFreq;
      totalTermFreq = 0;
      termCount++;
    }
  }
  // number of distinct doc ids seen on DOC lines
  docCount = visitedDocs.cardinality();
  fst = fstCompiler.compile();
  /*
  PrintStream ps = new PrintStream("out.dot");
  fst.toDot(ps);
  ps.close();
  System.out.println("SAVED out.dot");
  */
  //System.out.println("FST " + fst.sizeInBytes());
}

Source File: SynonymMap.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Builds an {@link SynonymMap} and returns it.
 *
 * <p>The keys of {@code workingSet} are sorted and added one by one to an FST. The output
 * stored for each key is a byte sequence: first a vInt header holding
 * {@code count << 1 | flag}, where the low bit is set when {@code includeOrig} is false,
 * followed by one vInt per emitted ord. When {@code dedup} is set, repeated ords within a
 * single entry are written only once.
 *
 * @return a new map built from the compiled FST, {@code words} and {@code maxHorizontalContext}
 * @throws IOException declared by this method; presumably raised by the FST compiler (not
 *         visible from this block)
 */
public SynonymMap build() throws IOException {
  ByteSequenceOutputs outputs = ByteSequenceOutputs.getSingleton();
  // TODO: are we using the best sharing options?
  FSTCompiler<BytesRef> fstCompiler =
    new FSTCompiler<>(FST.INPUT_TYPE.BYTE4, outputs);
  
  // reused for every key: scratchOutput writes directly into scratch's backing array
  BytesRefBuilder scratch = new BytesRefBuilder();
  ByteArrayDataOutput scratchOutput = new ByteArrayDataOutput();

  // ords already written for the current key; null when dedup is off
  final Set<Integer> dedupSet;

  if (dedup) {
    dedupSet = new HashSet<>();
  } else {
    dedupSet = null;
  }

  // temporary home for the header vInt while the ords are shifted right (5 bytes, the
  // same per-value worst case used for estimatedSize below)
  final byte[] spare = new byte[5];
  
  Set<CharsRef> keys = workingSet.keySet();
  CharsRef sortedKeys[] = keys.toArray(new CharsRef[keys.size()]);
  // sort the keys before feeding them to the FST compiler
  Arrays.sort(sortedKeys, CharsRef.getUTF16SortedAsUTF8Comparator());

  final IntsRefBuilder scratchIntsRef = new IntsRefBuilder();
  
  //System.out.println("fmap.build");
  for (int keyIdx = 0; keyIdx < sortedKeys.length; keyIdx++) {
    CharsRef input = sortedKeys[keyIdx];
    MapEntry output = workingSet.get(input);

    int numEntries = output.ords.size();
    // output size, assume the worst case
    int estimatedSize = 5 + numEntries * 5; // numEntries + one ord for each entry
    
    scratch.grow(estimatedSize);
    // re-point the writer after grow(), since the array returned by bytes() is re-fetched here
    scratchOutput.reset(scratch.bytes());

    // now write our output data:
    int count = 0;
    for (int i = 0; i < numEntries; i++) {
      if (dedupSet != null) {
        // box once
        final Integer ent = output.ords.get(i);
        if (dedupSet.contains(ent)) {
          continue;
        }
        dedupSet.add(ent);
      }
      scratchOutput.writeVInt(output.ords.get(i));   
      count++;
    }

    // the header is written AFTER the ords because count is only known now
    final int pos = scratchOutput.getPosition();
    scratchOutput.writeVInt(count << 1 | (output.includeOrig ? 0 : 1));
    final int pos2 = scratchOutput.getPosition();
    final int vIntLen = pos2-pos;

    // Move the count + includeOrig to the front of the byte[]:
    // 1) stash the header, 2) shift the ords right by the header length, 3) put the header first
    System.arraycopy(scratch.bytes(), pos, spare, 0, vIntLen);
    System.arraycopy(scratch.bytes(), 0, scratch.bytes(), vIntLen, pos);
    System.arraycopy(spare, 0, scratch.bytes(), 0, vIntLen);

    if (dedupSet != null) {
      // dedup is per key, so start the next key with an empty set
      dedupSet.clear();
    }
    
    // total bytes written: ords plus header
    scratch.setLength(scratchOutput.getPosition());
    //System.out.println("  add input=" + input + " output=" + scratch + " offset=" + scratch.offset + " length=" + scratch.length + " count=" + count);
    // toBytesRef() hands the compiler its own BytesRef, as scratch is overwritten for the next key
    fstCompiler.add(Util.toUTF32(input, scratchIntsRef), scratch.toBytesRef());
  }
  
  FST<BytesRef> fst = fstCompiler.compile();
  return new SynonymMap(fst, words, maxHorizontalContext);
}

Source File: TestLucene80DocValuesFormat.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Indexes two random "sset" values per document for document counts one below, equal to and
 * one above {@code 1 << DIRECT_MONOTONIC_BLOCK_SHIFT}, then checks that the merged segment's
 * sorted-set doc values return, for every document, exactly the values that were indexed.
 */
@Nightly
public void testSortedSetAroundBlockSize() throws IOException {
  final int blockSize = 1 << Lucene80DocValuesFormat.DIRECT_MONOTONIC_BLOCK_SHIFT;
  for (int docCount = blockSize - 1; docCount <= blockSize + 1; docCount++) {
    final Directory directory = newDirectory();
    final IndexWriter writer =
        new IndexWriter(directory, newIndexWriterConfig().setMergePolicy(newLogMergePolicy()));
    // Records the expected values of each document, in the order documents are added.
    final ByteBuffersDataOutput expected = new ByteBuffersDataOutput();

    // One document instance is reused; only the field values change between adds.
    final Document document = new Document();
    final SortedSetDocValuesField firstField = new SortedSetDocValuesField("sset", new BytesRef());
    document.add(firstField);
    final SortedSetDocValuesField secondField = new SortedSetDocValuesField("sset", new BytesRef());
    document.add(secondField);

    for (int added = 0; added < docCount; added++) {
      final BytesRef value1 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      final BytesRef value2 = new BytesRef(TestUtil.randomSimpleString(random(), 2));
      firstField.setBytesValue(value1);
      secondField.setBytesValue(value2);
      writer.addDocument(document);

      // The TreeSet yields the sorted, de-duplicated values for this document.
      final Set<BytesRef> distinctSorted = new TreeSet<>(Arrays.asList(value1, value2));
      expected.writeVInt(distinctSorted.size());
      for (BytesRef value : distinctSorted) {
        expected.writeVInt(value.length);
        expected.writeBytes(value.bytes, value.offset, value.length);
      }
    }

    writer.forceMerge(1);
    final DirectoryReader reader = DirectoryReader.open(writer);
    writer.close();
    final LeafReader leaf = getOnlyLeafReader(reader);
    assertEquals(docCount, leaf.maxDoc());
    final SortedSetDocValues actual = leaf.getSortedSetDocValues("sset");
    assertNotNull(actual);

    // Replay the recorded expectations against the doc values, document by document.
    final ByteBuffersDataInput replay = expected.toDataInput();
    final BytesRefBuilder buffer = new BytesRefBuilder();
    for (int docID = 0; docID < docCount; docID++) {
      assertEquals(docID, actual.nextDoc());

      int remaining = replay.readVInt();
      while (remaining > 0) {
        final int valueLength = replay.readVInt();
        buffer.setLength(valueLength);
        buffer.grow(valueLength);
        replay.readBytes(buffer.bytes(), 0, valueLength);
        assertEquals(buffer.get(), actual.lookupOrd(actual.nextOrd()));
        remaining--;
      }

      // No further ords may remain for this document.
      assertEquals(SortedSetDocValues.NO_MORE_ORDS, actual.nextOrd());
    }
    reader.close();
    directory.close();
  }
}

Source File: FloatPointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Parses the readable value as a float and writes it into {@code result} as a
 * {@code Float.BYTES}-wide encoded dimension starting at index 0.
 *
 * @param val readable float value
 * @param result receives the encoded value
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Float.BYTES;
  result.grow(width);
  result.setLength(width);
  final float parsed = parseFloatFromUser(null, val.toString());
  FloatPoint.encodeDimension(parsed, result.bytes(), 0);
}

Source File: IntPointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Parses the readable value as an int and writes it into {@code result} as an
 * {@code Integer.BYTES}-wide encoded dimension starting at index 0.
 *
 * @param val readable int value
 * @param result receives the encoded value
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Integer.BYTES;
  result.grow(width);
  result.setLength(width);
  final int parsed = parseIntFromUser(null, val.toString());
  IntPoint.encodeDimension(parsed, result.bytes(), 0);
}

Source File: LongPointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Parses the readable value as a long and writes it into {@code result} as a
 * {@code Long.BYTES}-wide encoded dimension starting at index 0.
 *
 * @param val readable long value
 * @param result receives the encoded value
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Long.BYTES;
  result.grow(width);
  result.setLength(width);
  final long parsed = parseLongFromUser(null, val.toString());
  LongPoint.encodeDimension(parsed, result.bytes(), 0);
}

Source File: DoublePointField.java From lucene-solr with Apache License 2.0

4 votes

/**
 * Parses the readable value as a double and writes it into {@code result} as a
 * {@code Double.BYTES}-wide encoded dimension starting at index 0.
 *
 * @param val readable double value
 * @param result receives the encoded value
 */
@Override
public void readableToIndexed(CharSequence val, BytesRefBuilder result) {
  final int width = Double.BYTES;
  result.grow(width);
  result.setLength(width);
  final double parsed = parseDoubleFromUser(null, val.toString());
  DoublePoint.encodeDimension(parsed, result.bytes(), 0);
}

Java Code Examples for org.apache.lucene.util.BytesRefBuilder#setLength()