org.apache.parquet.bytes.ByteBufferInputStream Java Examples

Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Prepares this reader for a V1 data page.
 *
 * <p>Level readers are created from the page's repetition/definition encodings and wrapped
 * in iterators; the same input stream over the page bytes is then handed, in turn, to the
 * repetition-level reader, the definition-level reader and the data reader.
 *
 * @param page the V1 data page to read
 * @throws ParquetDecodingException if an {@code IOException} occurs while reading the page
 */
private void readPageV1(DataPageV1 page) {
  ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
  ValuesReader definitionReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(repetitionReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(definitionReader);
  int valueCount = page.getValueCount();

  try {
    BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} values", pageBytes.size(), valueCount);

    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream stream = pageBytes.toInputStream();
    repetitionReader.initFromPage(valueCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    definitionReader.initFromPage(valueCount, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, valueCount);
  } catch (IOException cause) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + path, cause);
  }

  newPageInitialized(page);
}

Source File: RunLengthDecoder.java From flink with Apache License 2.0

6 votes

/**
 * Initializes this decoder from the given input stream.
 *
 * <p>When {@code fixedWidth} is set and {@code readLength} is also set, a length is read
 * through {@code readIntLittleEndian()} and decoding is restricted to a slice of that many
 * bytes. When {@code fixedWidth} is not set and the stream has bytes available, the next
 * byte is passed to {@code initWidthAndPacker}. Finally, a bit width of zero is treated as
 * one RLE run of {@code valueCount} zeros.
 */
void initFromStream(int valueCount, ByteBufferInputStream in) throws IOException {
	this.in = in;

	if (!fixedWidth) {
		// initialize for values
		if (in.available() > 0) {
			initWidthAndPacker(in.read());
		}
	} else if (readLength) {
		// initialize for repetition and definition levels
		int sliceLength = readIntLittleEndian();
		this.in = in.sliceStream(sliceLength);
	}

	if (bitWidth != 0) {
		this.currentCount = 0;
		return;
	}

	// 0 bit width, treat this as an RLE run of valueCount number of 0's.
	this.mode = MODE.RLE;
	this.currentCount = valueCount;
	this.currentValue = 0;
}

Source File: ByteStreamSplitValuesReaderTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Initializes a float reader over 16 bytes declared as 4 values, skips the first three
 * values and checks that the next float read is 2.25.
 */
@Test
public void testSkip() throws Exception {
  final byte ff = (byte) 0xFF;
  // Same 16 bytes as filling with 0xFF and then overwriting indices 3, 7, 11 and 15.
  byte[] byteData = {
      ff, ff, ff, (byte) 0x00,
      ff, ff, ff, (byte) 0x00,
      ff, ff, ff, (byte) 0x10,
      ff, ff, ff, (byte) 0x40,
  };
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(byteData));

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(4, stream);
  reader.skip(3);

  float lastValue = reader.readFloat();
  assertEquals(2.25f, lastValue, 0.0f);
}

Source File: AbstractColumnReader.java From flink with Apache License 2.0

6 votes

/**
 * Sets up decoding for a V1 data page.
 *
 * <p>Records the page value count, creates the repetition-level reader, rejects pages whose
 * definition-level encoding is not RLE while the column's max definition level is non-zero,
 * then initializes the repetition-level reader, the run-length decoder and the value reader
 * from one stream over the page bytes.
 *
 * @param page the V1 data page to read
 * @throws UnsupportedOperationException for an unsupported definition-level encoding
 * @throws IOException if reading the page fails; the original exception is kept as cause
 */
private void readPageV1(DataPageV1 page) throws IOException {
	this.pageValueCount = page.getValueCount();
	ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);

	// Initialize the decoders.
	boolean definitionLevelsAreRle = page.getDlEncoding() == Encoding.RLE;
	if (!definitionLevelsAreRle && descriptor.getMaxDefinitionLevel() != 0) {
		throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding());
	}
	int definitionBitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
	this.runLenDecoder = new RunLengthDecoder(definitionBitWidth);

	try {
		ByteBufferInputStream pageStream = page.getBytes().toInputStream();
		repetitionReader.initFromPage(pageValueCount, pageStream);
		this.runLenDecoder.initFromStream(pageValueCount, pageStream);
		prepareNewPage(page.getValueEncoding(), pageStream);
	} catch (IOException cause) {
		throw new IOException("could not read page " + page + " in col " + descriptor, cause);
	}
}

Source File: PageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Initializes this iterator from a V1 data page.
 *
 * <p>Stores the page's value count as the triple count, builds level readers from the
 * page's encodings, and feeds one stream over the page bytes to the repetition-level
 * reader, the definition-level reader and the data reader, in that order.
 *
 * @param page the V1 data page to read
 * @throws ParquetDecodingException if an {@code IOException} occurs while reading the page
 */
private void initFromPage(DataPageV1 page) {
  this.triplesCount = page.getValueCount();

  ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(desc, REPETITION_LEVEL);
  ValuesReader definitionReader = page.getDlEncoding().getValuesReader(desc, DEFINITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);
  this.definitionLevels = new ValuesReaderIntIterator(definitionReader);

  try {
    BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream stream = pageBytes.toInputStream();
    repetitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    definitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, page.getValueCount());
  } catch (IOException cause) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + desc, cause);
  }
}

Source File: TestDeltaByteArray.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes {@code values} with a {@code DeltaByteArrayWriter} and decodes the resulting stream
 * twice with fresh {@code DeltaBinaryPackingValuesReader}s: the first pass is checked as the
 * prefix lengths, the second pass (continuing on the same stream) as the suffix lengths.
 */
@Test
public void testLengths() throws IOException {
  DeltaByteArrayWriter writer = new DeltaByteArrayWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator());
  Utils.writeData(writer, values);
  ByteBufferInputStream encoded = writer.getBytes().toInputStream();

  // test prefix lengths
  ValuesReader prefixReader = new DeltaBinaryPackingValuesReader();
  int[] prefixLengths = Utils.readInts(prefixReader, encoded, values.length);
  Assert.assertEquals(0, prefixLengths[0]);
  Assert.assertEquals(7, prefixLengths[1]);
  Assert.assertEquals(7, prefixLengths[2]);

  // test suffix lengths
  ValuesReader suffixReader = new DeltaBinaryPackingValuesReader();
  int[] suffixLengths = Utils.readInts(suffixReader, encoded, values.length);
  Assert.assertEquals(10, suffixLengths[0]);
  Assert.assertEquals(0, suffixLengths[1]);
  Assert.assertEquals(7, suffixLengths[2]);
}

Source File: DeltaBinaryPackingValuesReader.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Eagerly loads all the data into memory.
 *
 * <p>Reads the packing config, the total value count and the first value from the stream,
 * keeps loading blocks until at least {@code totalValueCount} values are buffered, and
 * finally reports the number of bytes consumed to {@code updateNextOffset}. The
 * {@code valueCount} argument is not used by this method.
 */
@Override
public void initFromPage(int valueCount, ByteBufferInputStream stream) throws IOException {
  this.in = stream;
  final long startPosition = in.position();

  this.config = DeltaBinaryPackingConfig.readConfig(in);
  this.totalValueCount = BytesUtils.readUnsignedVarInt(in);
  allocateValuesBuffer();
  bitWidths = new int[config.miniBlockNumInABlock];

  // read first value from header
  valuesBuffer[valuesBuffered++] = BytesUtils.readZigZagVarLong(in);

  // valuesBuffered could end up larger than totalValueCount, since we flush on a mini block basis
  while (valuesBuffered < totalValueCount) {
    loadNewBlockToBuffer();
  }

  long bytesConsumed = in.position() - startPosition;
  updateNextOffset((int) bytesConsumed);
}

Source File: BasePageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Initializes this iterator from a V1 data page.
 *
 * <p>Stores the page's value count as the triple count, creates the repetition-level
 * reader, and passes one stream over the page bytes to the repetition-level reader,
 * {@code initDefinitionLevelsReader} and {@code initDataReader}, in that order.
 *
 * @param initPage the V1 data page to read
 * @throws ParquetDecodingException if an {@code IOException} occurs while reading the page
 */
protected void initFromPage(DataPageV1 initPage) {
  this.triplesCount = initPage.getValueCount();

  ValuesReader repetitionReader =
      initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);

  try {
    BytesInput pageBytes = initPage.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream stream = pageBytes.toInputStream();
    repetitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    initDefinitionLevelsReader(initPage, desc, stream, triplesCount);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(initPage.getValueEncoding(), stream, initPage.getValueCount());
  } catch (IOException cause) {
    throw new ParquetDecodingException("could not read page " + initPage + " in col " + desc, cause);
  }
}

Source File: DeltaBinaryPackingValuesWriterForIntegerTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes the first {@code length} ints of {@code data}, asserts that the encoded page is no
 * larger than a size estimate, then reads the values back and compares them one by one.
 */
private void shouldReadAndWrite(int[] data, int length) throws IOException {
  writeData(data, length);
  reader = new DeltaBinaryPackingValuesReader();
  byte[] encodedPage = writer.getBytes().toByteArray();

  int miniBlockSize = blockSize / miniBlockNum;
  double miniBlockFlushed = Math.ceil(((double) length - 1) / miniBlockSize);
  double blockFlushed = Math.ceil(((double) length - 1) / blockSize);

  int headerBytes = 4 * 5; // blockHeader
  double dataBytes = 4 * miniBlockFlushed * miniBlockSize; // data (aligned to miniBlock)
  double bitWidthBytes = blockFlushed * miniBlockNum; // bitWidth of mini blocks
  double minDeltaBytes = 5.0 * blockFlushed; // min delta for each block
  double estimatedSize = headerBytes + dataBytes + bitWidthBytes + minDeltaBytes;
  assertTrue(estimatedSize >= encodedPage.length);

  reader.initFromPage(100, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedPage)));
  int index = 0;
  while (index < length) {
    assertEquals(data[index], reader.readInteger());
    index++;
  }
}

Source File: BitPackingPerfTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Decodes {@code bytes} into {@code result} ten times with the given reader, prints the
 * accumulated time and throughput to standard output, verifies the decoded values and
 * returns the accumulated time.
 *
 * <p>The throughput is computed with {@code long} arithmetic, and is printed as
 * {@code n/a} when the accumulated time is below one microsecond, so a very fast run can
 * no longer fail with a division by zero.
 *
 * @param bytes the encoded page to decode on every round
 * @param result destination array; its length is the number of values read per round
 * @param r the reader under measurement; it is re-initialized on every round
 * @return the accumulated time of all rounds, in nanoseconds
 * @throws IOException if initializing the reader fails
 */
private static long readNTimes(byte[] bytes, int[] result, ValuesReader r)
    throws IOException {
  System.out.println();
  long t = 0;
  int N = 10;
  System.gc();
  System.out.print("                                             " + r.getClass().getSimpleName());
  System.out.print(" no gc <");
  for (int k = 0; k < N; k++) {
    long t2 = System.nanoTime();
    r.initFromPage(result.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    for (int i = 0; i < result.length; i++) {
      result[i] = r.readInteger();
    }
    long t3 = System.nanoTime();
    t += t3 - t2;
  }
  long micros = t / 1000;
  // Widen before multiplying so a large result array cannot overflow int.
  long totalValues = (long) N * result.length;
  // Integer division by zero throws ArithmeticException, so guard sub-microsecond totals.
  String valuesPerMicro = micros == 0 ? "n/a" : String.valueOf(totalValues / micros);
  System.out.println("> read in " + micros + "µs " + valuesPerMicro + " values per µs");
  verify(result);
  return t;
}

Source File: BaseVectorizedParquetValuesReader.java From iceberg with Apache License 2.0

6 votes

/**
 * Initializes this reader from the given input stream.
 *
 * <p>When {@code fixedWidth} and {@code readLength} are both set, a length is read through
 * {@code readIntLittleEndian()} and reading is restricted to a slice of that many bytes.
 * When {@code fixedWidth} is not set and the stream has bytes available, the next byte is
 * passed to {@code init}. A bit width of zero is treated as one RLE run of
 * {@code valueCount} zeros.
 */
@Override
public void initFromPage(int valueCount, ByteBufferInputStream in) throws IOException {
  this.inputStream = in;

  if (!fixedWidth) {
    // initialize for values
    if (in.available() > 0) {
      init(in.read());
    }
  } else if (readLength) {
    // initialize for repetition and definition levels
    int sliceLength = readIntLittleEndian();
    this.inputStream = in.sliceStream(sliceLength);
  }

  if (bitWidth != 0) {
    this.currentCount = 0;
    return;
  }

  // 0 bit width, treat this as an RLE run of valueCount number of 0's.
  this.mode = Mode.RLE;
  this.currentCount = valueCount;
  this.currentValue = 0;
}

Source File: PrimitiveColumnReader.java From presto with Apache License 2.0

6 votes

/**
 * Creates the values reader for a page's data section and initializes it from the stream.
 *
 * <p>Dictionary-based encodings require {@code dictionary} to be present and get a
 * dictionary-based reader; every other encoding gets its plain values reader.
 *
 * @return the initialized values reader
 * @throws ParquetDecodingException if the dictionary is missing for a dictionary encoding,
 *         or if an {@code IOException} occurs while initializing the reader
 */
private ValuesReader initDataReader(ParquetEncoding dataEncoding, int valueCount, ByteBufferInputStream in)
{
    boolean dictionaryEncoded = dataEncoding.usesDictionary();
    if (dictionaryEncoded && dictionary == null) {
        throw new ParquetDecodingException("Dictionary is missing for Page");
    }

    ValuesReader valuesReader = dictionaryEncoded
            ? dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary)
            : dataEncoding.getValuesReader(columnDescriptor, VALUES);

    try {
        valuesReader.initFromPage(valueCount, in);
    }
    catch (IOException cause) {
        throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, cause);
    }
    return valuesReader;
}

Source File: PrimitiveColumnReader.java From presto with Apache License 2.0

6 votes

/**
 * Reads a V1 data page: installs level readers built from the page's encodings,
 * initializes them in order from one stream over the page slice, and returns the data
 * reader produced by {@code initDataReader} for the rest of that stream.
 *
 * @throws ParquetDecodingException if an {@code IOException} occurs while reading the page
 */
private ValuesReader readPageV1(DataPageV1 page)
{
    ValuesReader repetitionLevels = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
    ValuesReader definitionLevels = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
    repetitionReader = new LevelValuesReader(repetitionLevels);
    definitionReader = new LevelValuesReader(definitionLevels);

    try {
        ByteBufferInputStream pageStream = toInputStream(page.getSlice());
        int valueCount = page.getValueCount();
        repetitionLevels.initFromPage(valueCount, pageStream);
        definitionLevels.initFromPage(valueCount, pageStream);
        return initDataReader(page.getValueEncoding(), valueCount, pageStream);
    }
    catch (IOException cause) {
        throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, cause);
    }
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Initializes a dictionary reader from a fully consumed stream, and then from a byte buffer
 * with an offset equal to its remaining size, expecting neither call to throw.
 */
@Test
public void testZeroValues() throws IOException {
  FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainIntegerDictionaryValuesWriter(100, 100);
  writer.writeInteger(34);
  writer.writeInteger(34);
  getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);
  DictionaryValuesReader streamReader = initDicReader(writer, INT32);

  // pretend there are 100 nulls. what matters is offset = bytes.length.
  byte[] ignoredContent = {0x00, 0x01, 0x02, 0x03}; // data doesn't matter
  ByteBuffer bytes = ByteBuffer.wrap(ignoredContent);
  ByteBufferInputStream exhausted = ByteBufferInputStream.wrap(bytes);
  exhausted.skipFully(exhausted.available());
  streamReader.initFromPage(100, exhausted);

  // Testing the deprecated behavior of using byte arrays directly
  DictionaryValuesReader bufferReader = initDicReader(writer, INT32);
  int endOffset = bytes.remaining();
  bufferReader.initFromPage(100,  bytes, endOffset);
}

Source File: DeltaBinaryPackingValuesWriterForLongTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes the first {@code length} longs of {@code data}, asserts that the encoded page is
 * no larger than a size estimate, then reads the values back and compares them one by one.
 */
private void shouldReadAndWrite(long[] data, int length) throws IOException {
  writeData(data, length);
  reader = new DeltaBinaryPackingValuesReader();
  byte[] encodedPage = writer.getBytes().toByteArray();

  int miniBlockSize = blockSize / miniBlockNum;
  double miniBlockFlushed = Math.ceil(((double) length - 1) / miniBlockSize);
  double blockFlushed = Math.ceil(((double) length - 1) / blockSize);

  int headerBytes = 3 * 5 + 1 * 10; // blockHeader, 3 * int + 1 * long
  double dataBytes = 8 * miniBlockFlushed * miniBlockSize; // data (aligned to miniBlock)
  double bitWidthBytes = blockFlushed * miniBlockNum; // bitWidth of mini blocks
  double minDeltaBytes = 10.0 * blockFlushed; // min delta for each block
  double estimatedSize = headerBytes + dataBytes + bitWidthBytes + minDeltaBytes;
  assertTrue(estimatedSize >= encodedPage.length);

  reader.initFromPage(100, ByteBufferInputStream.wrap(ByteBuffer.wrap(encodedPage)));
  int index = 0;
  while (index < length) {
    assertEquals(data[index], reader.readLong());
    index++;
  }
}

Source File: BenchmarkDeltaByteArray.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Benchmark: writes {@code sortedVals} with a {@code PlainValuesWriter}, reads
 * {@code values.length} entries back with a {@code BinaryPlainValuesReader}, and prints the
 * stream position after reading.
 */
@BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 4)
@Test
public void benchmarkSortedStringsWithPlainValuesWriter() throws IOException {
  PlainValuesWriter plainWriter =
      new PlainValuesWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator());
  BinaryPlainValuesReader plainReader = new BinaryPlainValuesReader();

  Utils.writeData(plainWriter, sortedVals);
  ByteBufferInputStream encoded = plainWriter.getBytes().toInputStream();
  Utils.readData(plainReader, encoded, values.length);

  System.out.println("size " + encoded.position());
}

Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the footer metadata from the tail of a file.
 *
 * <p>The method seeks to the footer-length field located just before the trailing magic
 * bytes, validates the magic, derives the footer start from the length, reads the whole
 * footer into one buffer and hands it to the converter.
 *
 * @param file the input file; only its length and string form are used here
 * @param options supplies the metadata filter passed to the converter
 * @param f the stream to seek and read from
 * @param converter parses the footer bytes
 * @return the metadata parsed by {@code converter}
 * @throws RuntimeException if the file is too small, the trailing magic does not match, or
 *         the computed footer start lies outside the file
 * @throws IOException if reading from {@code f} fails
 */
private static final ParquetMetadata readFooter(InputFile file, ParquetReadOptions options, SeekableInputStream f, ParquetMetadataConverter converter) throws IOException {
  final int footerLengthSize = 4;

  long fileLength = file.getLength();
  LOG.debug("File length {}", fileLength);
  // MAGIC + data + footer + footerIndex + MAGIC
  int minimumLength = MAGIC.length + footerLengthSize + MAGIC.length;
  if (fileLength < minimumLength) {
    throw new RuntimeException(file.toString() + " is not a Parquet file (too small length: " + fileLength + ")");
  }

  long footerLengthPosition = fileLength - footerLengthSize - MAGIC.length;
  LOG.debug("reading footer index at {}", footerLengthPosition);
  f.seek(footerLengthPosition);
  int footerLength = readIntLittleEndian(f);

  byte[] tailMagic = new byte[MAGIC.length];
  f.readFully(tailMagic);
  if (!Arrays.equals(MAGIC, tailMagic)) {
    throw new RuntimeException(file.toString() + " is not a Parquet file. expected magic number at tail " + Arrays.toString(MAGIC) + " but found " + Arrays.toString(tailMagic));
  }

  long footerStart = footerLengthPosition - footerLength;
  LOG.debug("read footer length: {}, footer index: {}", footerLength, footerStart);
  boolean footerInsideFile = footerStart >= MAGIC.length && footerStart < footerLengthPosition;
  if (!footerInsideFile) {
    throw new RuntimeException("corrupted file: the footer index is not within the file: " + footerStart);
  }

  // Read all the footer bytes in one time to avoid multiple read operations,
  // since it can be pretty time consuming for a single read operation in HDFS.
  f.seek(footerStart);
  ByteBuffer footerBuffer = ByteBuffer.allocate(footerLength);
  f.readFully(footerBuffer);
  LOG.debug("Finished to read all footer bytes.");
  footerBuffer.flip();

  InputStream footerStream = ByteBufferInputStream.wrap(footerBuffer);
  return converter.readParquetMetadata(footerStream, options.getMetadataFilter());
}

Source File: TestBitPackingColumn.java From parquet-mr with Apache License 2.0

5 votes

/**
 * For every packing type: encodes {@code vals}, checks the encoded bytes against
 * {@code expected}, decodes them back, and then re-reads the page twice more to exercise
 * single-value skipping and n-value skipping.
 *
 * @param bitLength the bit width; the writer/reader bound is {@code 2^bitLength - 1}
 * @param vals the values to round-trip
 * @param expected the expected string form of the encoded bytes
 */
private void validateEncodeDecode(int bitLength, int[] vals, String expected) throws IOException {
  final int bound = (int)Math.pow(2, bitLength) - 1;

  for (PACKING_TYPE type : PACKING_TYPE.values()) {
    LOG.debug("{}", type);

    // Encode.
    ValuesWriter writer = type.getWriter(bound);
    for (int value : vals) {
      writer.writeInteger(value);
    }
    byte[] encoded = writer.getBytes().toByteArray();
    LOG.debug("vals ("+bitLength+"): " + TestBitPacking.toString(vals));
    LOG.debug("bytes: {}", TestBitPacking.toString(encoded));
    assertEquals(type.toString(), expected, TestBitPacking.toString(encoded));

    // Decode everything.
    ValuesReader reader = type.getReader(bound);
    reader.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    int[] decoded = new int[vals.length];
    for (int index = 0; index < decoded.length; index++) {
      decoded[index] = reader.readInteger();
    }
    LOG.debug("result: {}", TestBitPacking.toString(decoded));
    assertArrayEquals(type + " result: " + TestBitPacking.toString(decoded), vals, decoded);

    // Test skipping
    reader.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    int position = 0;
    while (position < vals.length) {
      assertEquals(vals[position], reader.readInteger());
      reader.skip();
      position += 2;
    }

    // Test n-skipping
    reader.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    position = 0;
    while (position < vals.length) {
      int toSkip = (vals.length - position) / 2;
      assertEquals(vals[position], reader.readInteger());
      reader.skip(toSkip);
      position += toSkip + 1;
    }
  }
}

Source File: TestCorruptDeltaByteArrays.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes two pages of ten strings each with one writer (resetting it in between), then
 * reads both pages back, linking the second reader to the first via
 * {@code setPreviousReader}, and checks every decoded string.
 *
 * <p>Assertions pass the expected value first, so a failure reports expected and actual
 * the right way round.
 */
@Test
public void testReassemblyWithoutCorruption() throws Exception {
  DeltaByteArrayWriter writer = getDeltaByteArrayWriter();

  for (int i = 0; i < 10; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer firstPageBytes = writer.getBytes().toByteBuffer();

  writer.reset(); // sets previous to new byte[0]

  for (int i = 10; i < 20; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer secondPageBytes = writer.getBytes().toByteBuffer();

  DeltaByteArrayReader firstPageReader = new DeltaByteArrayReader();
  firstPageReader.initFromPage(10, ByteBufferInputStream.wrap(firstPageBytes));
  for (int i = 0; i < 10; i += 1) {
    assertEquals(str(i), firstPageReader.readBytes().toStringUsingUTF8());
  }

  DeltaByteArrayReader secondPageReader = new DeltaByteArrayReader();
  secondPageReader.initFromPage(10, ByteBufferInputStream.wrap(secondPageBytes));
  secondPageReader.setPreviousReader(firstPageReader);

  for (int i = 10; i < 20; i += 1) {
    assertEquals(str(i), secondPageReader.readBytes().toStringUsingUTF8());
  }
}

Source File: Utils.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Initializes {@code reader} from {@code stream} and reads {@code length} ints from it.
 *
 * @param reader the reader to initialize and read from
 * @param stream the page data
 * @param length the number of values to read; also passed to {@code initFromPage}
 * @return the values in read order
 * @throws IOException if initializing the reader fails
 */
public static int[] readInts(ValuesReader reader, ByteBufferInputStream stream, int length)
    throws IOException {
  int[] decoded = new int[length];
  reader.initFromPage(length, stream);

  int index = 0;
  while (index < length) {
    decoded[index] = reader.readInteger();
    index++;
  }
  return decoded;
}

Source File: Utils.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Initializes {@code reader} from {@code stream} and reads {@code length} binary values.
 *
 * @param reader the reader to initialize and read from
 * @param stream the page data
 * @param length the number of values to read; also passed to {@code initFromPage}
 * @return the values in read order
 * @throws IOException if initializing the reader fails
 */
public static Binary[] readData(ValuesReader reader, ByteBufferInputStream stream, int length)
    throws IOException {
  Binary[] decoded = new Binary[length];
  reader.initFromPage(length, stream);

  int index = 0;
  while (index < length) {
    decoded[index] = reader.readBytes();
    index++;
  }
  return decoded;
}

Source File: TestCorruptDeltaByteArrays.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes two pages of ten strings each with one writer (resetting it in between), then
 * reads each page back with its own reader, without linking the second reader to the
 * first, and checks every decoded string.
 *
 * <p>Assertions pass the expected value first, so a failure reports expected and actual
 * the right way round.
 */
@Test
public void testOldReassemblyWithoutCorruption() throws Exception {
  DeltaByteArrayWriter writer = getDeltaByteArrayWriter();

  for (int i = 0; i < 10; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer firstPageBytes = writer.getBytes().toByteBuffer();

  writer.reset(); // sets previous to new byte[0]

  for (int i = 10; i < 20; i += 1) {
    writer.writeBytes(Binary.fromString(str(i)));
  }
  ByteBuffer secondPageBytes = writer.getBytes().toByteBuffer();

  DeltaByteArrayReader firstPageReader = new DeltaByteArrayReader();
  firstPageReader.initFromPage(10, ByteBufferInputStream.wrap(firstPageBytes));
  for (int i = 0; i < 10; i += 1) {
    assertEquals(str(i), firstPageReader.readBytes().toStringUsingUTF8());
  }

  DeltaByteArrayReader secondPageReader = new DeltaByteArrayReader();
  secondPageReader.initFromPage(10, ByteBufferInputStream.wrap(secondPageBytes));

  for (int i = 10; i < 20; i += 1) {
    assertEquals(str(i), secondPageReader.readBytes().toStringUsingUTF8());
  }
}

Source File: ByteStreamSplitValuesReaderTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Decodes {@code input} with a double reader and asserts that it yields exactly
 * {@code values}, in order.
 */
private void testReader(byte[] input, double[] values) throws IOException {
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(input));
  ByteStreamSplitValuesReaderForDouble reader = new ByteStreamSplitValuesReaderForDouble();
  reader.initFromPage(values.length, stream);

  for (int index = 0; index < values.length; index++) {
    double actual = reader.readDouble();
    assertEquals(values[index], actual, 0.0);
  }
}

Source File: ByteStreamSplitValuesReaderTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Verifies that skipping a negative number of values raises a
 * {@code ParquetDecodingException}.
 */
@Test
public void testSkipUnderflow() throws Exception {
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(new byte[128]));

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(32, stream);

  boolean rejected = false;
  try {
    reader.skip(-1);
  } catch (ParquetDecodingException expected) {
    // This is the outcome under test.
    rejected = true;
  }
  if (!rejected) {
    Assert.fail("Expected an exception.");
  }
}

Source File: ByteStreamSplitValuesReaderTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Verifies that skipping 33 values on a reader initialized with 32 values raises a
 * {@code ParquetDecodingException}.
 */
@Test
public void testSkipOverflow() throws Exception {
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(new byte[128]));

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(32, stream);

  boolean rejected = false;
  try {
    reader.skip(33);
  } catch (ParquetDecodingException expected) {
    // This is the outcome under test.
    rejected = true;
  }
  if (!rejected) {
    Assert.fail("Expected an exception.");
  }
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Exercises {@code skip()} and {@code skip(n)} on binary values, first while the writer
 * reports dictionary encoding and again after enough distinct values have been written for
 * it to report plain encoding.
 */
@Test
public void testSkipInBinaryDictionary() throws Exception {
  ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(1000, 10000);
  writeRepeated(100, writer, "a");
  writeDistinct(100, writer, "b");
  assertEquals(PLAIN_DICTIONARY, writer.getEncoding());

  // Test skip and skip-n with dictionary encoding
  ByteBufferInputStream dictionaryPage = writer.getBytes().toInputStream();
  DictionaryValuesReader dictionaryReader = initDicReader(writer, BINARY);
  dictionaryReader.initFromPage(200, dictionaryPage);

  int position = 0;
  while (position < 100) {
    assertEquals(Binary.fromString("a" + position % 10), dictionaryReader.readBytes());
    dictionaryReader.skip();
    position += 2;
  }

  position = 0;
  while (position < 100) {
    int toSkip = (100 - position) / 2;
    assertEquals(Binary.fromString("b" + position), dictionaryReader.readBytes());
    dictionaryReader.skip(toSkip);
    position += toSkip + 1;
  }

  // Ensure fallback
  writeDistinct(1000, writer, "c");
  assertEquals(PLAIN, writer.getEncoding());

  // Test skip and skip-n with plain encoding (after fallback)
  ValuesReader plainReader = new BinaryPlainValuesReader();
  plainReader.initFromPage(1200, writer.getBytes().toInputStream());
  plainReader.skip(200);

  position = 0;
  while (position < 100) {
    assertEquals("c" + position, plainReader.readBytes().toStringUsingUTF8());
    plainReader.skip();
    position += 2;
  }

  position = 100;
  while (position < 1000) {
    int toSkip = (1000 - position) / 2;
    assertEquals(Binary.fromString("c" + position), plainReader.readBytes());
    plainReader.skip(toSkip);
    position += toSkip + 1;
  }
}

Source File: BenchmarkDeltaLengthByteArray.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Benchmark: writes {@code values} with a {@code PlainValuesWriter}, reads them back with a
 * {@code BinaryPlainValuesReader}, and prints the stream position after reading.
 */
@BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 4)
@Test
public void benchmarkRandomStringsWithPlainValuesWriter() throws IOException {
  PlainValuesWriter plainWriter =
      new PlainValuesWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator());
  BinaryPlainValuesReader plainReader = new BinaryPlainValuesReader();

  Utils.writeData(plainWriter, values);
  ByteBufferInputStream encoded = plainWriter.getBytes().toInputStream();
  Utils.readData(plainReader, encoded, values.length);

  System.out.println("size " + encoded.position());
}

Source File: ByteStreamSplitValuesReaderTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the single float contained in a four-byte page and then verifies that a further
 * read raises a {@code ParquetDecodingException}.
 */
@Test
public void testExtraReads() throws Exception {
  byte[] singleFloat = {(byte) 0x00, (byte) 0x00, (byte) 0x10, (byte) 0x40};
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(singleFloat));

  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(1, stream);
  float onlyValue = reader.readFloat();
  assertEquals(2.25f, onlyValue, 0.0f);

  boolean rejected = false;
  try {
    reader.readFloat();
  } catch (ParquetDecodingException expected) {
    // This is the outcome under test.
    rejected = true;
  }
  if (!rejected) {
    Assert.fail("Expected an exception.");
  }
}

Source File: ByteStreamSplitValuesReaderTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Decodes {@code input} with a float reader and asserts that it yields exactly
 * {@code values}, in order.
 */
private void testReader(byte[] input, float[] values) throws IOException {
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(input));
  ByteStreamSplitValuesReaderForFloat reader = new ByteStreamSplitValuesReaderForFloat();
  reader.initFromPage(values.length, stream);

  for (int index = 0; index < values.length; index++) {
    float actual = reader.readFloat();
    assertEquals(values[index], actual, 0.0f);
  }
}

Source File: BenchmarkDeltaLengthByteArray.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Benchmark: writes {@code values} with a {@code DeltaLengthByteArrayValuesWriter}, reads
 * them back with a {@code DeltaLengthByteArrayValuesReader}, and prints the stream position
 * after reading.
 */
@BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 4)
@Test
public void benchmarkRandomStringsWithDeltaLengthByteArrayValuesWriter() throws IOException {
  DeltaLengthByteArrayValuesWriter deltaWriter =
      new DeltaLengthByteArrayValuesWriter(64 * 1024, 64 * 1024, new DirectByteBufferAllocator());
  DeltaLengthByteArrayValuesReader deltaReader = new DeltaLengthByteArrayValuesReader();

  Utils.writeData(deltaWriter, values);
  ByteBufferInputStream encoded = deltaWriter.getBytes().toInputStream();
  Utils.readData(deltaReader, encoded, values.length);

  System.out.println("size " + encoded.position());
}

org.apache.parquet.bytes.ByteBufferInputStream Java Examples