org.apache.parquet.column.values.ValuesReader Java Examples

Source File: BitPackingPerfTest.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Decodes {@code bytes} into {@code result} ten times with the given reader, prints the
 * accumulated decoding time and throughput to standard output, and verifies the decoded values.
 *
 * @param bytes  encoded page content; wrapped in a fresh input stream for every pass
 * @param result destination array; its length is the number of values read per pass
 * @param r      reader that is re-initialized from {@code bytes} at the start of every pass
 * @return the accumulated decoding time of all passes, in nanoseconds
 * @throws IOException if the reader cannot be initialized from the page bytes
 */
private static long readNTimes(byte[] bytes, int[] result, ValuesReader r)
    throws IOException {
  System.out.println();
  long t = 0;
  int N = 10;
  System.gc();
  System.out.print("                                             " + r.getClass().getSimpleName());
  System.out.print(" no gc <");
  for (int k = 0; k < N; k++) {
    long t2 = System.nanoTime();
    r.initFromPage(result.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    for (int i = 0; i < result.length; i++) {
      result[i] = r.readInteger();
    }
    long t3 = System.nanoTime();
    t += t3 - t2;
  }
  long elapsedMicros = t / 1000;
  // Widen before multiplying so that large arrays cannot overflow the int product.
  long totalValues = (long) N * result.length;
  // A run shorter than one microsecond would otherwise divide by zero.
  long valuesPerMicro = totalValues / Math.max(1L, elapsedMicros);
  System.out.println("> read in " + elapsedMicros + "µs " + valuesPerMicro + " values per µs");
  verify(result);
  return t;
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Runs the float round trip twice on a dictionary writer limited to 50 dictionary bytes,
 * with a simulated page cut (reset of the buffer and of the dictionary) in between.
 */
@Test
public void testFloatDictionaryFallBack() throws IOException {
  final int dictionaryByteLimit = 50;
  final int slab = 100;
  final FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainFloatDictionaryValuesWriter(dictionaryByteLimit, slab);

  // The writer falls back to plain encoding, so its output is decoded with a plain reader.
  final ValuesReader plainReader = new PlainValuesReader.FloatPlainValuesReader();

  roundTripFloat(writer, plainReader, dictionaryByteLimit);

  // Simulate cutting the page: nothing may stay buffered after reset().
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripFloat(writer, plainReader, dictionaryByteLimit);
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Runs the int round trip twice on a dictionary writer limited to 50 dictionary bytes,
 * with a simulated page cut (reset of the buffer and of the dictionary) in between.
 */
@Test
public void testIntDictionaryFallBack() throws IOException {
  final int dictionaryByteLimit = 50;
  final int slab = 100;
  final FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainIntegerDictionaryValuesWriter(dictionaryByteLimit, slab);

  // The writer falls back to plain encoding, so its output is decoded with a plain reader.
  final ValuesReader plainReader = new PlainValuesReader.IntegerPlainValuesReader();

  roundTripInt(writer, plainReader, dictionaryByteLimit);

  // Simulate cutting the page: nothing may stay buffered after reset().
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripInt(writer, plainReader, dictionaryByteLimit);
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Runs the double round trip twice on a dictionary writer limited to 50 dictionary bytes,
 * with a simulated page cut (reset of the buffer and of the dictionary) in between.
 */
@Test
public void testDoubleDictionaryFallBack() throws IOException {
  final int dictionaryByteLimit = 50;
  final int slab = 100;
  final FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainDoubleDictionaryValuesWriter(dictionaryByteLimit, slab);

  // The writer falls back to plain encoding, so its output is decoded with a plain reader.
  final ValuesReader plainReader = new PlainValuesReader.DoublePlainValuesReader();

  roundTripDouble(writer, plainReader, dictionaryByteLimit);

  // Simulate cutting the page: nothing may stay buffered after reset().
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripDouble(writer, plainReader, dictionaryByteLimit);
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Runs the long round trip twice on a dictionary writer limited to 50 dictionary bytes,
 * with a simulated page cut (reset of the buffer and of the dictionary) in between.
 */
@Test
public void testLongDictionaryFallBack() throws IOException {
  final int dictionaryByteLimit = 50;
  final int slab = 100;
  final FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> writer =
      newPlainLongDictionaryValuesWriter(dictionaryByteLimit, slab);

  // The writer falls back to plain encoding, so its output is decoded with a plain reader.
  final ValuesReader plainReader = new PlainValuesReader.LongPlainValuesReader();

  roundTripLong(writer, plainReader, dictionaryByteLimit);

  // Simulate cutting the page: nothing may stay buffered after reset().
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
  writer.resetDictionary();

  roundTripLong(writer, plainReader, dictionaryByteLimit);
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes three pages with one binary dictionary writer and checks their encodings:
 * dictionary for the first page, plain for the second and, because of the earlier
 * fallback, plain for the third as well. Every page is then read back and checked.
 */
@Test
public void testSecondPageFallBack() throws IOException {
  final int valuesPerPage = 1000;
  ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(1000, 10000);

  // Page 1: repeated values are expected to be dictionary encoded.
  writeRepeated(valuesPerPage, writer, "a");
  BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN_DICTIONARY);

  // Page 2: distinct values make the dictionary inefficient, so the writer falls back.
  writeDistinct(valuesPerPage, writer, "b");
  BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN);

  // Page 3: repeated values again, but still plain since the previous page fell back.
  writeRepeated(valuesPerPage, writer, "a");
  BytesInput thirdPage = getBytesAndCheckEncoding(writer, PLAIN);

  ValuesReader dictionaryReader = initDicReader(writer, BINARY);
  checkRepeated(valuesPerPage, firstPage, dictionaryReader, "a");

  ValuesReader plainReader = new BinaryPlainValuesReader();
  checkDistinct(valuesPerPage, secondPage, plainReader, "b");
  checkRepeated(valuesPerPage, thirdPage, plainReader, "a");
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes two pages with one binary dictionary writer and checks that both are plain
 * encoded: the first because distinct values trigger the fallback, the second because
 * the fallback already happened. Both pages are then read back with a plain reader.
 */
@Test
public void testFirstPageFallBack() throws IOException {
  final int valuesPerPage = 1000;
  ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(10000, 10000);

  // Page 1: distinct values make the dictionary inefficient, so the writer falls back.
  writeDistinct(valuesPerPage, writer, "a");
  BytesInput firstPage = getBytesAndCheckEncoding(writer, PLAIN);

  // Page 2: repeated values, yet still plain since the first page fell back.
  writeRepeated(valuesPerPage, writer, "b");
  BytesInput secondPage = getBytesAndCheckEncoding(writer, PLAIN);

  ValuesReader plainReader = new BinaryPlainValuesReader();
  checkDistinct(valuesPerPage, firstPage, plainReader, "a");
  checkRepeated(valuesPerPage, secondPage, plainReader, "b");
}

Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Prepares this reader for a V1 data page: creates the repetition and definition level
 * readers, initializes them and the data reader from the page's single byte stream in
 * that order, and finally signals that the new page is initialized.
 *
 * @param page the V1 data page to read
 * @throws ParquetDecodingException if reading the page bytes fails with an IOException
 */
private void readPageV1(DataPageV1 page) {
  final int valueCount = page.getValueCount();
  final ValuesReader rlValues = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
  final ValuesReader dlValues = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(rlValues);
  this.definitionLevelColumn = new ValuesReaderIntIterator(dlValues);
  try {
    final BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} values", pageBytes.size(), valueCount);

    // All three sections are consumed from the same stream, one after the other.
    final ByteBufferInputStream stream = pageBytes.toInputStream();
    LOG.debug("reading repetition levels at 0");
    rlValues.initFromPage(valueCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    dlValues.initFromPage(valueCount, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, valueCount);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + path, e);
  }
  newPageInitialized(page);
}

Source File: AbstractColumnReader.java From flink with Apache License 2.0

6 votes

/**
 * Prepares this reader for a V1 data page: records the value count, rejects definition
 * level encodings other than RLE when the column has definition levels, and initializes
 * the repetition level reader, the run-length decoder and the data section from the
 * page's byte stream, in that order.
 *
 * @param page the V1 data page to read
 * @throws IOException if the page cannot be read; the original exception is the cause
 * @throws UnsupportedOperationException if definition levels are present but not RLE encoded
 */
private void readPageV1(DataPageV1 page) throws IOException {
	this.pageValueCount = page.getValueCount();
	final ValuesReader repetitionValues = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL);

	// Only RLE definition levels can be decoded; other encodings are tolerated
	// solely when the column has no definition levels at all.
	final boolean rleDefinitionLevels = page.getDlEncoding() == Encoding.RLE;
	if (!rleDefinitionLevels && descriptor.getMaxDefinitionLevel() != 0) {
		throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding());
	}

	final int levelBitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel());
	this.runLenDecoder = new RunLengthDecoder(levelBitWidth);
	try {
		final ByteBufferInputStream stream = page.getBytes().toInputStream();
		repetitionValues.initFromPage(pageValueCount, stream);
		this.runLenDecoder.initFromStream(pageValueCount, stream);
		prepareNewPage(page.getValueEncoding(), stream);
	} catch (IOException e) {
		throw new IOException("could not read page " + page + " in col " + descriptor, e);
	}
}

Source File: PageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Prepares this iterator for a V1 data page: records the triple count, creates the
 * repetition and definition level readers, and initializes them and the data reader
 * from the page's single byte stream, in that order.
 *
 * @param page the V1 data page to read
 * @throws ParquetDecodingException if reading the page bytes fails with an IOException
 */
private void initFromPage(DataPageV1 page) {
  this.triplesCount = page.getValueCount();
  final ValuesReader rlValues = page.getRlEncoding().getValuesReader(desc, REPETITION_LEVEL);
  final ValuesReader dlValues = page.getDlEncoding().getValuesReader(desc, DEFINITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(rlValues);
  this.definitionLevels = new ValuesReaderIntIterator(dlValues);
  try {
    final BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    // All three sections are consumed from the same stream, one after the other.
    final ByteBufferInputStream stream = pageBytes.toInputStream();
    LOG.debug("reading repetition levels at 0");
    rlValues.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    dlValues.initFromPage(triplesCount, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + desc, e);
  }
}

Source File: PageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Makes {@code page} the current page: initializes this iterator from it (V1 or V2),
 * resets the count of triples read, and advances.
 *
 * @param page the data page to read; must not be null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // The visitor is used purely for dispatch on the page version; its result is unused.
  final DataPage.Visitor<ValuesReader> pageInitializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(pageInitializer);

  this.triplesRead = 0;
  advance();
}

Source File: BasePageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Makes {@code page} the current page: initializes this iterator from it (V1 or V2),
 * resets the count of triples read, and recomputes whether any triples remain.
 *
 * @param page the data page to read; must not be null
 */
public void setPage(DataPage page) {
  Preconditions.checkNotNull(page, "Cannot read from null page");
  this.page = page;

  // The visitor is used purely for dispatch on the page version; its result is unused.
  final DataPage.Visitor<ValuesReader> pageInitializer = new DataPage.Visitor<ValuesReader>() {
    @Override
    public ValuesReader visit(DataPageV1 v1Page) {
      initFromPage(v1Page);
      return null;
    }

    @Override
    public ValuesReader visit(DataPageV2 v2Page) {
      initFromPage(v2Page);
      return null;
    }
  };
  this.page.accept(pageInitializer);

  this.triplesRead = 0;
  this.hasNext = triplesRead < triplesCount;
}

Source File: PrimitiveColumnReader.java From presto with Apache License 2.0

6 votes

/**
 * Prepares the level readers for a V1 data page and returns the reader for its values.
 * Repetition levels, definition levels and values are initialized from the page's
 * single byte stream, in that order.
 *
 * @param page the V1 data page to read
 * @return the initialized reader for the page's values
 * @throws ParquetDecodingException if reading the page fails with an IOException
 */
private ValuesReader readPageV1(DataPageV1 page)
{
    final ValuesReader rlValues = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
    final ValuesReader dlValues = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
    repetitionReader = new LevelValuesReader(rlValues);
    definitionReader = new LevelValuesReader(dlValues);
    try {
        final ByteBufferInputStream pageStream = toInputStream(page.getSlice());
        final int valueCount = page.getValueCount();
        rlValues.initFromPage(valueCount, pageStream);
        dlValues.initFromPage(valueCount, pageStream);
        return initDataReader(page.getValueEncoding(), valueCount, pageStream);
    }
    catch (IOException e) {
        throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e);
    }
}

Source File: TestDeltaByteArray.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes {@code values} with a delta byte array writer and reads two consecutive integer
 * sequences back from the same stream, checking the first three prefix lengths and then
 * the first three suffix lengths.
 */
@Test
public void testLengths() throws IOException {
  final int size = 64 * 1024;
  DeltaByteArrayWriter writer = new DeltaByteArrayWriter(size, size, new DirectByteBufferAllocator());
  ValuesReader prefixReader = new DeltaBinaryPackingValuesReader();

  Utils.writeData(writer, values);
  ByteBufferInputStream data = writer.getBytes().toInputStream();

  // First integer sequence in the stream: prefix lengths.
  int[] prefixLengths = Utils.readInts(prefixReader, data, values.length);
  Assert.assertEquals(0, prefixLengths[0]);
  Assert.assertEquals(7, prefixLengths[1]);
  Assert.assertEquals(7, prefixLengths[2]);

  // Second integer sequence, read from the same stream with a fresh reader: suffix lengths.
  ValuesReader suffixReader = new DeltaBinaryPackingValuesReader();
  int[] suffixLengths = Utils.readInts(suffixReader, data, values.length);
  Assert.assertEquals(10, suffixLengths[0]);
  Assert.assertEquals(0, suffixLengths[1]);
  Assert.assertEquals(7, suffixLengths[2]);
}

Source File: BasePageIterator.java From iceberg with Apache License 2.0

6 votes

/**
 * Prepares this iterator for a V1 data page: records the triple count and initializes
 * the repetition level reader, the definition level reader and the data reader from
 * the page's single byte stream, in that order.
 *
 * @param initPage the V1 data page to read
 * @throws ParquetDecodingException if reading the page bytes fails with an IOException
 */
protected void initFromPage(DataPageV1 initPage) {
  this.triplesCount = initPage.getValueCount();
  final ValuesReader rlValues = initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(rlValues);
  try {
    final BytesInput pageBytes = initPage.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    // All three sections are consumed from the same stream, one after the other.
    final ByteBufferInputStream stream = pageBytes.toInputStream();
    LOG.debug("reading repetition levels at 0");
    rlValues.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    initDefinitionLevelsReader(initPage, desc, stream, triplesCount);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(initPage.getValueEncoding(), stream, initPage.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + initPage + " in col " + desc, e);
  }
}

Source File: PrimitiveColumnReader.java From presto with Apache License 2.0

6 votes

/**
 * Creates the values reader matching {@code dataEncoding} and initializes it from the stream.
 *
 * @param dataEncoding encoding of the page's values
 * @param valueCount   number of values in the page
 * @param in           stream to initialize the reader from
 * @return the initialized values reader
 * @throws ParquetDecodingException if a dictionary encoding is used without a dictionary,
 *         or if initializing the reader fails with an IOException
 */
private ValuesReader initDataReader(ParquetEncoding dataEncoding, int valueCount, ByteBufferInputStream in)
{
    final boolean dictionaryEncoded = dataEncoding.usesDictionary();
    if (dictionaryEncoded && dictionary == null) {
        throw new ParquetDecodingException("Dictionary is missing for Page");
    }

    final ValuesReader pageValues = dictionaryEncoded
            ? dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary)
            : dataEncoding.getValuesReader(columnDescriptor, VALUES);

    try {
        pageValues.initFromPage(valueCount, in);
    }
    catch (IOException e) {
        throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e);
    }
    return pageValues;
}

Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Switches this reader to a new page's data section: records the encoding and value
 * counts, creates the matching values reader, binds the converter, initializes the
 * reader from the stream and, where sequential reads are required, hands the previous
 * page's reader to the new one.
 *
 * @param dataEncoding encoding of the page's values
 * @param in           stream to initialize the values reader from
 * @param valueCount   number of values in the page
 * @throws ParquetDecodingException if a dictionary encoding is used without a dictionary,
 *         or if initializing the reader fails with an IOException
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Keep a handle on the outgoing reader before dataColumn is replaced below.
  final ValuesReader outgoingReader = this.dataColumn;

  this.currentEncoding = dataEncoding;
  this.pageValueCount = valueCount;
  this.endOfPageValueCount = readValues + pageValueCount;

  final boolean dictionaryEncoded = dataEncoding.usesDictionary();
  if (!dictionaryEncoded) {
    this.dataColumn = dataEncoding.getValuesReader(path, VALUES);
  } else if (dictionary != null) {
    this.dataColumn = dataEncoding.getDictionaryBasedValuesReader(path, VALUES, dictionary);
  } else {
    throw new ParquetDecodingException(
        "could not read page in col " + path + " as the dictionary was missing for encoding " + dataEncoding);
  }

  if (dictionaryEncoded && converter.hasDictionarySupport()) {
    bindToDictionary(dictionary);
  } else {
    bind(path.getType());
  }

  try {
    dataColumn.initFromPage(pageValueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page in col " + path, e);
  }

  // instanceof is already false for null, so no separate null check is needed.
  final boolean chainReaders = CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && outgoingReader instanceof RequiresPreviousReader;
  if (chainReaders) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) dataColumn).setPreviousReader(outgoingReader);
  }
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the ints 0..99 through {@code cw}, checking the reported encoding after every
 * write, then reads the page back three times with {@code reader}: sequentially, with
 * single-value skips, and with variable-length skips.
 *
 * @param cw                    writer under test
 * @param reader                reader used to decode the written page
 * @param maxDictionaryByteSize dictionary size limit the writer was created with
 * @throws IOException if the reader cannot be initialized from the written bytes
 */
private void roundTripInt(FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  // Number of values after which the encoding is expected to be PLAIN
  // (presumably 4 bytes per int dictionary entry -- TODO confirm).
  int fallBackThreshold = maxDictionaryByteSize / 4;
  for (int i = 0; i < 100; i++) {
    cw.writeInteger(i);
    // Expected value goes first so that a failure message reports the right roles.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (int i = 0; i < 100; i++) {
    assertEquals(i, reader.readInteger());
  }

  // Test skip with plain encoding: read every even value, skip every odd one
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readInteger());
    reader.skip();
  }

  // Test skip-n with plain encoding: after each read, skip half of the distance to the end
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readInteger());
    reader.skip(skipCount);
  }
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the doubles 0..99 through {@code cw}, checking the reported encoding after every
 * write, then reads the page back three times with {@code reader}: sequentially, with
 * single-value skips, and with variable-length skips.
 *
 * @param cw                    writer under test
 * @param reader                reader used to decode the written page
 * @param maxDictionaryByteSize dictionary size limit the writer was created with
 * @throws IOException if the reader cannot be initialized from the written bytes
 */
private void roundTripDouble(FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  // Number of values after which the encoding is expected to be PLAIN
  // (presumably 8 bytes per double dictionary entry -- TODO confirm).
  int fallBackThreshold = maxDictionaryByteSize / 8;
  for (double i = 0; i < 100; i++) {
    cw.writeDouble(i);
    // Expected value goes first so that a failure message reports the right roles.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (double i = 0; i < 100; i++) {
    assertEquals(i, reader.readDouble(), 0.00001);
  }

  // Test skip with plain encoding: read every even value, skip every odd one
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readDouble(), 0.0);
    reader.skip();
  }

  // Test skip-n with plain encoding: after each read, skip half of the distance to the end
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readDouble(), 0.0);
    reader.skip(skipCount);
  }
}

Source File: TestBitPackingColumn.java From parquet-mr with Apache License 2.0

5 votes

/**
 * For every packing type: encodes {@code vals}, compares the encoded bytes (rendered as a
 * string) with {@code expected}, decodes them again and compares with {@code vals}, and
 * finally re-reads the page with single-value skips and with variable-length skips.
 *
 * @param bitLength bit width used to derive the bound passed to the writer and reader
 * @param vals      values to encode
 * @param expected  expected string rendering of the encoded bytes
 * @throws IOException if a reader cannot be initialized from the encoded bytes
 */
private void validateEncodeDecode(int bitLength, int[] vals, String expected) throws IOException {
  final int maxValue = (int)Math.pow(2, bitLength) - 1;
  for (PACKING_TYPE packing : PACKING_TYPE.values()) {
    LOG.debug("{}", packing);

    // Encode.
    ValuesWriter writer = packing.getWriter(maxValue);
    for (int idx = 0; idx < vals.length; idx++) {
      writer.writeInteger(vals[idx]);
    }
    byte[] encoded = writer.getBytes().toByteArray();
    LOG.debug("vals ("+bitLength+"): " + TestBitPacking.toString(vals));
    LOG.debug("bytes: {}", TestBitPacking.toString(encoded));
    assertEquals(packing.toString(), expected, TestBitPacking.toString(encoded));

    // Decode everything and compare with the input.
    ValuesReader reader = packing.getReader(maxValue);
    reader.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    int[] decoded = new int[vals.length];
    for (int idx = 0; idx < decoded.length; idx++) {
      decoded[idx] = reader.readInteger();
    }
    LOG.debug("result: {}", TestBitPacking.toString(decoded));
    assertArrayEquals(packing + " result: " + TestBitPacking.toString(decoded), vals, decoded);

    // Skipping: read every even position, skip every odd one.
    reader.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    for (int idx = 0; idx < vals.length; idx += 2) {
      assertEquals(vals[idx], reader.readInteger());
      reader.skip();
    }

    // n-skipping: after each read, skip half of the distance to the end.
    reader.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(encoded)));
    int position = 0;
    while (position < vals.length) {
      final int toSkip = (vals.length - position) / 2;
      assertEquals(vals[position], reader.readInteger());
      reader.skip(toSkip);
      position += toSkip + 1;
    }
  }
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the longs 0..99 through {@code cw}, checking the reported encoding after every
 * write, then reads the page back three times with {@code reader}: sequentially, with
 * single-value skips, and with variable-length skips.
 *
 * @param cw                    writer under test
 * @param reader                reader used to decode the written page
 * @param maxDictionaryByteSize dictionary size limit the writer was created with
 * @throws IOException if the reader cannot be initialized from the written bytes
 */
private void roundTripLong(FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  // Number of values after which the encoding is expected to be PLAIN
  // (presumably 8 bytes per long dictionary entry -- TODO confirm).
  int fallBackThreshold = maxDictionaryByteSize / 8;
  for (long i = 0; i < 100; i++) {
    cw.writeLong(i);
    // Expected value goes first so that a failure message reports the right roles.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (long i = 0; i < 100; i++) {
    assertEquals(i, reader.readLong());
  }

  // Test skip with plain encoding: read every even value, skip every odd one
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readLong());
    reader.skip();
  }

  // Test skip-n with plain encoding: after each read, skip half of the distance to the end
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readLong());
    reader.skip(skipCount);
  }
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the strings "str0".."str99" through a binary dictionary writer limited to
 * 50 dictionary bytes, checking the reported encoding after every write, reads them
 * back with a plain reader, and finally checks that reset() empties the buffer.
 */
@Test
public void testBinaryDictionaryFallBack() throws IOException {
  final int dictionaryByteLimit = 50;
  final int slab = 100;
  final ValuesWriter writer = newPlainBinaryDictionaryValuesWriter(dictionaryByteLimit, slab);

  // Running size estimate: each value counts its own length plus 4 bytes.
  int writtenBytes = 0;
  for (long n = 0; n < 100; n++) {
    final Binary value = Binary.fromString("str" + n);
    writer.writeBytes(value);
    writtenBytes += (value.length() + 4);
    if (writtenBytes >= dictionaryByteLimit) {
      assertEquals(PLAIN, writer.getEncoding());
    } else {
      assertEquals(PLAIN_DICTIONARY, writer.getEncoding());
    }
  }

  // The writer fell back to plain encoding, so its output is decoded with a plain reader.
  final ValuesReader plainReader = new BinaryPlainValuesReader();
  plainReader.initFromPage(100, writer.getBytes().toInputStream());
  for (long n = 0; n < 100; n++) {
    assertEquals(Binary.fromString("str" + n), plainReader.readBytes());
  }

  // Simulate cutting the page: nothing may stay buffered after reset().
  writer.reset();
  assertEquals(0, writer.getBufferedSize());
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Exercises skip() and skip(n) on binary values, first with a dictionary reader while
 * the writer still reports PLAIN_DICTIONARY, then with a plain reader after enough
 * distinct values have been written for the writer to report PLAIN.
 */
@Test
public void testSkipInBinaryDictionary() throws Exception {
  ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(1000, 10000);
  writeRepeated(100, cw, "a");
  writeDistinct(100, cw, "b");
  assertEquals(PLAIN_DICTIONARY, cw.getEncoding());

  // Test skip and skip-n with dictionary encoding
  ByteBufferInputStream stream = cw.getBytes().toInputStream();
  DictionaryValuesReader cr = initDicReader(cw, BINARY);
  cr.initFromPage(200, stream);
  // First 100 values: read every even position (expected "a" + i % 10), skip every odd one.
  for (int i = 0; i < 100; i += 2) {
    assertEquals(Binary.fromString("a" + i % 10), cr.readBytes());
    cr.skip();
  }
  // Declared once here and reused by the plain-encoding skip-n loop further down.
  int skipCount;
  // Next 100 values: after each read (expected "b" + i), skip half of the distance to the end.
  for (int i = 0; i < 100; i += skipCount + 1) {
    skipCount = (100 - i) / 2;
    assertEquals(Binary.fromString("b" + i), cr.readBytes());
    cr.skip(skipCount);
  }

  // Ensure fallback
  writeDistinct(1000, cw, "c");
  assertEquals(PLAIN, cw.getEncoding());

  // Test skip and skip-n with plain encoding (after fallback)
  ValuesReader plainReader = new BinaryPlainValuesReader();
  // The writer was not reset, so the page is initialized with all 1200 values written so far.
  plainReader.initFromPage(1200, cw.getBytes().toInputStream());
  // Jump over the 200 values written before the "c" values.
  plainReader.skip(200);
  // "c" values 0..99: read every even position, skip every odd one.
  for (int i = 0; i < 100; i += 2) {
    assertEquals("c" + i, plainReader.readBytes().toStringUsingUTF8());
    plainReader.skip();
  }
  // "c" values 100..999: after each read, skip half of the distance to the end.
  for (int i = 100; i < 1000; i += skipCount + 1) {
    skipCount = (1000 - i) / 2;
    assertEquals(Binary.fromString("c" + i), plainReader.readBytes());
    plainReader.skip(skipCount);
  }
}

Source File: TestDictionary.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes the floats 0..99 through {@code cw}, checking the reported encoding after every
 * write, then reads the page back three times with {@code reader}: sequentially, with
 * single-value skips, and with variable-length skips.
 *
 * @param cw                    writer under test
 * @param reader                reader used to decode the written page
 * @param maxDictionaryByteSize dictionary size limit the writer was created with
 * @throws IOException if the reader cannot be initialized from the written bytes
 */
private void roundTripFloat(FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> cw,  ValuesReader reader, int maxDictionaryByteSize) throws IOException {
  // Number of values after which the encoding is expected to be PLAIN
  // (presumably 4 bytes per float dictionary entry -- TODO confirm).
  int fallBackThreshold = maxDictionaryByteSize / 4;
  for (float i = 0; i < 100; i++) {
    cw.writeFloat(i);
    // Expected value goes first so that a failure message reports the right roles.
    if (i < fallBackThreshold) {
      assertEquals(PLAIN_DICTIONARY, cw.getEncoding());
    } else {
      assertEquals(PLAIN, cw.getEncoding());
    }
  }

  reader.initFromPage(100, cw.getBytes().toInputStream());

  for (float i = 0; i < 100; i++) {
    assertEquals(i, reader.readFloat(), 0.00001);
  }

  // Test skip with plain encoding: read every even value, skip every odd one
  reader.initFromPage(100, cw.getBytes().toInputStream());
  for (int i = 0; i < 100; i += 2) {
    assertEquals(i, reader.readFloat(), 0.0f);
    reader.skip();
  }

  // Test skip-n with plain encoding: after each read, skip half of the distance to the end
  reader.initFromPage(100, cw.getBytes().toInputStream());
  int skipCount;
  for (int i = 0; i < 100; i += skipCount + 1) {
    skipCount = (100 - i) / 2;
    assertEquals(i, reader.readFloat(), 0.0f);
    reader.skip(skipCount);
  }
}

Source File: TestDeltaLengthByteArray.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes {@code values} with a delta length byte array writer, reads the leading integer
 * sequence back, and checks that each integer equals the length of the matching value.
 */
@Test
public void testLengths() throws IOException {
  DeltaLengthByteArrayValuesWriter writer = getDeltaLengthByteArrayValuesWriter();
  ValuesReader lengthReader = new DeltaBinaryPackingValuesReader();

  Utils.writeData(writer, values);
  int[] lengths = Utils.readInts(lengthReader, writer.getBytes().toInputStream(), values.length);

  int index = 0;
  while (index < lengths.length) {
    Assert.assertEquals(values[index].length(), lengths[index]);
    index++;
  }
}

Source File: VectorizedPageIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Sets up the values reader for a new page. Dictionary-encoded pages get a vectorized
 * dictionary reader and an EAGER or LAZY decode mode; all other pages get a plain
 * bytes reader and decode mode NONE.
 *
 * @param dataEncoding encoding of the page's values
 * @param in           stream to initialize the reader from
 * @param valueCount   number of values in the page
 * @throws ParquetDecodingException if a dictionary encoding is used without a dictionary,
 *         or if initializing the dictionary reader fails with an IOException
 */
@Override
protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Keep a handle on the outgoing plain reader before it may be replaced below.
  final ValuesReader outgoingReader = plainValuesReader;

  if (!dataEncoding.usesDictionary()) {
    plainValuesReader = new ValuesAsBytesReader();
    plainValuesReader.initFromPage(valueCount, in);
    dictionaryDecodeMode = DictionaryDecodeMode.NONE;
  } else {
    if (dictionary == null) {
      throw new ParquetDecodingException(
          "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
    }
    try {
      dictionaryEncodedValuesReader =
          new VectorizedDictionaryEncodedParquetValuesReader(desc.getMaxDefinitionLevel(), setArrowValidityVector);
      dictionaryEncodedValuesReader.initFromPage(valueCount, in);
      // Int types, and columns where not every page is dictionary encoded, are decoded eagerly.
      dictionaryDecodeMode = (ParquetUtil.isIntType(desc.getPrimitiveType()) || !allPagesDictEncoded)
          ? DictionaryDecodeMode.EAGER
          : DictionaryDecodeMode.LAZY;
    } catch (IOException e) {
      throw new ParquetDecodingException("could not read page in col " + desc, e);
    }
  }

  // instanceof is already false for null, so no separate null check is needed.
  final boolean chainReaders = CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && outgoingReader instanceof RequiresPreviousReader;
  if (chainReaders) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) plainValuesReader).setPreviousReader(outgoingReader);
  }
}

Source File: PageIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Switches this iterator to a new page's data section: records the encoding, creates the
 * matching values reader, initializes it from the stream and, where sequential reads are
 * required, hands the previous page's reader to the new one.
 *
 * @param dataEncoding encoding of the page's values
 * @param in           stream to initialize the values reader from
 * @param valueCount   number of values in the page
 * @throws ParquetDecodingException if a dictionary encoding is used without a dictionary,
 *         or if initializing the reader fails with an IOException
 */
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Keep a handle on the outgoing reader before values is replaced below.
  final ValuesReader outgoingReader = values;

  this.valueEncoding = dataEncoding;

  // TODO: May want to change this so that this class is not dictionary-aware.
  // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries
  // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader
  if (!dataEncoding.usesDictionary()) {
    this.values = dataEncoding.getValuesReader(desc, VALUES);
  } else if (dict != null) {
    this.values = dataEncoding.getDictionaryBasedValuesReader(desc, VALUES, dict);
  } else {
    throw new ParquetDecodingException(
        "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
  }

//    if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) {
//      bindToDictionary(dictionary);
//    } else {
//      bind(path.getType());
//    }

  try {
    values.initFromPage(valueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page in col " + desc, e);
  }

  // instanceof is already false for null, so no separate null check is needed.
  final boolean chainReaders = CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && outgoingReader instanceof RequiresPreviousReader;
  if (chainReaders) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) values).setPreviousReader(outgoingReader);
  }
}

Source File: PageIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Creates the definition level reader for a V1 data page, exposes it through
 * {@code definitionLevels}, and initializes it from the stream.
 *
 * @param dataPageV1   page whose definition level encoding selects the reader
 * @param desc         column descriptor used to create the reader
 * @param in           stream to initialize the reader from
 * @param triplesCount number of values in the page
 * @throws IOException if initializing the reader from the stream fails
 */
@Override
protected void initDefinitionLevelsReader(DataPageV1 dataPageV1, ColumnDescriptor desc, ByteBufferInputStream in,
                                          int triplesCount) throws IOException {
  final ValuesReader dlValues =
      dataPageV1.getDlEncoding().getValuesReader(desc, ValuesType.DEFINITION_LEVEL);
  this.definitionLevels = new ValuesReaderIntIterator(dlValues);
  dlValues.initFromPage(triplesCount, in);
}

Source File: PageIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Switches this iterator to a new page's data section: records the encoding, creates the
 * matching values reader, initializes it from the stream and, where sequential reads are
 * required, hands the previous page's reader to the new one.
 *
 * @param dataEncoding encoding of the page's values
 * @param in           stream to initialize the values reader from
 * @param valueCount   number of values in the page
 * @throws ParquetDecodingException if a dictionary encoding is used without a dictionary,
 *         or if initializing the reader fails with an IOException
 */
@Override
protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) {
  // Keep a handle on the outgoing reader before values is replaced below.
  final ValuesReader outgoingReader = values;

  this.valueEncoding = dataEncoding;

  // TODO: May want to change this so that this class is not dictionary-aware.
  // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries
  // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader
  if (!dataEncoding.usesDictionary()) {
    this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES);
  } else if (dictionary != null) {
    this.values = dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary);
  } else {
    throw new ParquetDecodingException(
        "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding);
  }

//    if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) {
//      bindToDictionary(dictionary);
//    } else {
//      bind(path.getType());
//    }

  try {
    values.initFromPage(valueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page in col " + desc, e);
  }

  final boolean chainReaders = CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding)
      && outgoingReader instanceof RequiresPreviousReader;
  if (chainReaders) {
    // previous reader can only be set if reading sequentially
    ((RequiresPreviousReader) values).setPreviousReader(outgoingReader);
  }
}

Source File: BenchmarkReadingRandomIntegers.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Benchmark body: reads {@code rleBytes} ten times, each time with a fresh run-length /
 * bit-packing hybrid reader constructed with the argument 32.
 */
@BenchmarkOptions(benchmarkRounds = 20, warmupRounds = 10)
@Test
public void readingRLE() throws IOException {
  int remainingPasses = 10;
  while (remainingPasses > 0) {
    ValuesReader hybridReader = new RunLengthBitPackingHybridValuesReader(32);
    readData(hybridReader, rleBytes);
    remainingPasses--;
  }
}

org.apache.parquet.column.values.ValuesReader Java Examples