org.apache.parquet.column.page.PageReader Java Examples

Source File: TestMemPageStore.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes four zero-filled pages of 209 values each for a single INT64 column
 * into a {@link MemPageStore}, then reads pages back until the number of values
 * seen reaches the total reported by the page reader. The results are only
 * printed; nothing is asserted yet.
 */
@Test
public void test() throws IOException {
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnDescriptor col = new ColumnDescriptor(path , PrimitiveTypeName.INT64, 2, 2);
  LongStatistics stats = new LongStatistics();
  PageWriter pageWriter = memPageStore.getPageWriter(col);
  // Page payload sizes in bytes, in write order.
  for (int payloadSize : new int[] {735, 743, 743, 735}) {
    pageWriter.writePage(BytesInput.from(new byte[payloadSize]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN);
  }
  PageReader pageReader = memPageStore.getPageReader(col);
  long expectedValues = pageReader.getTotalValueCount();
  System.out.println(expectedValues);
  int valuesSeen = 0;
  do {
    DataPage page = pageReader.readPage();
    valuesSeen += page.getValueCount();
    System.out.println(page);
    // TODO: assert
  } while (valuesSeen < expectedValues);
}

Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Creates a reader for triplets.
 *
 * <p>Reads the dictionary page (if any) from {@code pageReader}, decodes it and, when the
 * converter reports dictionary support, hands the dictionary to the converter. Finally the
 * total value count is fetched and rejected if it is not strictly positive.
 *
 * @param path the descriptor for the corresponding column; must not be null
 * @param pageReader the underlying store to read from; must not be null
 * @param converter a converter that materializes the values in this column in the current
 *     record; must not be null
 * @param writerVersion writer version string from the Parquet file being read
 * @throws ParquetDecodingException if the dictionary cannot be decoded or the total value
 *     count is {@code <= 0}
 */
ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null");
  this.converter = Objects.requireNonNull(converter, "converter cannot be null");
  this.writerVersion = writerVersion;
  this.maxDefinitionLevel = path.getMaxDefinitionLevel();

  DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    // No dictionary page was returned for this column.
    this.dictionary = null;
  } else {
    try {
      this.dictionary = dictPage.getEncoding().initDictionary(path, dictPage);
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  }

  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
  }
}

Source File: VectorizedColumnIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Records whether all pages are dictionary encoded, installs {@code store} as the page
 * source via the superclass, and returns the {@code dictionary} field afterwards.
 *
 * @param store the page reader to use as page source
 * @param allPagesDictEncoded whether every page in the row group is dictionary encoded
 * @return the current value of the {@code dictionary} field
 */
public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) {
  // The flag has to be set first: setPageSource can already trigger a data page read,
  // and at that point the page iterator must know whether all pages in the row group
  // are dictionary encoded.
  vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded);
  super.setPageSource(store);
  return this.dictionary;
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

5 votes

/**
 * For every column of {@code schema}, reads the dictionary page and then each data page
 * from {@code store} and passes them to {@code validateStatsForPage}.
 *
 * @param schema the schema whose columns are visited
 * @param store the page store providing a page reader per column
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader columnPages = store.getPageReader(column);
    DictionaryPage dictionaryPage = columnPages.readDictionaryPage();
    // readPage() returning null ends the iteration over this column's pages.
    for (DataPage current = columnPages.readPage(); current != null; current = columnPages.readPage()) {
      validateStatsForPage(current, dictionaryPage, column);
    }
  }
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Collects a copy (made with {@code reusableCopy}) of every data page the store returns
 * for the given column, in read order.
 *
 * @param pageReadStore the store to read pages from
 * @param columnDescriptor the column whose pages are collected
 * @return the copied pages; empty if the reader yields no page
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  List<DataPage> copies = new ArrayList<>();
  PageReader reader = pageReadStore.getPageReader(columnDescriptor);
  // A null page marks the end of the column's pages.
  for (DataPage next = reader.readPage(); next != null; next = reader.readPage()) {
    copies.add(reusableCopy(next));
  }
  return copies;
}

Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Reads the next page of the column at {@code path}, casts it to {@link DataPageV2} and
 * asserts its counts, uncompressed size, level bytes and data bytes against the expected
 * values.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages store providing the page reader
 * @param path column path
 * @param values expected value count
 * @param rows expected row count
 * @param nullCount expected null count
 * @param repetition expected repetition level bytes
 * @param definition expected definition level bytes
 * @param data expected data bytes
 * @param uncompressedSize expected uncompressed size
 * @throws IOException declared for the byte conversions of the page contents
 */
private void validateV2Page(MessageType schema, PageReadStore pages, String[] path, int values, int rows, int nullCount,
                            byte[] repetition, byte[] definition, byte[] data, int uncompressedSize) throws IOException {
  ColumnDescriptor column = schema.getColumnDescription(path);
  DataPageV2 v2Page = (DataPageV2) pages.getPageReader(column).readPage();

  // Scalar header fields.
  assertEquals(values, v2Page.getValueCount());
  assertEquals(rows, v2Page.getRowCount());
  assertEquals(nullCount, v2Page.getNullCount());
  assertEquals(uncompressedSize, v2Page.getUncompressedSize());

  // Byte payloads.
  assertArrayEquals(repetition, v2Page.getRepetitionLevels().toByteArray());
  assertArrayEquals(definition, v2Page.getDefinitionLevels().toByteArray());
  assertArrayEquals(data, v2Page.getData().toByteArray());
}

Source File: ColumnChunkPageReadStore.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Looks up the page reader registered for {@code path}.
 *
 * @param path the column to look up
 * @return the reader stored for that column
 * @throws IllegalArgumentException if no reader is stored for {@code path}
 */
@Override
public PageReader getPageReader(ColumnDescriptor path) {
  final PageReader found = readers.get(path);
  if (found != null) {
    return found;
  }
  // The message lists the known columns and the row count to ease debugging.
  throw new IllegalArgumentException(path + " is not in the store: " + readers.keySet() + " " + rowCount);
}

Source File: MemPageStore.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a {@link MemPageReader} over a copy of the page list held by the writer
 * registered for {@code descriptor}.
 *
 * @param descriptor the column to read
 * @return a new reader over the copied pages, carrying the writer's total value count and
 *     dictionary page
 * @throws UnknownColumnException if no writer is registered for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }
  // Copy the page list so the reader iterates over its own list instance.
  List<DataPage> pageSnapshot = new ArrayList<>(writer.getPages());
  LOG.debug("initialize page reader with {} values and {} pages", writer.getTotalValueCount(), pageSnapshot.size());
  return new MemPageReader(writer.getTotalValueCount(), pageSnapshot.iterator(), writer.getDictionaryPage());
}

Source File: SynchronizingColumnReader.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Delegates to the superclass constructor, stores the row index iterator, initializes
 * {@code targetRow} to {@link Long#MIN_VALUE} and then calls {@code consume()}.
 *
 * @param path the descriptor for the corresponding column
 * @param pageReader the underlying store to read from
 * @param converter the converter passed on to the superclass
 * @param writerVersion the writer version passed on to the superclass
 * @param rowIndexes iterator of row indexes kept by this reader
 */
SynchronizingColumnReader(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter,
    ParsedVersion writerVersion, PrimitiveIterator.OfLong rowIndexes) {
  super(path, pageReader, converter, writerVersion);
  this.rowIndexes = rowIndexes;
  this.targetRow = Long.MIN_VALUE;
  consume();
}

Source File: ColumnReadStoreImpl.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Creates a column reader for {@code path}. When the page store supplies row indexes a
 * {@link SynchronizingColumnReader} is returned, otherwise a plain
 * {@link ColumnReaderImpl}.
 *
 * @param path the column to read
 * @return a new column reader for that column
 */
@Override
public ColumnReader getColumnReader(ColumnDescriptor path) {
  PrimitiveConverter converter = getPrimitiveConverter(path);
  PageReader pageReader = pageReadStore.getPageReader(path);
  Optional<PrimitiveIterator.OfLong> rowIndexes = pageReadStore.getRowIndexes();
  if (!rowIndexes.isPresent()) {
    return new ColumnReaderImpl(path, pageReader, converter, writerVersion);
  }
  return new SynchronizingColumnReader(path, pageReader, converter, writerVersion, rowIndexes.get());
}

Source File: TimestampColumnReader.java From flink with Apache License 2.0

5 votes

/**
 * Creates a timestamp column reader and passes INT96 to {@code checkTypeName}
 * (presumably validating the column's physical type; confirm against the superclass).
 *
 * @param utcTimestamp flag stored in the {@code utcTimestamp} field
 * @param descriptor the column descriptor passed to the superclass
 * @param pageReader the page reader passed to the superclass
 * @throws IOException declared by this constructor; the superclass constructor is the likely source
 */
public TimestampColumnReader(boolean utcTimestamp, ColumnDescriptor descriptor, PageReader pageReader)
		throws IOException {
	super(descriptor, pageReader);
	this.utcTimestamp = utcTimestamp;
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT96);
}

Source File: FixedLenBytesColumnReader.java From flink with Apache License 2.0

5 votes

/**
 * Creates a fixed-length byte array column reader, passes FIXED_LEN_BYTE_ARRAY to
 * {@code checkTypeName} (presumably a physical type check; confirm against the
 * superclass) and stores the precision.
 *
 * @param descriptor the column descriptor passed to the superclass
 * @param pageReader the page reader passed to the superclass
 * @param precision value stored in the {@code precision} field
 * @throws IOException declared by this constructor; the superclass constructor is the likely source
 */
public FixedLenBytesColumnReader(ColumnDescriptor descriptor, PageReader pageReader, int precision)
		throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
	this.precision = precision;
}

Source File: BaseColumnIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Switches this iterator to a new page source: stores the source and its total value
 * count, zeroes the read counters, resets the page iterator, reads the dictionary from
 * the source and hands it to the page iterator, then calls {@code advance()}.
 *
 * @param source the page reader to read from
 */
public void setPageSource(PageReader source) {
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  BasePageIterator iterator = pageIterator();
  iterator.reset();

  this.dictionary = ParquetUtil.readDictionary(desc, pageSource);
  iterator.setDictionary(this.dictionary);

  advance();
}

Source File: ParquetUtil.java From iceberg with Apache License 2.0

5 votes

/**
 * Reads the dictionary page from {@code pageSource} and decodes it for {@code desc}.
 *
 * @param desc the column descriptor used to initialize the dictionary
 * @param pageSource the page reader supplying the dictionary page
 * @return the decoded dictionary, or {@code null} when the reader returns no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an I/O error
 */
public static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}

Source File: ColumnIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Switches this iterator to a new page source: stores the source and its total value
 * count, zeroes the read counters, resets the page iterator, gives it the dictionary
 * read from the source, then calls {@code advance()}.
 *
 * @param source the page reader to read from
 */
public void setPageSource(PageReader source) {
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  pageIterator.reset();
  // Read after the reset, exactly as before; may be null when there is no dictionary page.
  Dictionary dict = readDictionary(desc, pageSource);
  pageIterator.setDictionary(dict);

  advance();
}

Source File: ColumnIterator.java From iceberg with Apache License 2.0

5 votes

/**
 * Reads the dictionary page from {@code pageSource} and decodes it for {@code desc}.
 *
 * @param desc the column descriptor used to initialize the dictionary
 * @param pageSource the page reader supplying the dictionary page
 * @return the decoded dictionary, or {@code null} when the reader returns no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an I/O error
 */
private static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}

Source File: ParquetRecordReaderTest.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Reads the next page of the column at {@code path} as a {@link DataPageV1} and asserts
 * its value count and raw bytes.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages store providing the page reader
 * @param path column path
 * @param values expected value count
 * @param bytes expected page bytes
 * @throws IOException declared for the byte conversions
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes)
    throws IOException {
  ColumnDescriptor column = schema.getColumnDescription(path);
  DataPageV1 firstPage = (DataPageV1) pages.getPageReader(column).readPage();
  assertEquals(values, firstPage.getValueCount());
  assertArrayEquals(bytes.toByteArray(), firstPage.getBytes().toByteArray());
}

Source File: AbstractColumnReader.java From flink with Apache License 2.0

5 votes

/**
 * Stores the descriptor and page reader, records the maximum definition level, decodes
 * the dictionary page when the reader supplies one, and rejects a column whose total
 * value count is zero.
 *
 * @param descriptor the column descriptor
 * @param pageReader the page reader for the column
 * @throws IOException if the dictionary cannot be decoded or the total value count is 0
 */
public AbstractColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	this.descriptor = descriptor;
	this.pageReader = pageReader;
	this.maxDefLevel = descriptor.getMaxDefinitionLevel();

	DictionaryPage dictPage = pageReader.readDictionaryPage();
	if (dictPage == null) {
		// No dictionary page: start out as not dictionary encoded.
		this.dictionary = null;
		this.isCurrentPageDictionaryEncoded = false;
	} else {
		try {
			this.dictionary = dictPage.getEncoding().initDictionary(descriptor, dictPage);
			this.isCurrentPageDictionaryEncoded = true;
		} catch (IOException e) {
			throw new IOException("could not decode the dictionary for " + descriptor, e);
		}
	}

	// Total number of values in this column (in this row group).
	long valueCount = pageReader.getTotalValueCount();
	if (valueCount == 0) {
		throw new IOException("totalValueCount == 0");
	}
}

Source File: ColumnChunkIncReadStore.java From Bats with Apache License 2.0

4 votes

/**
 * Returns whatever {@code columns.get(descriptor)} yields for the given column.
 * NOTE(review): if {@code columns} is a {@code java.util.Map}, an unknown column yields
 * {@code null} rather than an exception; confirm callers handle that.
 *
 * @param descriptor the column to look up
 * @return the page reader held for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final PageReader reader = columns.get(descriptor);
  return reader;
}

Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Reads the dictionary page of the given column from the store.
 *
 * @param pageReadStore the store providing the column's page reader
 * @param columnDescriptor the column whose dictionary page is read
 * @return the result of {@code readDictionaryPage()} on the column's page reader
 */
private static DictionaryPage getDictionaryPageForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  return pageReadStore.getPageReader(columnDescriptor).readDictionaryPage();
}

Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Reads the next page of the column at {@code path}, asserts its value count, then casts
 * it to {@link DataPageV1} and asserts its raw bytes.
 *
 * @param schema schema used to resolve {@code path} to a column description
 * @param pages store providing the page reader
 * @param path column path
 * @param values expected value count
 * @param bytes expected page bytes
 * @throws IOException declared for the byte conversions
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException {
  ColumnDescriptor column = schema.getColumnDescription(path);
  DataPage firstPage = pages.getPageReader(column).readPage();
  assertEquals(values, firstPage.getValueCount());
  // The V1 cast happens only after the value count check, as before.
  byte[] expected = bytes.toByteArray();
  byte[] actual = ((DataPageV1) firstPage).getBytes().toByteArray();
  assertArrayEquals(expected, actual);
}

Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Round-trips a single V2 data page through {@link ColumnChunkPageWriteStore} and
 * {@link ParquetFileReader}: writes one GZIP-compressed page for a repeated binary column,
 * reads it back, and checks the page header fields, the level/data payloads and the
 * column/offset index entries for that page.
 *
 * <p>The file reader is opened in a try-with-resources block so that it is closed even
 * when one of the assertions fails.
 */
@Test
public void test() throws Exception {
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  // Start from an empty output directory.
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
      .build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3});
  long pageOffset;
  long pageSize;

  // Write phase: one block containing one V2 page.
  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    // Position before the page is flushed; used to derive the page offset and size.
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(
          rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }

  // Read phase: verify the page and its index entries.
  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    try (ParquetFileReader reader = new ParquetFileReader(
        conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns())) {
      PageReadStore rowGroup = reader.readNextRowGroup();
      PageReader pageReader = rowGroup.getPageReader(col);
      DataPageV2 page = (DataPageV2)pageReader.readPage();
      assertEquals(rowCount, page.getRowCount());
      assertEquals(nullCount, page.getNullCount());
      assertEquals(valueCount, page.getValueCount());
      assertEquals(d, intValue(page.getDefinitionLevels()));
      assertEquals(r, intValue(page.getRepetitionLevels()));
      assertEquals(dataEncoding, page.getDataEncoding());
      assertEquals(v, intValue(page.getData()));

      // Checking column/offset indexes for the one page
      ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
      ColumnIndex columnIndex = reader.readColumnIndex(column);
      assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
      assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
      assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
      assertFalse(columnIndex.getNullPages().get(0));
      OffsetIndex offsetIndex = reader.readOffsetIndex(column);
      assertEquals(1, offsetIndex.getPageCount());
      assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
      assertEquals(0, offsetIndex.getFirstRowIndex(0));
      assertEquals(pageOffset, offsetIndex.getOffset(0));
    }
  }
}

Source File: ColumnChunkIncReadStore.java From dremio-oss with Apache License 2.0

4 votes

/**
 * Returns whatever {@code columns.get(descriptor)} yields for the given column.
 * NOTE(review): if {@code columns} is a {@code java.util.Map}, an unknown column yields
 * {@code null} rather than an exception; confirm callers handle that.
 *
 * @param descriptor the column to look up
 * @return the page reader held for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final PageReader reader = columns.get(descriptor);
  return reader;
}

Source File: ColumnReadStoreImpl.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Creates a {@link ColumnReaderImpl} for {@code path} over the given page reader, using
 * the converter returned by {@code getPrimitiveConverter(path)} and this store's writer
 * version.
 *
 * @param path the column to read
 * @param pageReader the page reader to read from
 * @return a new column reader
 */
private ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) {
  return new ColumnReaderImpl(path, pageReader, getPrimitiveConverter(path), writerVersion);
}

Source File: ByteColumnReader.java From flink with Apache License 2.0

4 votes

/**
 * Creates a byte column reader and passes INT32 to {@code checkTypeName} (presumably a
 * physical type check; confirm against the superclass).
 *
 * @param descriptor the column descriptor passed to the superclass
 * @param pageReader the page reader passed to the superclass
 * @throws IOException declared by this constructor; the superclass constructor is the likely source
 */
public ByteColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}

Source File: ParquetSplitReaderUtil.java From flink with Apache License 2.0

4 votes

/**
 * Picks the column reader implementation for a field based on its logical type root.
 * For DECIMAL the choice additionally depends on the Parquet primitive type of the
 * column.
 *
 * @param utcTimestamp flag forwarded to {@link TimestampColumnReader}
 * @param fieldType the logical type of the field
 * @param descriptor the Parquet column descriptor
 * @param pageReader the page reader for the column
 * @return a new column reader for the field
 * @throws IOException declared by the reader constructors
 * @throws UnsupportedOperationException if the logical type, or the primitive type
 *     backing a DECIMAL, has no reader
 */
public static ColumnReader createColumnReader(
		boolean utcTimestamp,
		LogicalType fieldType,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	switch (fieldType.getTypeRoot()) {
		case BOOLEAN:
			return new BooleanColumnReader(descriptor, pageReader);
		case TINYINT:
			return new ByteColumnReader(descriptor, pageReader);
		case SMALLINT:
			return new ShortColumnReader(descriptor, pageReader);
		case INTEGER:
		case DATE:
		case TIME_WITHOUT_TIME_ZONE:
			return new IntColumnReader(descriptor, pageReader);
		case BIGINT:
			return new LongColumnReader(descriptor, pageReader);
		case FLOAT:
			return new FloatColumnReader(descriptor, pageReader);
		case DOUBLE:
			return new DoubleColumnReader(descriptor, pageReader);
		case CHAR:
		case VARCHAR:
		case BINARY:
		case VARBINARY:
			return new BytesColumnReader(descriptor, pageReader);
		case TIMESTAMP_WITHOUT_TIME_ZONE:
		case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
			return new TimestampColumnReader(utcTimestamp, descriptor, pageReader);
		case DECIMAL:
			switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) {
				case INT32:
					return new IntColumnReader(descriptor, pageReader);
				case INT64:
					return new LongColumnReader(descriptor, pageReader);
				case BINARY:
					return new BytesColumnReader(descriptor, pageReader);
				case FIXED_LEN_BYTE_ARRAY:
					return new FixedLenBytesColumnReader(
							descriptor, pageReader, ((DecimalType) fieldType).getPrecision());
				default:
					// Same outcome as falling through to the outer default.
					throw new UnsupportedOperationException(fieldType + " is not supported now.");
			}
		default:
			throw new UnsupportedOperationException(fieldType + " is not supported now.");
	}
}

Source File: FloatColumnReader.java From flink with Apache License 2.0

4 votes

/**
 * Creates a float column reader and passes FLOAT to {@code checkTypeName} (presumably a
 * physical type check; confirm against the superclass).
 *
 * @param descriptor the column descriptor passed to the superclass
 * @param pageReader the page reader passed to the superclass
 * @throws IOException declared by this constructor; the superclass constructor is the likely source
 */
public FloatColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.FLOAT);
}

Source File: BytesColumnReader.java From flink with Apache License 2.0

4 votes

/**
 * Creates a bytes column reader and passes BINARY to {@code checkTypeName} (presumably a
 * physical type check; confirm against the superclass).
 *
 * @param descriptor the column descriptor passed to the superclass
 * @param pageReader the page reader passed to the superclass
 * @throws IOException declared by this constructor; the superclass constructor is the likely source
 */
public BytesColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.BINARY);
}

Source File: BooleanColumnReader.java From flink with Apache License 2.0

4 votes

/**
 * Creates a boolean column reader and passes BOOLEAN to {@code checkTypeName} (presumably
 * a physical type check; confirm against the superclass).
 *
 * @param descriptor the column descriptor passed to the superclass
 * @param pageReader the page reader passed to the superclass
 * @throws IOException declared by this constructor; the superclass constructor is the likely source
 */
public BooleanColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.BOOLEAN);
}

Source File: IntColumnReader.java From flink with Apache License 2.0

4 votes

/**
 * Creates an int column reader and passes INT32 to {@code checkTypeName} (presumably a
 * physical type check; confirm against the superclass).
 *
 * @param descriptor the column descriptor passed to the superclass
 * @param pageReader the page reader passed to the superclass
 * @throws IOException declared by this constructor; the superclass constructor is the likely source
 */
public IntColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}

org.apache.parquet.column.page.PageReader Java Examples