org.apache.parquet.column.page.PageReader Java Examples

The following examples show how to use org.apache.parquet.column.page.PageReader. You can vote up the ones you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestMemPageStore.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
@Test
public void test() throws IOException {
  MemPageStore store = new MemPageStore(10);
  ColumnDescriptor column = new ColumnDescriptor(path, PrimitiveTypeName.INT64, 2, 2);
  LongStatistics statistics = new LongStatistics();

  // Write phase: four pages, each declared as holding 209 values.
  PageWriter writer = store.getPageWriter(column);
  writer.writePage(BytesInput.from(new byte[735]), 209, statistics, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writePage(BytesInput.from(new byte[743]), 209, statistics, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writePage(BytesInput.from(new byte[743]), 209, statistics, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writePage(BytesInput.from(new byte[735]), 209, statistics, BIT_PACKED, BIT_PACKED, PLAIN);

  // Read phase: keep pulling pages until the per-page counts add up to the reported total.
  PageReader reader = store.getPageReader(column);
  long expectedValues = reader.getTotalValueCount();
  System.out.println(expectedValues);

  int valuesSeen = 0;
  do {
    DataPage page = reader.readPage();
    valuesSeen += page.getValueCount();
    System.out.println(page);
    // TODO: assert
  } while (valuesSeen < expectedValues);
}
 
Example #2
Source File: ColumnReaderBase.java    From parquet-mr with Apache License 2.0 6 votes vote down vote up
/**
 * Creates a reader for the triplets of a single column.
 *
 * <p>The dictionary page, if the page reader has one, is decoded here and offered to the
 * converter when the converter reports dictionary support.
 *
 * @param path the descriptor for the corresponding column; must not be null
 * @param pageReader the underlying store to read from; must not be null
 * @param converter a converter that materializes the values in this column in the current record;
 *                  must not be null
 * @param writerVersion writer version string from the Parquet file being read (stored without a null check)
 * @throws ParquetDecodingException if the dictionary page cannot be decoded, or if the page reader
 *                                  reports a total value count that is not positive
 */
ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) {
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null");
  this.converter = Objects.requireNonNull(converter, "converter cannot be null");
  this.writerVersion = writerVersion;
  this.maxDefinitionLevel = path.getMaxDefinitionLevel();

  final DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    // No dictionary page for this column chunk.
    this.dictionary = null;
  } else {
    try {
      this.dictionary = dictPage.getEncoding().initDictionary(path, dictPage);
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  }

  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
  }
}
 
Example #3
Source File: VectorizedColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Switches this iterator to a new page source and records whether all of its pages are
 * dictionary encoded.
 *
 * @param store the page reader handed to {@code super.setPageSource}
 * @param allPagesDictEncoded flag forwarded to the vectorized page iterator
 * @return the value of the {@code dictionary} field after the page source has been set
 */
public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) {
  // setPageSource can result in a data page read. If that happens, we need
  // to know in advance whether all the pages in the row group are dictionary encoded or not
  // -- hence the flag is pushed to the page iterator BEFORE the source is set.
  this.vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded);
  super.setPageSource(store);
  return dictionary;
}
 
Example #4
Source File: TestStatistics.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Runs {@code validateStatsForPage} over every data page of every column in the schema.
 *
 * @param schema supplies the column descriptors to visit
 * @param store supplies one page reader per column
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader pages = store.getPageReader(column);
    // Read the dictionary page once per column; it is passed to every page check below.
    DictionaryPage dictPage = pages.readDictionaryPage();
    // readPage() returning null ends the column.
    for (DataPage current = pages.readPage(); current != null; current = pages.readPage()) {
      validateStatsForPage(current, dictPage, column);
    }
  }
}
 
Example #5
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Collects a copy of every data page of one column.
 *
 * @param pageReadStore store to take the column's page reader from
 * @param columnDescriptor column whose pages are collected
 * @return the result of {@code reusableCopy} for each page, in read order
 */
private static List<DataPage> getPageGroupForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  final PageReader reader = pageReadStore.getPageReader(columnDescriptor);
  final List<DataPage> copies = new ArrayList<>();
  // readPage() returning null marks the end of the column.
  for (DataPage next = reader.readPage(); next != null; next = reader.readPage()) {
    copies.add(reusableCopy(next));
  }
  return copies;
}
 
Example #6
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the next page of the given column as a {@code DataPageV2} and asserts its header
 * fields and its three byte sections against the expected values.
 *
 * @throws IOException if converting a page section to a byte array fails
 */
private void validateV2Page(MessageType schema, PageReadStore pages, String[] path, int values, int rows, int nullCount,
                            byte[] repetition, byte[] definition, byte[] data, int uncompressedSize) throws IOException {
  PageReader reader = pages.getPageReader(schema.getColumnDescription(path));
  DataPageV2 v2Page = (DataPageV2) reader.readPage();

  // Scalar header fields first, in the same order as before so the first failure is unchanged.
  assertEquals(values, v2Page.getValueCount());
  assertEquals(rows, v2Page.getRowCount());
  assertEquals(nullCount, v2Page.getNullCount());
  assertEquals(uncompressedSize, v2Page.getUncompressedSize());

  // Then the raw sections: repetition levels, definition levels, data.
  byte[] actualRepetition = v2Page.getRepetitionLevels().toByteArray();
  assertArrayEquals(repetition, actualRepetition);
  byte[] actualDefinition = v2Page.getDefinitionLevels().toByteArray();
  assertArrayEquals(definition, actualDefinition);
  byte[] actualData = v2Page.getData().toByteArray();
  assertArrayEquals(data, actualData);
}
 
Example #7
Source File: ColumnChunkPageReadStore.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Looks up the page reader registered for a column.
 *
 * @param path descriptor of the requested column
 * @return the reader stored for {@code path}
 * @throws IllegalArgumentException if no reader is stored for {@code path}
 */
@Override
public PageReader getPageReader(ColumnDescriptor path) {
  final PageReader found = readers.get(path);
  if (found != null) {
    return found;
  }
  throw new IllegalArgumentException(path + " is not in the store: " + readers.keySet() + " " + rowCount);
}
 
Example #8
Source File: MemPageStore.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Builds a reader over the pages written so far for a column.
 *
 * @param descriptor column to read
 * @return a new {@code MemPageReader} iterating over a copy of the writer's page list
 * @throws UnknownColumnException if no writer exists for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }
  // Copy the page list so the reader iterates over its own list.
  List<DataPage> snapshot = new ArrayList<>(writer.getPages());
  LOG.debug("initialize page reader with {} values and {} pages", writer.getTotalValueCount(), snapshot.size());
  return new MemPageReader(writer.getTotalValueCount(), snapshot.iterator(), writer.getDictionaryPage());
}
 
Example #9
Source File: SynchronizingColumnReader.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a column reader that is additionally given an iterator of row indexes.
 *
 * @param path the descriptor for the corresponding column
 * @param pageReader the underlying store to read from
 * @param converter converter passed through to the superclass
 * @param writerVersion writer version passed through to the superclass
 * @param rowIndexes iterator of row indexes kept by this reader
 */
SynchronizingColumnReader(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter,
    ParsedVersion writerVersion, PrimitiveIterator.OfLong rowIndexes) {
  super(path, pageReader, converter, writerVersion);
  // Initial target is the smallest long value.
  this.targetRow = Long.MIN_VALUE;
  this.rowIndexes = rowIndexes;
  consume();
}
 
Example #10
Source File: ColumnReadStoreImpl.java    From parquet-mr with Apache License 2.0 5 votes vote down vote up
/**
 * Creates the column reader for a column.
 *
 * @param path descriptor of the column to read
 * @return a {@code SynchronizingColumnReader} when the page read store exposes row indexes,
 *         otherwise a {@code ColumnReaderImpl}
 */
@Override
public ColumnReader getColumnReader(ColumnDescriptor path) {
  final PrimitiveConverter converter = getPrimitiveConverter(path);
  final PageReader pageReader = pageReadStore.getPageReader(path);
  final Optional<PrimitiveIterator.OfLong> rowIndexes = pageReadStore.getRowIndexes();
  if (!rowIndexes.isPresent()) {
    return new ColumnReaderImpl(path, pageReader, converter, writerVersion);
  }
  return new SynchronizingColumnReader(path, pageReader, converter, writerVersion, rowIndexes.get());
}
 
Example #11
Source File: TimestampColumnReader.java    From flink with Apache License 2.0 5 votes vote down vote up
public TimestampColumnReader(
		boolean utcTimestamp,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	this.utcTimestamp = utcTimestamp;
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT96);
}
 
Example #12
Source File: FixedLenBytesColumnReader.java    From flink with Apache License 2.0 5 votes vote down vote up
public FixedLenBytesColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader,
		int precision) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
	this.precision = precision;
}
 
Example #13
Source File: BaseColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Re-targets this iterator at a new page source and resets its read counters.
 *
 * @param source page reader supplying the total value count, the dictionary and the pages
 */
public void setPageSource(PageReader source) {
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  // Reset the page iterator, then give it the dictionary read from the new source.
  BasePageIterator iterator = pageIterator();
  iterator.reset();
  this.dictionary = ParquetUtil.readDictionary(desc, pageSource);
  iterator.setDictionary(this.dictionary);

  advance();
}
 
Example #14
Source File: ParquetUtil.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads and decodes the dictionary page of a column, if there is one.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource reader to take the dictionary page from
 * @return the decoded dictionary, or {@code null} when the reader has no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an I/O error
 */
public static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  final DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
Example #15
Source File: ColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Re-targets this iterator at a new page source and resets its read counters.
 *
 * @param source page reader supplying the total value count, the dictionary and the pages
 */
public void setPageSource(PageReader source) {
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  // Reset the page iterator before handing it the dictionary of the new source.
  this.pageIterator.reset();
  Dictionary sourceDictionary = readDictionary(desc, pageSource);
  this.pageIterator.setDictionary(sourceDictionary);

  advance();
}
 
Example #16
Source File: ColumnIterator.java    From iceberg with Apache License 2.0 5 votes vote down vote up
/**
 * Reads and decodes the dictionary page of a column, if there is one.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource reader to take the dictionary page from
 * @return the decoded dictionary, or {@code null} when the reader has no dictionary page
 * @throws ParquetDecodingException if decoding the dictionary page fails with an I/O error
 */
private static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  final DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
 
Example #17
Source File: ParquetRecordReaderTest.java    From dremio-oss with Apache License 2.0 5 votes vote down vote up
/**
 * Reads the next page of the given column as a {@code DataPageV1} and asserts its value count
 * and raw bytes.
 *
 * @throws IOException if converting either side to a byte array fails
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes)
    throws IOException {
  PageReader reader = pages.getPageReader(schema.getColumnDescription(path));
  DataPageV1 firstPage = (DataPageV1) reader.readPage();
  assertEquals(values, firstPage.getValueCount());

  byte[] expected = bytes.toByteArray();
  byte[] actual = firstPage.getBytes().toByteArray();
  assertArrayEquals(expected, actual);
}
 
Example #18
Source File: AbstractColumnReader.java    From flink with Apache License 2.0 5 votes vote down vote up
/**
 * Creates a column reader and eagerly decodes the column's dictionary page, if any.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader source of the dictionary page and of the total value count
 * @throws IOException if the dictionary page cannot be decoded, or if the page reader reports
 *                     a total value count of zero
 */
public AbstractColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	this.descriptor = descriptor;
	this.pageReader = pageReader;
	this.maxDefLevel = descriptor.getMaxDefinitionLevel();

	final DictionaryPage dictPage = pageReader.readDictionaryPage();
	if (dictPage == null) {
		// No dictionary page: nothing to decode.
		this.dictionary = null;
		this.isCurrentPageDictionaryEncoded = false;
	} else {
		try {
			this.dictionary = dictPage.getEncoding().initDictionary(descriptor, dictPage);
			this.isCurrentPageDictionaryEncoded = true;
		} catch (IOException e) {
			throw new IOException("could not decode the dictionary for " + descriptor, e);
		}
	}

	// Total number of values in this column (in this row group); zero is rejected.
	if (pageReader.getTotalValueCount() == 0) {
		throw new IOException("totalValueCount == 0");
	}
}
 
Example #19
Source File: ColumnChunkIncReadStore.java    From Bats with Apache License 2.0 4 votes vote down vote up
/**
 * Returns whatever the {@code columns} lookup yields for the descriptor.
 *
 * @param descriptor column to look up
 * @return the result of {@code columns.get(descriptor)}, returned without any further check
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final PageReader reader = columns.get(descriptor);
  return reader;
}
 
Example #20
Source File: FileEncodingsIT.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the dictionary page of one column.
 *
 * @param pageReadStore store to take the column's page reader from
 * @param columnDescriptor column whose dictionary page is wanted
 * @return whatever {@code readDictionaryPage()} returns for that column's reader
 */
private static DictionaryPage getDictionaryPageForColumn(PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  return pageReadStore.getPageReader(columnDescriptor).readDictionaryPage();
}
 
Example #21
Source File: TestParquetFileWriter.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Reads the next page of the given column and asserts its value count and, treating it as a
 * {@code DataPageV1}, its raw bytes.
 *
 * @throws IOException if converting either side to a byte array fails
 */
private void validateContains(MessageType schema, PageReadStore pages, String[] path, int values, BytesInput bytes) throws IOException {
  PageReader reader = pages.getPageReader(schema.getColumnDescription(path));
  DataPage firstPage = reader.readPage();
  assertEquals(values, firstPage.getValueCount());

  // The V1 cast is deliberately left until after the count check, as in the original ordering.
  byte[] expected = bytes.toByteArray();
  byte[] actual = ((DataPageV1) firstPage).getBytes().toByteArray();
  assertArrayEquals(expected, actual);
}
 
Example #22
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Writes a single V2 data page through a {@code ColumnChunkPageWriteStore}, reads the file back
 * and checks the page contents plus the column and offset indexes recorded for it.
 *
 * <p>The reader is opened in a try-with-resources block so that it is closed even when one of
 * the assertions fails.
 */
@Test
public void test() throws Exception {
  // Start from an empty target directory.
  Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet");
  Path root = file.getParent();
  FileSystem fs = file.getFileSystem(conf);
  if (fs.exists(root)) {
    fs.delete(root, true);
  }
  fs.mkdirs(root);

  // Fixture: one repeated binary column and one page's worth of made-up values.
  MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }");
  ColumnDescriptor col = schema.getColumns().get(0);
  Encoding dataEncoding = PLAIN;
  int valueCount = 10;
  int d = 1;
  int r = 2;
  int v = 3;
  BytesInput definitionLevels = BytesInput.fromInt(d);
  BytesInput repetitionLevels = BytesInput.fromInt(r);
  Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary"))
      .build();
  BytesInput data = BytesInput.fromInt(v);
  int rowCount = 5;
  int nullCount = 1;
  statistics.incrementNumNulls(nullCount);
  statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3});
  long pageOffset;
  long pageSize;

  // Write phase: record where the page starts and how many bytes it occupies in the file.
  {
    OutputFileForTesting outputFile = new OutputFileForTesting(file, conf);
    ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE,
        ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT);
    writer.start();
    writer.startBlock(rowCount);
    pageOffset = outputFile.out().getPos();
    {
      ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema,
          new HeapByteBufferAllocator(), Integer.MAX_VALUE);
      PageWriter pageWriter = store.getPageWriter(col);
      pageWriter.writePageV2(
          rowCount, nullCount, valueCount,
          repetitionLevels, definitionLevels,
          dataEncoding, data,
          statistics);
      store.flushToFileWriter(writer);
      pageSize = outputFile.out().getPos() - pageOffset;
    }
    writer.endBlock();
    writer.end(new HashMap<String, String>());
  }

  // Read phase.
  {
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER);
    // try-with-resources: previously reader.close() was skipped whenever an assertion failed.
    try (ParquetFileReader reader = new ParquetFileReader(
        conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns())) {
      PageReadStore rowGroup = reader.readNextRowGroup();
      PageReader pageReader = rowGroup.getPageReader(col);
      DataPageV2 page = (DataPageV2)pageReader.readPage();
      assertEquals(rowCount, page.getRowCount());
      assertEquals(nullCount, page.getNullCount());
      assertEquals(valueCount, page.getValueCount());
      assertEquals(d, intValue(page.getDefinitionLevels()));
      assertEquals(r, intValue(page.getRepetitionLevels()));
      assertEquals(dataEncoding, page.getDataEncoding());
      assertEquals(v, intValue(page.getData()));

      // Checking column/offset indexes for the one page
      ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0);
      ColumnIndex columnIndex = reader.readColumnIndex(column);
      assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array());
      assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array());
      assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue());
      assertFalse(columnIndex.getNullPages().get(0));
      OffsetIndex offsetIndex = reader.readOffsetIndex(column);
      assertEquals(1, offsetIndex.getPageCount());
      assertEquals(pageSize, offsetIndex.getCompressedPageSize(0));
      assertEquals(0, offsetIndex.getFirstRowIndex(0));
      assertEquals(pageOffset, offsetIndex.getOffset(0));
    }
  }
}
 
Example #23
Source File: ColumnChunkIncReadStore.java    From dremio-oss with Apache License 2.0 4 votes vote down vote up
/**
 * Returns whatever the {@code columns} lookup yields for the descriptor.
 *
 * @param descriptor column to look up
 * @return the result of {@code columns.get(descriptor)}, returned without any further check
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  final PageReader reader = columns.get(descriptor);
  return reader;
}
 
Example #24
Source File: ColumnReadStoreImpl.java    From parquet-mr with Apache License 2.0 4 votes vote down vote up
/**
 * Creates a {@code ColumnReaderImpl} over an explicitly supplied page reader.
 *
 * @param path descriptor of the column to read; also used to look up the converter
 * @param pageReader page source for the new reader
 * @return the new column reader, built with this store's {@code writerVersion}
 */
private ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) {
  return new ColumnReaderImpl(path, pageReader, getPrimitiveConverter(path), writerVersion);
}
 
Example #25
Source File: ByteColumnReader.java    From flink with Apache License 2.0 4 votes vote down vote up
public ByteColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
 
Example #26
Source File: ParquetSplitReaderUtil.java    From flink with Apache License 2.0 4 votes vote down vote up
/**
 * Picks the column reader implementation for a field from its logical type root and, for
 * DECIMAL fields, from the primitive type name of the column descriptor.
 *
 * @param utcTimestamp forwarded to {@code TimestampColumnReader} for timestamp fields
 * @param fieldType logical type of the field
 * @param descriptor descriptor of the column backing the field
 * @param pageReader page source for the new reader
 * @return the reader matching the field type
 * @throws IOException declared by this method's signature
 * @throws UnsupportedOperationException if the type root is not handled, or if a DECIMAL field is
 *                                       backed by a primitive type other than the four listed
 */
public static ColumnReader createColumnReader(
		boolean utcTimestamp,
		LogicalType fieldType,
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	switch (fieldType.getTypeRoot()) {
		case BOOLEAN:
			return new BooleanColumnReader(descriptor, pageReader);
		case TINYINT:
			return new ByteColumnReader(descriptor, pageReader);
		case DOUBLE:
			return new DoubleColumnReader(descriptor, pageReader);
		case FLOAT:
			return new FloatColumnReader(descriptor, pageReader);
		case INTEGER:
		case DATE:
		case TIME_WITHOUT_TIME_ZONE:
			return new IntColumnReader(descriptor, pageReader);
		case BIGINT:
			return new LongColumnReader(descriptor, pageReader);
		case SMALLINT:
			return new ShortColumnReader(descriptor, pageReader);
		case CHAR:
		case VARCHAR:
		case BINARY:
		case VARBINARY:
			return new BytesColumnReader(descriptor, pageReader);
		case TIMESTAMP_WITHOUT_TIME_ZONE:
		case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
			return new TimestampColumnReader(utcTimestamp, descriptor, pageReader);
		case DECIMAL:
			// The reader for a decimal depends on the primitive type of the backing column.
			switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) {
				case INT32:
					return new IntColumnReader(descriptor, pageReader);
				case INT64:
					return new LongColumnReader(descriptor, pageReader);
				case BINARY:
					return new BytesColumnReader(descriptor, pageReader);
				case FIXED_LEN_BYTE_ARRAY:
					return new FixedLenBytesColumnReader(
							descriptor, pageReader, ((DecimalType) fieldType).getPrecision());
				default:
					// Same outcome as falling through to the outer default, spelled out explicitly.
					throw new UnsupportedOperationException(fieldType + " is not supported now.");
			}
		default:
			throw new UnsupportedOperationException(fieldType + " is not supported now.");
	}
}
 
Example #27
Source File: FloatColumnReader.java    From flink with Apache License 2.0 4 votes vote down vote up
public FloatColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.FLOAT);
}
 
Example #28
Source File: BytesColumnReader.java    From flink with Apache License 2.0 4 votes vote down vote up
public BytesColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.BINARY);
}
 
Example #29
Source File: BooleanColumnReader.java    From flink with Apache License 2.0 4 votes vote down vote up
public BooleanColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.BOOLEAN);
}
 
Example #30
Source File: IntColumnReader.java    From flink with Apache License 2.0 4 votes vote down vote up
public IntColumnReader(
		ColumnDescriptor descriptor,
		PageReader pageReader) throws IOException {
	super(descriptor, pageReader);
	checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}