org.apache.parquet.column.page.PageReader Java Examples
The following examples show how to use
org.apache.parquet.column.page.PageReader.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: TestMemPageStore.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test public void test() throws IOException { MemPageStore memPageStore = new MemPageStore(10); ColumnDescriptor col = new ColumnDescriptor(path , PrimitiveTypeName.INT64, 2, 2); LongStatistics stats = new LongStatistics(); PageWriter pageWriter = memPageStore.getPageWriter(col); pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); pageWriter.writePage(BytesInput.from(new byte[743]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); pageWriter.writePage(BytesInput.from(new byte[735]), 209, stats, BIT_PACKED, BIT_PACKED, PLAIN); PageReader pageReader = memPageStore.getPageReader(col); long totalValueCount = pageReader.getTotalValueCount(); System.out.println(totalValueCount); int total = 0; do { DataPage readPage = pageReader.readPage(); total += readPage.getValueCount(); System.out.println(readPage); // TODO: assert } while (total < totalValueCount); }
Example #2
Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0 | 6 votes |
/** * creates a reader for triplets * @param path the descriptor for the corresponding column * @param pageReader the underlying store to read from * @param converter a converter that materializes the values in this column in the current record * @param writerVersion writer version string from the Parquet file being read */ ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter, ParsedVersion writerVersion) { this.path = Objects.requireNonNull(path, "path cannot be null"); this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null"); this.converter = Objects.requireNonNull(converter, "converter cannot be null"); this.writerVersion = writerVersion; this.maxDefinitionLevel = path.getMaxDefinitionLevel(); DictionaryPage dictionaryPage = pageReader.readDictionaryPage(); if (dictionaryPage != null) { try { this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage); if (converter.hasDictionarySupport()) { converter.setDictionary(dictionary); } } catch (IOException e) { throw new ParquetDecodingException("could not decode the dictionary for " + path, e); } } else { this.dictionary = null; } this.totalValueCount = pageReader.getTotalValueCount(); if (totalValueCount <= 0) { throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0"); } }
Example #3
Source File: VectorizedColumnIterator.java From iceberg with Apache License 2.0 | 5 votes |
public Dictionary setRowGroupInfo(PageReader store, boolean allPagesDictEncoded) { // setPageSource can result in a data page read. If that happens, we need // to know in advance whether all the pages in the row group are dictionary encoded or not this.vectorizedPageIterator.setAllPagesDictEncoded(allPagesDictEncoded); super.setPageSource(store); return dictionary; }
Example #4
Source File: TestStatistics.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Runs {@code validateStatsForPage} on every data page of every column in the schema.
 *
 * @param schema schema whose columns are walked
 * @param store page store providing a page reader per column
 */
public void validate(MessageType schema, PageReadStore store) {
  for (ColumnDescriptor column : schema.getColumns()) {
    PageReader pages = store.getPageReader(column);
    // The dictionary page is read once per column, before any data page.
    DictionaryPage dictionary = pages.readDictionaryPage();
    // readPage() returning null ends the column.
    for (DataPage page = pages.readPage(); page != null; page = pages.readPage()) {
      validateStatsForPage(page, dictionary, column);
    }
  }
}
Example #5
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Collects a {@code reusableCopy} of every data page the store yields for a column.
 *
 * @param pageReadStore store to read the column's pages from
 * @param columnDescriptor column whose pages are collected
 * @return the copied pages, in the order they were read
 */
private static List<DataPage> getPageGroupForColumn(
    PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  List<DataPage> copies = new ArrayList<>();
  PageReader source = pageReadStore.getPageReader(columnDescriptor);
  // A null page marks the end of the column's pages.
  for (DataPage next = source.readPage(); next != null; next = source.readPage()) {
    copies.add(reusableCopy(next));
  }
  return copies;
}
Example #6
Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Reads the next page of the column at {@code path}, casts it to {@link DataPageV2},
 * and asserts its counts, uncompressed size, levels and data against the expected values.
 *
 * @param schema schema used to resolve {@code path} to a column descriptor
 * @param pages store to read the page from
 * @param path column path within the schema
 * @param values expected value count
 * @param rows expected row count
 * @param nullCount expected null count
 * @param repetition expected repetition level bytes
 * @param definition expected definition level bytes
 * @param data expected data bytes
 * @param uncompressedSize expected uncompressed size
 * @throws IOException if converting page contents to byte arrays fails
 */
private void validateV2Page(
    MessageType schema,
    PageReadStore pages,
    String[] path,
    int values,
    int rows,
    int nullCount,
    byte[] repetition,
    byte[] definition,
    byte[] data,
    int uncompressedSize) throws IOException {
  ColumnDescriptor column = schema.getColumnDescription(path);
  PageReader pageReader = pages.getPageReader(column);
  DataPageV2 v2Page = (DataPageV2) pageReader.readPage();

  // Scalar header fields first.
  assertEquals(values, v2Page.getValueCount());
  assertEquals(rows, v2Page.getRowCount());
  assertEquals(nullCount, v2Page.getNullCount());
  assertEquals(uncompressedSize, v2Page.getUncompressedSize());

  // Then the three byte payloads.
  assertArrayEquals(repetition, v2Page.getRepetitionLevels().toByteArray());
  assertArrayEquals(definition, v2Page.getDefinitionLevels().toByteArray());
  assertArrayEquals(data, v2Page.getData().toByteArray());
}
Example #7
Source File: ColumnChunkPageReadStore.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Looks up the page reader registered for a column.
 *
 * @param path column descriptor to look up
 * @return the reader stored for {@code path}
 * @throws IllegalArgumentException if no reader is stored for {@code path}; the message
 *     lists the known columns and the row count
 */
@Override
public PageReader getPageReader(ColumnDescriptor path) {
  final PageReader found = readers.get(path);
  if (found != null) {
    return found;
  }
  throw new IllegalArgumentException(path + " is not in the store: " + readers.keySet() + " " + rowCount);
}
Example #8
Source File: MemPageStore.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a reader over the pages written so far for a column.
 *
 * @param descriptor column whose pages should be read
 * @return a {@link MemPageReader} built from the writer's total value count, an iterator
 *     over a copy of its page list, and its dictionary page
 * @throws UnknownColumnException if no page writer exists for {@code descriptor}
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  MemPageWriter writer = pageWriters.get(descriptor);
  if (writer == null) {
    throw new UnknownColumnException(descriptor);
  }

  // Iterate over a copy so the reader does not share the writer's own list.
  List<DataPage> snapshot = new ArrayList<>(writer.getPages());
  LOG.debug("initialize page reader with {} values and {} pages", writer.getTotalValueCount(), snapshot.size());
  return new MemPageReader(
      writer.getTotalValueCount(),
      snapshot.iterator(),
      writer.getDictionaryPage());
}
Example #9
Source File: SynchronizingColumnReader.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a column reader that is additionally given an iterator of row indexes.
 * After delegating to the superclass constructor it stores the iterator, resets
 * {@code targetRow} to {@link Long#MIN_VALUE} and calls {@code consume()} once.
 *
 * @param path the descriptor for the corresponding column
 * @param pageReader the underlying store to read from
 * @param converter converter passed through to the superclass
 * @param writerVersion writer version passed through to the superclass
 * @param rowIndexes iterator of row indexes kept by this reader
 */
SynchronizingColumnReader(
    ColumnDescriptor path,
    PageReader pageReader,
    PrimitiveConverter converter,
    ParsedVersion writerVersion,
    PrimitiveIterator.OfLong rowIndexes) {
  super(path, pageReader, converter, writerVersion);
  this.rowIndexes = rowIndexes;
  this.targetRow = Long.MIN_VALUE;
  consume();
}
Example #10
Source File: ColumnReadStoreImpl.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Builds the column reader for a column.
 *
 * @param path descriptor of the column to read
 * @return a {@link SynchronizingColumnReader} when the page read store supplies row
 *     indexes, otherwise a plain {@link ColumnReaderImpl}
 */
@Override
public ColumnReader getColumnReader(ColumnDescriptor path) {
  PrimitiveConverter converter = getPrimitiveConverter(path);
  PageReader pageReader = pageReadStore.getPageReader(path);
  Optional<PrimitiveIterator.OfLong> rowIndexes = pageReadStore.getRowIndexes();

  if (!rowIndexes.isPresent()) {
    return new ColumnReaderImpl(path, pageReader, converter, writerVersion);
  }
  return new SynchronizingColumnReader(path, pageReader, converter, writerVersion, rowIndexes.get());
}
Example #11
Source File: TimestampColumnReader.java From flink with Apache License 2.0 | 5 votes |
/**
 * Creates a timestamp column reader.
 *
 * @param utcTimestamp flag stored on this reader
 * @param descriptor column descriptor passed to the superclass
 * @param pageReader page reader passed to the superclass
 * @throws IOException declared by the superclass constructor
 */
public TimestampColumnReader(
    boolean utcTimestamp,
    ColumnDescriptor descriptor,
    PageReader pageReader) throws IOException {
  super(descriptor, pageReader);
  this.utcTimestamp = utcTimestamp;
  // Passes INT96 to checkTypeName (presumably a physical-type check; confirm in the superclass).
  checkTypeName(PrimitiveType.PrimitiveTypeName.INT96);
}
Example #12
Source File: FixedLenBytesColumnReader.java From flink with Apache License 2.0 | 5 votes |
/**
 * Creates a fixed-length byte array column reader.
 *
 * @param descriptor column descriptor passed to the superclass
 * @param pageReader page reader passed to the superclass
 * @param precision precision value stored on this reader
 * @throws IOException declared by the superclass constructor
 */
public FixedLenBytesColumnReader(
    ColumnDescriptor descriptor,
    PageReader pageReader,
    int precision) throws IOException {
  super(descriptor, pageReader);
  // Passes FIXED_LEN_BYTE_ARRAY to checkTypeName before the precision is stored.
  checkTypeName(PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY);
  this.precision = precision;
}
Example #13
Source File: BaseColumnIterator.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Points this iterator at a new page source: resets the triple counters, resets the
 * page iterator, reads the dictionary from the source and hands it to the page
 * iterator, then calls {@code advance()}.
 *
 * @param source page reader to read from
 */
public void setPageSource(PageReader source) {
  // Bookkeeping for the new source.
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  // Reset the page iterator before the dictionary is read and installed.
  BasePageIterator iterator = pageIterator();
  iterator.reset();
  this.dictionary = ParquetUtil.readDictionary(desc, pageSource);
  iterator.setDictionary(this.dictionary);

  advance();
}
Example #14
Source File: ParquetUtil.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Reads and decodes the dictionary page of a column, if there is one.
 *
 * @param desc descriptor of the column the dictionary belongs to
 * @param pageSource page reader to read the dictionary page from
 * @return the decoded dictionary, or {@code null} when the source has no dictionary page
 * @throws ParquetDecodingException if initializing the dictionary raises an {@link IOException}
 */
public static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) {
  DictionaryPage page = pageSource.readDictionaryPage();
  if (page == null) {
    return null;
  }
  try {
    return page.getEncoding().initDictionary(desc, page);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not decode the dictionary for " + desc, e);
  }
}
Example #15
Source File: ColumnIterator.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Points this iterator at a new page source: resets the triple counters, resets the
 * page iterator, installs the dictionary read from the source, then calls
 * {@code advance()}.
 *
 * @param source page reader to read from
 */
public void setPageSource(PageReader source) {
  // Bookkeeping for the new source.
  this.pageSource = source;
  this.triplesCount = source.getTotalValueCount();
  this.triplesRead = 0L;
  this.advanceNextPageCount = 0L;

  // Reset first, then read the dictionary and pass it to the page iterator.
  this.pageIterator.reset();
  Dictionary decoded = readDictionary(desc, pageSource);
  this.pageIterator.setDictionary(decoded);

  advance();
}
Example #16
Source File: ColumnIterator.java From iceberg with Apache License 2.0 | 5 votes |
private static Dictionary readDictionary(ColumnDescriptor desc, PageReader pageSource) { DictionaryPage dictionaryPage = pageSource.readDictionaryPage(); if (dictionaryPage != null) { try { return dictionaryPage.getEncoding().initDictionary(desc, dictionaryPage); // if (converter.hasDictionarySupport()) { // converter.setDictionary(dictionary); // } } catch (IOException e) { throw new ParquetDecodingException("could not decode the dictionary for " + desc, e); } } return null; }
Example #17
Source File: ParquetRecordReaderTest.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Reads the next page of the column at {@code path}, casts it to {@link DataPageV1},
 * and asserts its value count and raw bytes.
 *
 * @param schema schema used to resolve {@code path} to a column descriptor
 * @param pages store to read the page from
 * @param path column path within the schema
 * @param values expected value count
 * @param bytes expected page bytes
 * @throws IOException if converting either side to a byte array fails
 */
private void validateContains(
    MessageType schema,
    PageReadStore pages,
    String[] path,
    int values,
    BytesInput bytes) throws IOException {
  ColumnDescriptor column = schema.getColumnDescription(path);
  PageReader pageReader = pages.getPageReader(column);
  DataPageV1 v1Page = (DataPageV1) pageReader.readPage();

  assertEquals(values, v1Page.getValueCount());

  byte[] expected = bytes.toByteArray();
  byte[] actual = v1Page.getBytes().toByteArray();
  assertArrayEquals(expected, actual);
}
Example #18
Source File: AbstractColumnReader.java From flink with Apache License 2.0 | 5 votes |
/**
 * Creates a column reader, decoding the column's dictionary page when one is present.
 *
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader supplying the dictionary page and the total value count
 * @throws IOException if the dictionary cannot be decoded, or if the page reader
 *     reports a total value count of zero
 */
public AbstractColumnReader(
    ColumnDescriptor descriptor,
    PageReader pageReader) throws IOException {
  this.descriptor = descriptor;
  this.pageReader = pageReader;
  this.maxDefLevel = descriptor.getMaxDefinitionLevel();

  DictionaryPage dictPage = pageReader.readDictionaryPage();
  if (dictPage == null) {
    // No dictionary page: nothing to decode, and the flag starts out false.
    this.dictionary = null;
    this.isCurrentPageDictionaryEncoded = false;
  } else {
    try {
      this.dictionary = dictPage.getEncoding().initDictionary(descriptor, dictPage);
      this.isCurrentPageDictionaryEncoded = true;
    } catch (IOException e) {
      throw new IOException("could not decode the dictionary for " + descriptor, e);
    }
  }

  // Total number of values in this column (in this row group).
  long totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount == 0) {
    throw new IOException("totalValueCount == 0");
  }
}
Example #19
Source File: ColumnChunkIncReadStore.java From Bats with Apache License 2.0 | 4 votes |
/**
 * Returns whatever the {@code columns} map holds for the given column.
 *
 * @param descriptor column to look up
 * @return the mapped page reader, or {@code null} if the map has no entry for it
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  PageReader mapped = columns.get(descriptor);
  return mapped;
}
Example #20
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Reads the dictionary page of a column from the given store.
 *
 * @param pageReadStore store providing the column's page reader
 * @param columnDescriptor column whose dictionary page is read
 * @return the result of {@code readDictionaryPage()} on that column's page reader
 */
private static DictionaryPage getDictionaryPageForColumn(
    PageReadStore pageReadStore, ColumnDescriptor columnDescriptor) {
  return pageReadStore.getPageReader(columnDescriptor).readDictionaryPage();
}
Example #21
Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Reads the next page of the column at {@code path} and asserts its value count,
 * then treats it as a {@link DataPageV1} and asserts its raw bytes.
 *
 * @param schema schema used to resolve {@code path} to a column descriptor
 * @param pages store to read the page from
 * @param path column path within the schema
 * @param values expected value count
 * @param bytes expected page bytes
 * @throws IOException if converting either side to a byte array fails
 */
private void validateContains(
    MessageType schema,
    PageReadStore pages,
    String[] path,
    int values,
    BytesInput bytes) throws IOException {
  ColumnDescriptor column = schema.getColumnDescription(path);
  PageReader pageReader = pages.getPageReader(column);
  DataPage nextPage = pageReader.readPage();

  assertEquals(values, nextPage.getValueCount());

  // The cast to DataPageV1 happens only after the value count has been checked.
  byte[] expected = bytes.toByteArray();
  byte[] actual = ((DataPageV1) nextPage).getBytes().toByteArray();
  assertArrayEquals(expected, actual);
}
Example #22
Source File: TestColumnChunkPageWriteStore.java From parquet-mr with Apache License 2.0 | 4 votes |
@Test public void test() throws Exception { Path file = new Path("target/test/TestColumnChunkPageWriteStore/test.parquet"); Path root = file.getParent(); FileSystem fs = file.getFileSystem(conf); if (fs.exists(root)) { fs.delete(root, true); } fs.mkdirs(root); MessageType schema = MessageTypeParser.parseMessageType("message test { repeated binary bar; }"); ColumnDescriptor col = schema.getColumns().get(0); Encoding dataEncoding = PLAIN; int valueCount = 10; int d = 1; int r = 2; int v = 3; BytesInput definitionLevels = BytesInput.fromInt(d); BytesInput repetitionLevels = BytesInput.fromInt(r); Statistics<?> statistics = Statistics.getBuilderForReading(Types.required(PrimitiveTypeName.BINARY).named("test_binary")) .build(); BytesInput data = BytesInput.fromInt(v); int rowCount = 5; int nullCount = 1; statistics.incrementNumNulls(nullCount); statistics.setMinMaxFromBytes(new byte[] {0, 1, 2}, new byte[] {0, 1, 2, 3}); long pageOffset; long pageSize; { OutputFileForTesting outputFile = new OutputFileForTesting(file, conf); ParquetFileWriter writer = new ParquetFileWriter(outputFile, schema, Mode.CREATE, ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.MAX_PADDING_SIZE_DEFAULT); writer.start(); writer.startBlock(rowCount); pageOffset = outputFile.out().getPos(); { ColumnChunkPageWriteStore store = new ColumnChunkPageWriteStore(compressor(GZIP), schema, new HeapByteBufferAllocator(), Integer.MAX_VALUE); PageWriter pageWriter = store.getPageWriter(col); pageWriter.writePageV2( rowCount, nullCount, valueCount, repetitionLevels, definitionLevels, dataEncoding, data, statistics); store.flushToFileWriter(writer); pageSize = outputFile.out().getPos() - pageOffset; } writer.endBlock(); writer.end(new HashMap<String, String>()); } { ParquetMetadata footer = ParquetFileReader.readFooter(conf, file, NO_FILTER); ParquetFileReader reader = new ParquetFileReader( conf, footer.getFileMetaData(), file, footer.getBlocks(), schema.getColumns()); PageReadStore rowGroup = 
reader.readNextRowGroup(); PageReader pageReader = rowGroup.getPageReader(col); DataPageV2 page = (DataPageV2)pageReader.readPage(); assertEquals(rowCount, page.getRowCount()); assertEquals(nullCount, page.getNullCount()); assertEquals(valueCount, page.getValueCount()); assertEquals(d, intValue(page.getDefinitionLevels())); assertEquals(r, intValue(page.getRepetitionLevels())); assertEquals(dataEncoding, page.getDataEncoding()); assertEquals(v, intValue(page.getData())); // Checking column/offset indexes for the one page ColumnChunkMetaData column = footer.getBlocks().get(0).getColumns().get(0); ColumnIndex columnIndex = reader.readColumnIndex(column); assertArrayEquals(statistics.getMinBytes(), columnIndex.getMinValues().get(0).array()); assertArrayEquals(statistics.getMaxBytes(), columnIndex.getMaxValues().get(0).array()); assertEquals(statistics.getNumNulls(), columnIndex.getNullCounts().get(0).longValue()); assertFalse(columnIndex.getNullPages().get(0)); OffsetIndex offsetIndex = reader.readOffsetIndex(column); assertEquals(1, offsetIndex.getPageCount()); assertEquals(pageSize, offsetIndex.getCompressedPageSize(0)); assertEquals(0, offsetIndex.getFirstRowIndex(0)); assertEquals(pageOffset, offsetIndex.getOffset(0)); reader.close(); } }
Example #23
Source File: ColumnChunkIncReadStore.java From dremio-oss with Apache License 2.0 | 4 votes |
/**
 * Returns whatever the {@code columns} map holds for the given column.
 *
 * @param descriptor column to look up
 * @return the mapped page reader, or {@code null} if the map has no entry for it
 */
@Override
public PageReader getPageReader(ColumnDescriptor descriptor) {
  PageReader mapped = columns.get(descriptor);
  return mapped;
}
Example #24
Source File: ColumnReadStoreImpl.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Creates a {@link ColumnReaderImpl} for a column, using the primitive converter
 * obtained for that column and this store's writer version.
 *
 * @param path descriptor of the column to read
 * @param pageReader page reader the new column reader reads from
 * @return the newly created column reader
 */
private ColumnReaderImpl newMemColumnReader(ColumnDescriptor path, PageReader pageReader) {
  return new ColumnReaderImpl(path, pageReader, getPrimitiveConverter(path), writerVersion);
}
Example #25
Source File: ByteColumnReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates a byte column reader.
 *
 * @param descriptor column descriptor passed to the superclass
 * @param pageReader page reader passed to the superclass
 * @throws IOException declared by the superclass constructor
 */
public ByteColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
  super(descriptor, pageReader);
  // Passes INT32 to checkTypeName (presumably a physical-type check; confirm in the superclass).
  checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}
Example #26
Source File: ParquetSplitReaderUtil.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates the column reader matching a field's logical type.
 *
 * <p>Most logical type roots map directly to one reader class. For {@code DECIMAL}
 * the reader is chosen from the column's primitive type name instead.
 *
 * @param utcTimestamp flag forwarded to {@link TimestampColumnReader}
 * @param fieldType logical type of the field
 * @param descriptor descriptor of the column to read
 * @param pageReader page reader the new column reader reads from
 * @return the reader for the given logical type
 * @throws IOException declared by the reader constructors
 * @throws UnsupportedOperationException if the logical type root is not handled, or if
 *     a {@code DECIMAL} field is backed by a primitive type that is not handled
 */
public static ColumnReader createColumnReader(
    boolean utcTimestamp,
    LogicalType fieldType,
    ColumnDescriptor descriptor,
    PageReader pageReader) throws IOException {
  switch (fieldType.getTypeRoot()) {
    case BOOLEAN:
      return new BooleanColumnReader(descriptor, pageReader);
    case TINYINT:
      return new ByteColumnReader(descriptor, pageReader);
    case DOUBLE:
      return new DoubleColumnReader(descriptor, pageReader);
    case FLOAT:
      return new FloatColumnReader(descriptor, pageReader);
    case INTEGER:
    case DATE:
    case TIME_WITHOUT_TIME_ZONE:
      return new IntColumnReader(descriptor, pageReader);
    case BIGINT:
      return new LongColumnReader(descriptor, pageReader);
    case SMALLINT:
      return new ShortColumnReader(descriptor, pageReader);
    case CHAR:
    case VARCHAR:
    case BINARY:
    case VARBINARY:
      return new BytesColumnReader(descriptor, pageReader);
    case TIMESTAMP_WITHOUT_TIME_ZONE:
    case TIMESTAMP_WITH_LOCAL_TIME_ZONE:
      return new TimestampColumnReader(utcTimestamp, descriptor, pageReader);
    case DECIMAL:
      switch (descriptor.getPrimitiveType().getPrimitiveTypeName()) {
        case INT32:
          return new IntColumnReader(descriptor, pageReader);
        case INT64:
          return new LongColumnReader(descriptor, pageReader);
        case BINARY:
          return new BytesColumnReader(descriptor, pageReader);
        case FIXED_LEN_BYTE_ARRAY:
          return new FixedLenBytesColumnReader(
              descriptor, pageReader, ((DecimalType) fieldType).getPrecision());
        default:
          // Unhandled primitive type for DECIMAL: leave the switch and hit the throw below.
          break;
      }
      break;
    default:
      break;
  }
  throw new UnsupportedOperationException(fieldType + " is not supported now.");
}
Example #27
Source File: FloatColumnReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates a float column reader.
 *
 * @param descriptor column descriptor passed to the superclass
 * @param pageReader page reader passed to the superclass
 * @throws IOException declared by the superclass constructor
 */
public FloatColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
  super(descriptor, pageReader);
  // Passes FLOAT to checkTypeName (presumably a physical-type check; confirm in the superclass).
  checkTypeName(PrimitiveType.PrimitiveTypeName.FLOAT);
}
Example #28
Source File: BytesColumnReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates a bytes column reader.
 *
 * @param descriptor column descriptor passed to the superclass
 * @param pageReader page reader passed to the superclass
 * @throws IOException declared by the superclass constructor
 */
public BytesColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
  super(descriptor, pageReader);
  // Passes BINARY to checkTypeName (presumably a physical-type check; confirm in the superclass).
  checkTypeName(PrimitiveType.PrimitiveTypeName.BINARY);
}
Example #29
Source File: BooleanColumnReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates a boolean column reader.
 *
 * @param descriptor column descriptor passed to the superclass
 * @param pageReader page reader passed to the superclass
 * @throws IOException declared by the superclass constructor
 */
public BooleanColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
  super(descriptor, pageReader);
  // Passes BOOLEAN to checkTypeName (presumably a physical-type check; confirm in the superclass).
  checkTypeName(PrimitiveType.PrimitiveTypeName.BOOLEAN);
}
Example #30
Source File: IntColumnReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Creates an int column reader.
 *
 * @param descriptor column descriptor passed to the superclass
 * @param pageReader page reader passed to the superclass
 * @throws IOException declared by the superclass constructor
 */
public IntColumnReader(ColumnDescriptor descriptor, PageReader pageReader) throws IOException {
  super(descriptor, pageReader);
  // Passes INT32 to checkTypeName (presumably a physical-type check; confirm in the superclass).
  checkTypeName(PrimitiveType.PrimitiveTypeName.INT32);
}