Java Code Examples for org.apache.parquet.format.PageHeader#getCompressed_page_size()
The following examples show how to use org.apache.parquet.format.PageHeader#getCompressed_page_size(). You can vote up the examples you like or vote down the ones you don't like, and go to the original project or source file by following the links above each example.
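Before the examples, a minimal sketch of the pattern they all share: a PageHeader is read from the column chunk stream, and getCompressed_page_size() reports how many bytes of (possibly compressed) page data follow the header, which the caller then reads or skips. This is only an illustrative sketch, not code from any of the projects below; the PageSizeProbe class and readRawPage helper are hypothetical names, and it assumes a stream already positioned at the start of a page plus Java 11+ for InputStream.readNBytes.

import java.io.IOException;
import java.io.InputStream;

import org.apache.parquet.format.PageHeader;
import org.apache.parquet.format.Util;

public class PageSizeProbe {
  // Reads one Thrift-encoded page header and returns the raw (still compressed)
  // page bytes that follow it. getCompressed_page_size() gives the exact number
  // of payload bytes sitting between this header and the next one.
  static byte[] readRawPage(InputStream in) throws IOException {
    PageHeader pageHeader = Util.readPageHeader(in);
    int compressedSize = pageHeader.getCompressed_page_size();
    int uncompressedSize = pageHeader.getUncompressed_page_size();
    System.out.printf("%s: %d bytes compressed, %d bytes uncompressed%n",
        pageHeader.getType(), compressedSize, uncompressedSize);
    return in.readNBytes(compressedSize);   // the page payload; could also be skipped
  }
}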
Example 1
Source File: PageReader.java From Bats with Apache License 2.0 | 7 votes |
private void readDictionaryPage(final PageHeader pageHeader,
                                final ColumnReader<?> parentStatus) throws IOException {
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  final DrillBuf dictionaryData = readPage(pageHeader, compressedSize, uncompressedSize);
  allocatedDictionaryBuffers.add(dictionaryData);

  DictionaryPage page = new DictionaryPage(
      asBytesInput(dictionaryData, 0, uncompressedSize),
      pageHeader.uncompressed_page_size,
      pageHeader.dictionary_page_header.num_values,
      valueOf(pageHeader.dictionary_page_header.encoding.name()));

  this.dictionary = page.getEncoding().initDictionary(parentStatus.columnDescriptor, page);
}
Example 2
Source File: AsyncPageReader.java From Bats with Apache License 2.0 | 6 votes |
private DrillBuf decompress(PageHeader pageHeader, DrillBuf compressedData) {
  DrillBuf pageDataBuf = null;
  Stopwatch timer = Stopwatch.createUnstarted();
  long timeToRead;
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  pageDataBuf = allocateTemporaryBuffer(uncompressedSize);
  try {
    timer.start();
    CompressionCodecName codecName = parentColumnReader.columnChunkMetaData.getCodec();
    ByteBuffer input = compressedData.nioBuffer(0, compressedSize);
    ByteBuffer output = pageDataBuf.nioBuffer(0, uncompressedSize);
    DecompressionHelper decompressionHelper = new DecompressionHelper(codecName);
    decompressionHelper.decompress(input, compressedSize, output, uncompressedSize);
    pageDataBuf.writerIndex(uncompressedSize);
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    this.updateStats(pageHeader, "Decompress", 0, timeToRead, compressedSize, uncompressedSize);
  } catch (IOException e) {
    handleAndThrowException(e, "Error decompressing data.");
  }
  return pageDataBuf;
}
Example 3
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0 | 6 votes |
private DictionaryPage readCompressedDictionary(
    PageHeader pageHeader, SeekableInputStream fin) throws IOException {
  DictionaryPageHeader dictHeader = pageHeader.getDictionary_page_header();

  int uncompressedPageSize = pageHeader.getUncompressed_page_size();
  int compressedPageSize = pageHeader.getCompressed_page_size();

  byte[] dictPageBytes = new byte[compressedPageSize];
  fin.readFully(dictPageBytes);

  BytesInput bin = BytesInput.from(dictPageBytes);

  return new DictionaryPage(
      bin, uncompressedPageSize, dictHeader.getNum_values(),
      converter.getEncoding(dictHeader.getEncoding()));
}
Example 4
Source File: ParquetColumnChunk.java From presto with Apache License 2.0 | 5 votes |
public PageReader readAllPages() throws IOException {
  List<DataPage> pages = new ArrayList<>();
  DictionaryPage dictionaryPage = null;
  long valueCount = 0;
  while (valueCount < descriptor.getColumnChunkMetaData().getValueCount()) {
    PageHeader pageHeader = readPageHeader();
    int uncompressedPageSize = pageHeader.getUncompressed_page_size();
    int compressedPageSize = pageHeader.getCompressed_page_size();
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new ParquetCorruptionException("%s has more than one dictionary page in column chunk",
              descriptor.getColumnDescriptor());
        }
        dictionaryPage = readDictionaryPage(pageHeader, uncompressedPageSize, compressedPageSize);
        break;
      case DATA_PAGE:
        valueCount += readDataPageV1(pageHeader, uncompressedPageSize, compressedPageSize, pages);
        break;
      case DATA_PAGE_V2:
        valueCount += readDataPageV2(pageHeader, uncompressedPageSize, compressedPageSize, pages);
        break;
      default:
        input.skip(compressedPageSize);
        break;
    }
  }
  return new PageReader(descriptor.getColumnChunkMetaData().getCodec(), pages, dictionaryPage);
}
Example 5
Source File: PageReader.java From dremio-oss with Apache License 2.0 | 5 votes |
private void readDictionaryPage(final PageHeader pageHeader,
                                final ColumnReader<?> parentStatus) throws IOException {
  int compressedSize = pageHeader.getCompressed_page_size();
  int uncompressedSize = pageHeader.getUncompressed_page_size();
  final ArrowBuf dictionaryData = allocateDictionaryBuffer(uncompressedSize);

  readPage(pageHeader, compressedSize, uncompressedSize, dictionaryData);

  DictionaryPage page = new DictionaryPage(
      asBytesInput(dictionaryData, 0, uncompressedSize),
      pageHeader.uncompressed_page_size,
      pageHeader.dictionary_page_header.num_values,
      valueOf(pageHeader.dictionary_page_header.encoding.name()));

  this.dictionary = page.getEncoding().initDictionary(parentStatus.columnDescriptor, page);
}
Example 6
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0 | 5 votes |
private List<Long> getOffsets(TransParquetFileReader reader, ColumnChunkMetaData chunk) throws IOException {
  List<Long> offsets = new ArrayList<>();
  reader.setStreamPosition(chunk.getStartingPos());
  long readValues = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    long curOffset = reader.getPos();
    PageHeader pageHeader = reader.readPageHeader();
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        offsets.add(curOffset);
        compressionConverter.readBlock(pageHeader.getCompressed_page_size(), reader);
        readValues += headerV1.getNum_values();
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        offsets.add(curOffset);
        int rlLength = headerV2.getRepetition_levels_byte_length();
        compressionConverter.readBlock(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        compressionConverter.readBlock(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        compressionConverter.readBlock(payLoadLength, reader);
        readValues += headerV2.getNum_values();
        break;
      default:
        throw new IOException("Not recognized page type");
    }
  }
  return offsets;
}
Example 7
Source File: CompressionConverter.java From parquet-mr with Apache License 2.0 | 4 votes |
private void processChunk(TransParquetFileReader reader, ParquetFileWriter writer,
                          ColumnChunkMetaData chunk, String createdBy,
                          CompressionCodecName codecName) throws IOException {
  CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
  CompressionCodecFactory.BytesInputDecompressor decompressor = codecFactory.getDecompressor(chunk.getCodec());
  CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(codecName);
  ColumnIndex columnIndex = reader.readColumnIndex(chunk);
  OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);
  reader.setStreamPosition(chunk.getStartingPos());
  DictionaryPage dictionaryPage = null;
  long readValues = 0;
  Statistics statistics = null;
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  int pageIndex = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    PageHeader pageHeader = reader.readPageHeader();
    int compressedPageSize = pageHeader.getCompressed_page_size();
    byte[] pageLoad;
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new IOException("has more than one dictionary page in column chunk");
        }
        DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor,
            pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        writer.writeDictionaryPage(new DictionaryPage(BytesInput.from(pageLoad),
            pageHeader.getUncompressed_page_size(),
            dictPageHeader.getNum_values(),
            converter.getEncoding(dictPageHeader.getEncoding())));
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor,
            pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(),
            headerV1.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV1.getNum_values();
        if (offsetIndex != null) {
          long rowCount = 1 + offsetIndex.getLastRowIndex(pageIndex, totalChunkValues)
              - offsetIndex.getFirstRowIndex(pageIndex);
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
              pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad),
              statistics,
              toIntWithCheck(rowCount),
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()));
        } else {
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
              pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad),
              statistics,
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()));
        }
        pageIndex++;
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        int rlLength = headerV2.getRepetition_levels_byte_length();
        BytesInput rlLevels = readBlockAllocate(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        BytesInput dlLevels = readBlockAllocate(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
        pageLoad = translatePageLoad(reader, headerV2.is_compressed, compressor, decompressor,
            payLoadLength, rawDataLength);
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(),
            headerV2.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV2.getNum_values();
        writer.writeDataPageV2(headerV2.getNum_rows(),
            headerV2.getNum_nulls(),
            headerV2.getNum_values(),
            rlLevels,
            dlLevels,
            converter.getEncoding(headerV2.getEncoding()),
            BytesInput.from(pageLoad),
            rawDataLength,
            statistics);
        pageIndex++;
        break;
      default:
        LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
        break;
    }
  }
}