Java Code Examples for org.apache.parquet.column.values.ValuesReader#initFromPage()
The following examples show how to use
org.apache.parquet.column.values.ValuesReader#initFromPage().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: PrimitiveColumnReader.java From presto with Apache License 2.0 | 6 votes |
/**
 * Builds the values reader for a page's data and initializes it from the given stream.
 *
 * @param dataEncoding encoding that supplies the reader
 * @param valueCount value count forwarded to {@code initFromPage}
 * @param in stream the reader is initialized from
 * @return the initialized reader
 * @throws ParquetDecodingException if the encoding uses a dictionary but {@code dictionary} is
 *     {@code null}, or if {@code initFromPage} fails with an {@link IOException}
 */
private ValuesReader initDataReader(ParquetEncoding dataEncoding, int valueCount, ByteBufferInputStream in) {
  final boolean dictionaryEncoded = dataEncoding.usesDictionary();
  // A dictionary-encoded page cannot be decoded without its dictionary.
  if (dictionaryEncoded && dictionary == null) {
    throw new ParquetDecodingException("Dictionary is missing for Page");
  }

  final ValuesReader reader = dictionaryEncoded
      ? dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary)
      : dataEncoding.getValuesReader(columnDescriptor, VALUES);

  try {
    reader.initFromPage(valueCount, in);
  } catch (IOException e) {
    throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e);
  }
  return reader;
}
Example 2
Source File: BitPackingPerfTest.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Decodes {@code bytes} into {@code result} ten times with the given reader, printing the
 * accumulated time and a throughput figure, then calls {@code verify(result)}.
 *
 * @param bytes encoded input, re-wrapped in a fresh stream for every pass
 * @param result destination array; its length is the number of integers read per pass
 * @param r reader being measured
 * @return total elapsed time over all passes, in nanoseconds
 * @throws IOException if {@code initFromPage} fails
 */
private static long readNTimes(byte[] bytes, int[] result, ValuesReader r) throws IOException {
  final int passes = 10;
  long totalNanos = 0;

  System.out.println();
  System.gc();
  System.out.print("                                             " + r.getClass().getSimpleName());
  System.out.print(" no gc <");

  int pass = 0;
  while (pass < passes) {
    final long start = System.nanoTime();
    r.initFromPage(result.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes)));
    for (int idx = 0; idx < result.length; idx++) {
      result[idx] = r.readInteger();
    }
    final long end = System.nanoTime();
    totalNanos += end - start;
    pass++;
  }

  // Integer arithmetic throughout: the total is truncated to whole microseconds before dividing.
  System.out.println("> read in " + totalNanos / 1000 + "µs " + (passes * result.length / (totalNanos / 1000)) + " values per µs");
  verify(result);
  return totalNanos;
}
Example 3
Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Prepares the repetition-level, definition-level and data readers for a V1 data page.
 *
 * <p>One stream over the page bytes is handed to each reader in turn; the stream position is
 * logged before the definition levels and before the data.
 *
 * @param page the V1 data page to read
 * @throws ParquetDecodingException if any reader fails to initialize with an {@link IOException}
 */
private void readPageV1(DataPageV1 page) {
  ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
  ValuesReader definitionReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(repetitionReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(definitionReader);

  final int valueCount = page.getValueCount();
  try {
    BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} values", pageBytes.size(), valueCount);

    ByteBufferInputStream stream = pageBytes.toInputStream();

    LOG.debug("reading repetition levels at 0");
    repetitionReader.initFromPage(valueCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    definitionReader.initFromPage(valueCount, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, valueCount);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + path, e);
  }

  newPageInitialized(page);
}
Example 4
Source File: AbstractColumnReader.java From flink with Apache License 2.0 | 6 votes |
private void readPageV1(DataPageV1 page) throws IOException { this.pageValueCount = page.getValueCount(); ValuesReader rlReader = page.getRlEncoding().getValuesReader(descriptor, REPETITION_LEVEL); // Initialize the decoders. if (page.getDlEncoding() != Encoding.RLE && descriptor.getMaxDefinitionLevel() != 0) { throw new UnsupportedOperationException("Unsupported encoding: " + page.getDlEncoding()); } int bitWidth = BytesUtils.getWidthFromMaxInt(descriptor.getMaxDefinitionLevel()); this.runLenDecoder = new RunLengthDecoder(bitWidth); try { BytesInput bytes = page.getBytes(); ByteBufferInputStream in = bytes.toInputStream(); rlReader.initFromPage(pageValueCount, in); this.runLenDecoder.initFromStream(pageValueCount, in); prepareNewPage(page.getValueEncoding(), in); } catch (IOException e) { throw new IOException("could not read page " + page + " in col " + descriptor, e); } }
Example 5
Source File: PageIterator.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Sets up the repetition-level and definition-level iterators and the data reader for a V1 page.
 *
 * <p>{@code triplesCount} is taken from the page's value count. One stream over the page bytes
 * is passed to each reader in turn.
 *
 * @param page the V1 data page to read
 * @throws ParquetDecodingException if any reader fails to initialize with an {@link IOException}
 */
private void initFromPage(DataPageV1 page) {
  this.triplesCount = page.getValueCount();

  ValuesReader repetitionReader = page.getRlEncoding().getValuesReader(desc, REPETITION_LEVEL);
  ValuesReader definitionReader = page.getDlEncoding().getValuesReader(desc, DEFINITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);
  this.definitionLevels = new ValuesReaderIntIterator(definitionReader);

  try {
    BytesInput pageBytes = page.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    ByteBufferInputStream stream = pageBytes.toInputStream();

    LOG.debug("reading repetition levels at 0");
    repetitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    definitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(page.getValueEncoding(), stream, page.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + desc, e);
  }
}
Example 6
Source File: BasePageIterator.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Sets up the repetition-level iterator, then delegates definition-level and data reader setup
 * to {@code initDefinitionLevelsReader} and {@code initDataReader}.
 *
 * @param initPage the V1 data page to read
 * @throws ParquetDecodingException if any step fails with an {@link IOException}
 */
protected void initFromPage(DataPageV1 initPage) {
  this.triplesCount = initPage.getValueCount();

  ValuesReader repetitionReader =
      initPage.getRlEncoding().getValuesReader(desc, ValuesType.REPETITION_LEVEL);
  this.repetitionLevels = new ValuesReaderIntIterator(repetitionReader);

  try {
    BytesInput pageBytes = initPage.getBytes();
    LOG.debug("page size {} bytes and {} records", pageBytes.size(), triplesCount);

    ByteBufferInputStream stream = pageBytes.toInputStream();

    LOG.debug("reading repetition levels at 0");
    repetitionReader.initFromPage(triplesCount, stream);

    LOG.debug("reading definition levels at {}", stream.position());
    initDefinitionLevelsReader(initPage, desc, stream, triplesCount);

    LOG.debug("reading data at {}", stream.position());
    initDataReader(initPage.getValueEncoding(), stream, initPage.getValueCount());
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + initPage + " in col " + desc, e);
  }
}
Example 7
Source File: PrimitiveColumnReader.java From presto with Apache License 2.0 | 6 votes |
/**
 * Initializes the repetition and definition level readers for a V1 page and returns the
 * initialized data reader.
 *
 * @param page the V1 data page to read
 * @return the values reader produced by {@code initDataReader}
 * @throws ParquetDecodingException if a level reader fails to initialize with an
 *     {@link IOException}
 */
private ValuesReader readPageV1(DataPageV1 page) {
  ValuesReader repetitionValues =
      page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
  ValuesReader definitionValues =
      page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
  repetitionReader = new LevelValuesReader(repetitionValues);
  definitionReader = new LevelValuesReader(definitionValues);

  try {
    // The same stream is passed to both level readers and then to the data reader.
    ByteBufferInputStream stream = toInputStream(page.getSlice());
    repetitionValues.initFromPage(page.getValueCount(), stream);
    definitionValues.initFromPage(page.getValueCount(), stream);
    return initDataReader(page.getValueEncoding(), page.getValueCount(), stream);
  } catch (IOException e) {
    throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e);
  }
}
Example 8
Source File: PageIterator.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Creates the definition-level reader for the page, exposes it through
 * {@code definitionLevels}, and initializes it from the stream.
 *
 * @param dataPageV1 page whose definition-level encoding supplies the reader
 * @param desc column descriptor passed to the encoding
 * @param in stream the reader is initialized from
 * @param triplesCount value count forwarded to {@code initFromPage}
 * @throws IOException if {@code initFromPage} fails
 */
@Override
protected void initDefinitionLevelsReader(
    DataPageV1 dataPageV1, ColumnDescriptor desc, ByteBufferInputStream in, int triplesCount)
    throws IOException {
  ValuesReader definitionReader =
      dataPageV1.getDlEncoding().getValuesReader(desc, ValuesType.DEFINITION_LEVEL);
  // The iterator field is assigned before the reader is initialized.
  this.definitionLevels = new ValuesReaderIntIterator(definitionReader);
  definitionReader.initFromPage(triplesCount, in);
}
Example 9
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testSkipInBinaryDictionary() throws Exception { ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(1000, 10000); writeRepeated(100, cw, "a"); writeDistinct(100, cw, "b"); assertEquals(PLAIN_DICTIONARY, cw.getEncoding()); // Test skip and skip-n with dictionary encoding ByteBufferInputStream stream = cw.getBytes().toInputStream(); DictionaryValuesReader cr = initDicReader(cw, BINARY); cr.initFromPage(200, stream); for (int i = 0; i < 100; i += 2) { assertEquals(Binary.fromString("a" + i % 10), cr.readBytes()); cr.skip(); } int skipCount; for (int i = 0; i < 100; i += skipCount + 1) { skipCount = (100 - i) / 2; assertEquals(Binary.fromString("b" + i), cr.readBytes()); cr.skip(skipCount); } // Ensure fallback writeDistinct(1000, cw, "c"); assertEquals(PLAIN, cw.getEncoding()); // Test skip and skip-n with plain encoding (after fallback) ValuesReader plainReader = new BinaryPlainValuesReader(); plainReader.initFromPage(1200, cw.getBytes().toInputStream()); plainReader.skip(200); for (int i = 0; i < 100; i += 2) { assertEquals("c" + i, plainReader.readBytes().toStringUsingUTF8()); plainReader.skip(); } for (int i = 100; i < 1000; i += skipCount + 1) { skipCount = (1000 - i) / 2; assertEquals(Binary.fromString("c" + i), plainReader.readBytes()); plainReader.skip(skipCount); } }
Example 10
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test public void testBinaryDictionaryFallBack() throws IOException { int slabSize = 100; int maxDictionaryByteSize = 50; final ValuesWriter cw = newPlainBinaryDictionaryValuesWriter(maxDictionaryByteSize, slabSize); int fallBackThreshold = maxDictionaryByteSize; int dataSize=0; for (long i = 0; i < 100; i++) { Binary binary = Binary.fromString("str" + i); cw.writeBytes(binary); dataSize += (binary.length() + 4); if (dataSize < fallBackThreshold) { assertEquals(PLAIN_DICTIONARY, cw.getEncoding()); } else { assertEquals(PLAIN, cw.getEncoding()); } } //Fallbacked to Plain encoding, therefore use PlainValuesReader to read it back ValuesReader reader = new BinaryPlainValuesReader(); reader.initFromPage(100, cw.getBytes().toInputStream()); for (long i = 0; i < 100; i++) { assertEquals(Binary.fromString("str" + i), reader.readBytes()); } //simulate cutting the page cw.reset(); assertEquals(0, cw.getBufferedSize()); }
Example 11
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
private void roundTripLong(FallbackValuesWriter<PlainLongDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException { int fallBackThreshold = maxDictionaryByteSize / 8; for (long i = 0; i < 100; i++) { cw.writeLong(i); if (i < fallBackThreshold) { assertEquals(cw.getEncoding(), PLAIN_DICTIONARY); } else { assertEquals(cw.getEncoding(), PLAIN); } } reader.initFromPage(100, cw.getBytes().toInputStream()); for (long i = 0; i < 100; i++) { assertEquals(i, reader.readLong()); } // Test skip with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); for (int i = 0; i < 100; i += 2) { assertEquals(i, reader.readLong()); reader.skip(); } // Test skip-n with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); int skipCount; for (int i = 0; i < 100; i += skipCount + 1) { skipCount = (100 - i) / 2; assertEquals(i, reader.readLong()); reader.skip(skipCount); } }
Example 12
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
private void roundTripDouble(FallbackValuesWriter<PlainDoubleDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException { int fallBackThreshold = maxDictionaryByteSize / 8; for (double i = 0; i < 100; i++) { cw.writeDouble(i); if (i < fallBackThreshold) { assertEquals(cw.getEncoding(), PLAIN_DICTIONARY); } else { assertEquals(cw.getEncoding(), PLAIN); } } reader.initFromPage(100, cw.getBytes().toInputStream()); for (double i = 0; i < 100; i++) { assertEquals(i, reader.readDouble(), 0.00001); } // Test skip with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); for (int i = 0; i < 100; i += 2) { assertEquals(i, reader.readDouble(), 0.0); reader.skip(); } // Test skip-n with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); int skipCount; for (int i = 0; i < 100; i += skipCount + 1) { skipCount = (100 - i) / 2; assertEquals(i, reader.readDouble(), 0.0); reader.skip(skipCount); } }
Example 13
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
private void roundTripInt(FallbackValuesWriter<PlainIntegerDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException { int fallBackThreshold = maxDictionaryByteSize / 4; for (int i = 0; i < 100; i++) { cw.writeInteger(i); if (i < fallBackThreshold) { assertEquals(cw.getEncoding(), PLAIN_DICTIONARY); } else { assertEquals(cw.getEncoding(), PLAIN); } } reader.initFromPage(100, cw.getBytes().toInputStream()); for (int i = 0; i < 100; i++) { assertEquals(i, reader.readInteger()); } // Test skip with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); for (int i = 0; i < 100; i += 2) { assertEquals(i, reader.readInteger()); reader.skip(); } // Test skip-n with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); int skipCount; for (int i = 0; i < 100; i += skipCount + 1) { skipCount = (100 - i) / 2; assertEquals(i, reader.readInteger()); reader.skip(skipCount); } }
Example 14
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 5 votes |
private void roundTripFloat(FallbackValuesWriter<PlainFloatDictionaryValuesWriter, PlainValuesWriter> cw, ValuesReader reader, int maxDictionaryByteSize) throws IOException { int fallBackThreshold = maxDictionaryByteSize / 4; for (float i = 0; i < 100; i++) { cw.writeFloat(i); if (i < fallBackThreshold) { assertEquals(cw.getEncoding(), PLAIN_DICTIONARY); } else { assertEquals(cw.getEncoding(), PLAIN); } } reader.initFromPage(100, cw.getBytes().toInputStream()); for (float i = 0; i < 100; i++) { assertEquals(i, reader.readFloat(), 0.00001); } // Test skip with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); for (int i = 0; i < 100; i += 2) { assertEquals(i, reader.readFloat(), 0.0f); reader.skip(); } // Test skip-n with plain encoding reader.initFromPage(100, cw.getBytes().toInputStream()); int skipCount; for (int i = 0; i < 100; i += skipCount + 1) { skipCount = (100 - i) / 2; assertEquals(i, reader.readFloat(), 0.0f); reader.skip(skipCount); } }
Example 15
Source File: TestBitPackingColumn.java From parquet-mr with Apache License 2.0 | 5 votes |
private void validateEncodeDecode(int bitLength, int[] vals, String expected) throws IOException { for (PACKING_TYPE type : PACKING_TYPE.values()) { LOG.debug("{}", type); final int bound = (int)Math.pow(2, bitLength) - 1; ValuesWriter w = type.getWriter(bound); for (int i : vals) { w.writeInteger(i); } byte[] bytes = w.getBytes().toByteArray(); LOG.debug("vals ("+bitLength+"): " + TestBitPacking.toString(vals)); LOG.debug("bytes: {}", TestBitPacking.toString(bytes)); assertEquals(type.toString(), expected, TestBitPacking.toString(bytes)); ValuesReader r = type.getReader(bound); r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes))); int[] result = new int[vals.length]; for (int i = 0; i < result.length; i++) { result[i] = r.readInteger(); } LOG.debug("result: {}", TestBitPacking.toString(result)); assertArrayEquals(type + " result: " + TestBitPacking.toString(result), vals, result); // Test skipping r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes))); for (int i = 0; i < vals.length; i += 2) { assertEquals(vals[i], r.readInteger()); r.skip(); } // Test n-skipping r.initFromPage(vals.length, ByteBufferInputStream.wrap(ByteBuffer.wrap(bytes))); int skipCount; for (int i = 0; i < vals.length; i += skipCount + 1) { skipCount = (vals.length - i) / 2; assertEquals(vals[i], r.readInteger()); r.skip(skipCount); } } }
Example 16
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Reads {@code COUNT} values from {@code bytes} and asserts that value {@code i} decodes to
 * {@code prefix + i}.
 *
 * @param COUNT number of values to read
 * @param bytes encoded page bytes
 * @param cr reader to initialize and read from
 * @param prefix expected string prefix of every value
 * @throws IOException if the reader fails to initialize
 */
private void checkDistinct(int COUNT, BytesInput bytes, ValuesReader cr, String prefix) throws IOException {
  cr.initFromPage(COUNT, bytes.toInputStream());
  int index = 0;
  while (index < COUNT) {
    String expected = prefix + index;
    Assert.assertEquals(expected, cr.readBytes().toStringUsingUTF8());
    index++;
  }
}
Example 17
Source File: TestDictionary.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Reads {@code COUNT} values from {@code bytes} and asserts that value {@code i} decodes to
 * {@code prefix + (i % 10)}.
 *
 * @param COUNT number of values to read
 * @param bytes encoded page bytes
 * @param cr reader to initialize and read from
 * @param prefix expected string prefix of every value
 * @throws IOException if the reader fails to initialize
 */
private void checkRepeated(int COUNT, BytesInput bytes, ValuesReader cr, String prefix) throws IOException {
  cr.initFromPage(COUNT, bytes.toInputStream());
  int index = 0;
  while (index < COUNT) {
    // The expected suffix cycles through 0..9.
    String expected = prefix + index % 10;
    Assert.assertEquals(expected, cr.readBytes().toStringUsingUTF8());
    index++;
  }
}
Example 18
Source File: BenchmarkReadingRandomIntegers.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Initializes the reader from {@code deltaBytes} and reads {@code data.length} integers,
 * discarding each value.
 *
 * @param reader reader to drive
 * @param deltaBytes encoded bytes to read from
 * @throws IOException if the reader fails to initialize
 */
private void readData(ValuesReader reader, byte[] deltaBytes) throws IOException {
  ByteBufferInputStream stream = ByteBufferInputStream.wrap(ByteBuffer.wrap(deltaBytes));
  reader.initFromPage(data.length, stream);
  int index = 0;
  while (index < data.length) {
    // The decoded value is intentionally not used.
    reader.readInteger();
    index++;
  }
}