Java Code Examples for org.apache.parquet.schema.MessageType#getColumnDescription()
The following examples show how to use
org.apache.parquet.schema.MessageType#getColumnDescription().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Writes a one-line description of a primitive field to {@code out}.
 *
 * <p>The line contains the field name prefixed with {@code depth} dots, its repetition and
 * primitive type name, the original type when one is set, and, when {@code container} is
 * non-null, the maximum repetition and definition levels of the column.
 *
 * @param out       writer receiving the formatted line
 * @param type      primitive field being described
 * @param depth     number of "." characters placed before the field name
 * @param container schema used to look up the column descriptor; may be {@code null}, in which
 *                  case the R:/D: levels are omitted
 * @param cpath     names of the enclosing fields; temporarily extended with this field's name
 *                  and restored before the descriptor lookup
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath) {
  final String label = Strings.repeat(".", depth) + type.getName();
  final OriginalType originalType = type.getOriginalType();
  final Repetition repetition = type.getRepetition();
  final PrimitiveTypeName primitiveName = type.getPrimitiveTypeName();

  out.format("%s: %s %s", label, repetition, primitiveName);
  if (originalType != null) {
    out.format(" O:%s", originalType);
  }

  if (container == null) {
    out.println();
    return;
  }

  // Build the full column path by pushing this field's name, then pop it again so the
  // caller's list is unchanged when the lookup runs.
  cpath.add(type.getName());
  final String[] fullPath = cpath.toArray(new String[0]);
  cpath.remove(cpath.size() - 1);

  final ColumnDescriptor column = container.getColumnDescription(fullPath);
  final int maxDefinition = column.getMaxDefinitionLevel();
  final int maxRepetition = column.getMaxRepetitionLevel();
  out.format(" R:%d D:%d", maxRepetition, maxDefinition);
  out.println();
}
Example 2
Source File: TestMemColumn.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Writes a single int64 value to an in-memory column store and checks that every value read
 * back has repetition level 0, definition level 0 and the value 42.
 */
@Test
public void testMemColumn() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  ColumnDescriptor path = schema.getColumnDescription(new String[] {"foo", "bar"});
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  // Upper-case long suffix: "42l" is easily misread as 421.
  columnWriter.write(42L, 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, schema);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    assertEquals(42, columnReader.getLong());
    columnReader.consume();
  }
}
Example 3
Source File: TestMemColumn.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Writes a single binary value ("42") to an in-memory column store and checks that every value
 * read back has repetition level 0, definition level 0 and decodes to "42".
 */
@Test
public void testMemColumnBinary() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required binary bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  // Single descriptor variable; the original aliased it through a redundant second local.
  ColumnDescriptor path = mt.getColumnDescription(col);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  columnWriter.write(Binary.fromString("42"), 0, 0);
  memColumnsStore.endRecord();
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    assertEquals("42", columnReader.getBinary().toStringUsingUTF8());
    columnReader.consume();
  }
}
Example 4
Source File: TestMemColumn.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Writes 2000 int64 records to an in-memory column store (page store created with the
 * argument 10) and checks that every value read back has repetition level 0, definition
 * level 0 and the value 42.
 */
@Test
public void testMemColumnSeveralPages() throws Exception {
  MessageType mt = MessageTypeParser.parseMessageType("message msg { required group foo { required int64 bar; } }");
  String[] col = new String[]{"foo", "bar"};
  MemPageStore memPageStore = new MemPageStore(10);
  ColumnWriteStoreV1 memColumnsStore = newColumnWriteStoreImpl(memPageStore);
  // Single descriptor variable; the original aliased it through a redundant second local.
  ColumnDescriptor path = mt.getColumnDescription(col);
  ColumnWriter columnWriter = memColumnsStore.getColumnWriter(path);
  for (int i = 0; i < 2000; i++) {
    // Upper-case long suffix: "42l" is easily misread as 421.
    columnWriter.write(42L, 0, 0);
    memColumnsStore.endRecord();
  }
  memColumnsStore.flush();

  ColumnReader columnReader = getColumnReader(memPageStore, path, mt);
  for (int i = 0; i < columnReader.getTotalValueCount(); i++) {
    // assertEquals takes (expected, actual); keeping that order makes failure messages accurate.
    assertEquals(0, columnReader.getCurrentRepetitionLevel());
    assertEquals(0, columnReader.getCurrentDefinitionLevel());
    assertEquals(42, columnReader.getLong());
    columnReader.consume();
  }
}
Example 5
Source File: PrintFooter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Folds one file footer into the running totals.
 *
 * <p>For every block in the footer this increments {@code blockCount}, adds the block's row
 * count to {@code recordCount}, and forwards each column chunk's descriptor, value count,
 * sizes, encodings and statistics to the per-column {@code add} overload.
 *
 * @param footer footer whose blocks and column chunks are accumulated
 */
private static void add(ParquetMetadata footer) {
  for (BlockMetaData block : footer.getBlocks()) {
    ++blockCount;
    // Looked up per block, as in the original, so a footer without blocks never touches it.
    MessageType fileSchema = footer.getFileMetaData().getSchema();
    recordCount += block.getRowCount();

    List<ColumnChunkMetaData> chunks = block.getColumns();
    for (ColumnChunkMetaData chunk : chunks) {
      String[] columnPath = chunk.getPath().toArray();
      ColumnDescriptor descriptor = fileSchema.getColumnDescription(columnPath);
      add(
          descriptor,
          chunk.getValueCount(),
          chunk.getTotalSize(),
          chunk.getTotalUncompressedSize(),
          chunk.getEncodings(),
          chunk.getStatistics());
    }
  }
}
Example 6
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes a one-line description of a primitive field to {@code out}.
 *
 * <p>The line contains the field name prefixed with {@code depth} dots, its repetition and
 * primitive type name, then either the original type ({@code O:}) or the logical type
 * annotation ({@code L:}) depending on {@code showOriginalTypes}, and finally, when
 * {@code container} is non-null, the column's maximum repetition and definition levels.
 *
 * @param out               writer receiving the formatted line
 * @param type              primitive field being described
 * @param depth             number of "." characters placed before the field name
 * @param container         schema used to look up the column descriptor; may be {@code null},
 *                          in which case the R:/D: levels are omitted
 * @param cpath             names of the enclosing fields; temporarily extended with this
 *                          field's name and restored before the descriptor lookup
 * @param showOriginalTypes {@code true} to print the original type, {@code false} to print the
 *                          logical type annotation
 */
private static void showDetails(PrettyPrintWriter out, PrimitiveType type, int depth, MessageType container, List<String> cpath, boolean showOriginalTypes) {
  final String label = Strings.repeat(".", depth) + type.getName();
  final Repetition repetition = type.getRepetition();
  final PrimitiveTypeName primitiveName = type.getPrimitiveTypeName();
  out.format("%s: %s %s", label, repetition, primitiveName);

  if (!showOriginalTypes) {
    final LogicalTypeAnnotation logicalType = type.getLogicalTypeAnnotation();
    if (logicalType != null) {
      out.format(" L:%s", logicalType);
    }
  } else {
    OriginalType originalType;
    try {
      originalType = type.getOriginalType();
    } catch (Exception ignored) {
      // Any failure while resolving the original type is treated as "no original type".
      originalType = null;
    }
    if (originalType != null) {
      out.format(" O:%s", originalType);
    }
  }

  if (container == null) {
    out.println();
    return;
  }

  // Build the full column path by pushing this field's name, then pop it again so the
  // caller's list is unchanged when the lookup runs.
  cpath.add(type.getName());
  final String[] fullPath = cpath.toArray(new String[0]);
  cpath.remove(cpath.size() - 1);

  final ColumnDescriptor column = container.getColumnDescription(fullPath);
  final int maxDefinition = column.getMaxDefinitionLevel();
  final int maxRepetition = column.getMaxRepetitionLevel();
  out.format(" R:%d D:%d", maxRepetition, maxDefinition);
  out.println();
}
Example 7
Source File: TestInputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes a small two-block Parquet file to {@code file}.
 *
 * <p>The schema has one required binary column {@code a.b}. The first block holds two data
 * pages and the second block one data page; all pages share the same empty statistics object
 * and the file is written uncompressed with empty extra metadata.
 *
 * @param file destination of the Parquet file
 * @throws IOException if the writer fails
 */
private void createParquetFile(File file) throws IOException {
  final Configuration conf = new Configuration();
  final Path target = new Path(file.toURI());

  final MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  final ColumnDescriptor column = schema.getColumnDescription(new String[] {"a", "b"});

  final byte[] firstPayload = {0, 1, 2, 3};
  final byte[] secondPayload = {2, 3, 4, 5};
  final CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;
  final BinaryStatistics stats = new BinaryStatistics();

  final ParquetFileWriter writer = new ParquetFileWriter(conf, schema, target);
  writer.start();

  // First block: two pages, both carrying the first payload.
  writer.startBlock(3);
  writer.startColumn(column, 5, codec);
  writer.writeDataPage(2, 4, BytesInput.from(firstPayload), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(firstPayload), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  // Second block: a single page carrying the second payload.
  writer.startBlock(4);
  writer.startColumn(column, 7, codec);
  writer.writeDataPage(7, 4, BytesInput.from(secondPayload), stats, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  writer.end(new HashMap<String, String>());
}
Example 8
Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Writes a file with one binary column {@code foo} plus a bloom filter containing the hashes
 * of "hello" and "world", then reads the bloom filter back from the first block and checks
 * that both hashes are found.
 */
@Test
public void testBloomFilterWriteRead() throws Exception {
  MessageType schema = MessageTypeParser.parseMessageType("message test { required binary foo; }");
  File testFile = temp.newFile();
  // Only the path is needed; delete the file created by the temp-folder rule before writing.
  testFile.delete();
  Path path = new Path(testFile.toURI());
  Configuration configuration = new Configuration();
  configuration.set("parquet.bloom.filter.column.names", "foo");
  String[] colPath = {"foo"};
  ColumnDescriptor col = schema.getColumnDescription(colPath);
  BinaryStatistics stats1 = new BinaryStatistics();

  ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path);
  w.start();
  w.startBlock(3);
  w.startColumn(col, 5, CODEC);
  w.writeDataPage(2, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.writeDataPage(3, 4, BytesInput.from(BYTES1), stats1, BIT_PACKED, BIT_PACKED, PLAIN);
  w.endColumn();
  BloomFilter blockSplitBloomFilter = new BlockSplitBloomFilter(0);
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("hello")));
  blockSplitBloomFilter.insertHash(blockSplitBloomFilter.hash(Binary.fromString("world")));
  w.addBloomFilter("foo", blockSplitBloomFilter);
  w.endBlock();
  w.end(new HashMap<>());

  ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path);
  // try-with-resources: the reader holds an open input stream and was previously never closed.
  try (ParquetFileReader r = new ParquetFileReader(
      configuration,
      readFooter.getFileMetaData(),
      path,
      Arrays.asList(readFooter.getBlocks().get(0)),
      Arrays.asList(schema.getColumnDescription(colPath)))) {
    BloomFilterReader bloomFilterReader = r.getBloomFilterDataReader(readFooter.getBlocks().get(0));
    BloomFilter bloomFilter = bloomFilterReader.readBloomFilter(readFooter.getBlocks().get(0).getColumns().get(0));
    assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("hello"))));
    assertTrue(bloomFilter.findHash(blockSplitBloomFilter.hash(Binary.fromString("world"))));
  }
}
Example 9
Source File: ParquetMetadataCommand.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Logs a single formatted row describing one column chunk.
 *
 * <p>The row contains the dotted column name (left-aligned to {@code width}), the primitive
 * type (or {@code FIXED[n]} for fixed-length byte arrays), codec, encoding summary, value
 * count, size per value, null count when set, and the min/max summary.
 *
 * @param console logger that receives the row at info level
 * @param width   column width used for the left-aligned name
 * @param column  column chunk metadata to describe
 * @param schema  file schema used to resolve the chunk's primitive type and descriptor
 */
private void printColumnChunk(Logger console, int width, ColumnChunkMetaData column, MessageType schema) {
  final String[] columnPath = column.getPath().toArray();
  final PrimitiveType primitiveType = primitive(schema, columnPath);
  Preconditions.checkNotNull(primitiveType);
  final ColumnDescriptor descriptor = schema.getColumnDescription(columnPath);

  final long totalSize = column.getTotalSize();
  final long valueCount = column.getValueCount();
  final float bytesPerValue = ((float) totalSize) / valueCount;
  final CompressionCodecName codec = column.getCodec();
  final Set<Encoding> encodings = column.getEncodings();
  final EncodingStats encodingStats = column.getEncodingStats();

  // Prefer the detailed encoding stats when present; otherwise fall back to the encoding set.
  final String encodingSummary;
  if (encodingStats == null) {
    encodingSummary = encodingsAsString(encodings, descriptor);
  } else {
    encodingSummary = encodingStatsAsString(encodingStats);
  }

  final Statistics stats = column.getStatistics();
  final String dottedName = column.getPath().toDotString();
  final PrimitiveType.PrimitiveTypeName typeName = primitiveType.getPrimitiveTypeName();

  // The null count is printed only when statistics exist and carry it.
  final String nullCount;
  if (stats != null && stats.isNumNullsSet()) {
    nullCount = String.valueOf(stats.getNumNulls());
  } else {
    nullCount = "";
  }

  final String row;
  if (typeName == PrimitiveType.PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY) {
    row = String.format("%-" + width + "s FIXED[%d] %s %-7s %-9d %-8s %-7s %s",
        dottedName, primitiveType.getTypeLength(), shortCodec(codec), encodingSummary, valueCount,
        humanReadable(bytesPerValue), nullCount, minMaxAsString(stats));
  } else {
    row = String.format("%-" + width + "s %-9s %s %-7s %-9d %-10s %-7s %s",
        dottedName, typeName, shortCodec(codec), encodingSummary, valueCount,
        humanReadable(bytesPerValue), nullCount, minMaxAsString(stats));
  }
  console.info(row);
}
Example 10
Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0 | 4 votes |
@Test public void testWriteReadStatistics() throws Exception { // this test assumes statistics will be read Assume.assumeTrue(!shouldIgnoreStatistics(Version.FULL_VERSION, BINARY)); File testFile = temp.newFile(); testFile.delete(); Path path = new Path(testFile.toURI()); Configuration configuration = new Configuration(); configuration.setBoolean("parquet.strings.signed-min-max.enabled", true); MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b (UTF8);} required group c { required int64 d; }}"); String[] path1 = {"a", "b"}; ColumnDescriptor c1 = schema.getColumnDescription(path1); String[] path2 = {"c", "d"}; ColumnDescriptor c2 = schema.getColumnDescription(path2); byte[] bytes1 = { 0, 1, 2, 3}; byte[] bytes2 = { 1, 2, 3, 4}; byte[] bytes3 = { 2, 3, 4, 5}; byte[] bytes4 = { 3, 4, 5, 6}; CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED; BinaryStatistics statsB1C1P1 = new BinaryStatistics(); BinaryStatistics statsB1C1P2 = new BinaryStatistics(); LongStatistics statsB1C2P1 = new LongStatistics(); LongStatistics statsB1C2P2 = new LongStatistics(); BinaryStatistics statsB2C1P1 = new BinaryStatistics(); LongStatistics statsB2C2P1 = new LongStatistics(); statsB1C1P1.setMinMax(Binary.fromString("s"), Binary.fromString("z")); statsB1C1P2.setMinMax(Binary.fromString("a"), Binary.fromString("b")); statsB1C2P1.setMinMax(2l, 10l); statsB1C2P2.setMinMax(-6l, 4l); statsB2C1P1.setMinMax(Binary.fromString("d"), Binary.fromString("e")); statsB2C2P1.setMinMax(11l, 122l); ParquetFileWriter w = new ParquetFileWriter(configuration, schema, path); w.start(); w.startBlock(3); w.startColumn(c1, 5, codec); w.writeDataPage(2, 4, BytesInput.from(bytes1), statsB1C1P1, BIT_PACKED, BIT_PACKED, PLAIN); w.writeDataPage(3, 4, BytesInput.from(bytes1), statsB1C1P2, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(c2, 6, codec); w.writeDataPage(3, 4, BytesInput.from(bytes2), statsB1C2P1, BIT_PACKED, BIT_PACKED, PLAIN); 
w.writeDataPage(1, 4, BytesInput.from(bytes2), statsB1C2P2, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock(); w.startBlock(4); w.startColumn(c1, 7, codec); w.writeDataPage(7, 4, BytesInput.from(bytes3), statsB2C1P1, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.startColumn(c2, 8, codec); w.writeDataPage(8, 4, BytesInput.from(bytes4), statsB2C2P1, BIT_PACKED, BIT_PACKED, PLAIN); w.endColumn(); w.endBlock(); w.end(new HashMap<String, String>()); ParquetMetadata readFooter = ParquetFileReader.readFooter(configuration, path); for (BlockMetaData block : readFooter.getBlocks()) { for (ColumnChunkMetaData col : block.getColumns()) { col.getPath(); } } // correct statistics BinaryStatistics bs1 = new BinaryStatistics(); bs1.setMinMax(Binary.fromString("a"), Binary.fromString("z")); LongStatistics ls1 = new LongStatistics(); ls1.setMinMax(-6l, 10l); BinaryStatistics bs2 = new BinaryStatistics(); bs2.setMinMax(Binary.fromString("d"), Binary.fromString("e")); LongStatistics ls2 = new LongStatistics(); ls2.setMinMax(11l, 122l); { // assert stats are correct for the first block BinaryStatistics bsout = (BinaryStatistics)readFooter.getBlocks().get(0).getColumns().get(0).getStatistics(); String str = new String(bsout.getMaxBytes()); String str2 = new String(bsout.getMinBytes()); TestUtils.assertStatsValuesEqual(bs1, readFooter.getBlocks().get(0).getColumns().get(0).getStatistics()); TestUtils.assertStatsValuesEqual(ls1, readFooter.getBlocks().get(0).getColumns().get(1).getStatistics()); } { // assert stats are correct for the second block TestUtils.assertStatsValuesEqual(bs2, readFooter.getBlocks().get(1).getColumns().get(0).getStatistics()); TestUtils.assertStatsValuesEqual(ls2, readFooter.getBlocks().get(1).getColumns().get(1).getStatistics()); } }
Example 11
Source File: TestParquetFileWriter.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Writes a two-block Parquet file with columns {@code a.b} and {@code c.d} to {@code path}.
 *
 * <p>Each block contains one chunk per column. The file's extra metadata holds
 * {@code "foo" -> "bar"} plus an entry mapping the file name to itself.
 *
 * @param configuration configuration passed to the writer
 * @param path          destination of the file
 * @param schema        schema that must contain the columns {@code a.b} and {@code c.d}
 * @throws IOException if the writer fails
 */
private void createFile(Configuration configuration, Path path, MessageType schema) throws IOException {
  final ColumnDescriptor columnAB = schema.getColumnDescription(new String[] {"a", "b"});
  final ColumnDescriptor columnCD = schema.getColumnDescription(new String[] {"c", "d"});

  final byte[] payload1 = {0, 1, 2, 3};
  final byte[] payload2 = {1, 2, 3, 4};
  final byte[] payload3 = {2, 3, 4, 5};
  final byte[] payload4 = {3, 4, 5, 6};
  final CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  // One statistics object per column, shared by all of that column's pages.
  final BinaryStatistics statsAB = new BinaryStatistics();
  final BinaryStatistics statsCD = new BinaryStatistics();

  final ParquetFileWriter writer = new ParquetFileWriter(configuration, schema, path);
  writer.start();

  // Block 1
  writer.startBlock(3);
  writer.startColumn(columnAB, 5, codec);
  writer.writeDataPage(2, 4, BytesInput.from(payload1), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(payload1), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.startColumn(columnCD, 6, codec);
  writer.writeDataPage(2, 4, BytesInput.from(payload2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(3, 4, BytesInput.from(payload2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.writeDataPage(1, 4, BytesInput.from(payload2), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  // Block 2
  writer.startBlock(4);
  writer.startColumn(columnAB, 7, codec);
  writer.writeDataPage(7, 4, BytesInput.from(payload3), statsAB, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.startColumn(columnCD, 8, codec);
  writer.writeDataPage(8, 4, BytesInput.from(payload4), statsCD, BIT_PACKED, BIT_PACKED, PLAIN);
  writer.endColumn();
  writer.endBlock();

  final HashMap<String, String> footerExtras = new HashMap<String, String>();
  footerExtras.put("foo", "bar");
  final String fileName = path.getName();
  footerExtras.put(fileName, fileName);
  writer.end(footerExtras);
}
Example 12
Source File: Util.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * Resolves a column name to its descriptor in {@code schema}.
 *
 * <p>The name is split into path segments with the {@code DOT} splitter (presumably on
 * {@code '.'}; confirm against its declaration) before the lookup.
 *
 * @param column column name to resolve
 * @param schema schema to search
 * @return the descriptor of the column at the resulting path
 * @throws IllegalArgumentException presumably, when the schema does not contain the path
 *         (raised by {@code Preconditions.checkArgument})
 */
public static ColumnDescriptor descriptor(String column, MessageType schema) {
  final String[] segments = Iterables.toArray(DOT.split(column), String.class);
  final boolean present = schema.containsPath(segments);
  Preconditions.checkArgument(present, "Schema doesn't have column: " + column);
  return schema.getColumnDescription(segments);
}