parquet.hadoop.metadata.BlockMetaData Java Examples
The following examples show how to use parquet.hadoop.metadata.BlockMetaData, the class that describes a single row group (block) of a Parquet file.
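Before the project-specific examples, here is a minimal, self-contained sketch of the most common way to obtain BlockMetaData instances: read a file's footer and iterate over its row groups. It uses the pre-Apache parquet.* namespace that the examples below use; the class name BlockMetaDataSketch and the path /tmp/data.parquet are placeholders, not taken from any of the projects.

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;

import parquet.hadoop.ParquetFileReader;
import parquet.hadoop.metadata.BlockMetaData;
import parquet.hadoop.metadata.ParquetMetadata;

public class BlockMetaDataSketch
{
    public static void main(String[] args) throws IOException
    {
        // Placeholder path to an existing Parquet file.
        Path file = new Path("/tmp/data.parquet");
        Configuration conf = new Configuration();

        // Read only the footer; this does not scan the data pages.
        ParquetMetadata footer = ParquetFileReader.readFooter(conf, file);

        // Each BlockMetaData describes one row group.
        for (BlockMetaData block : footer.getBlocks()) {
            System.out.println("rows=" + block.getRowCount()
                    + " totalByteSize=" + block.getTotalByteSize()
                    + " compressedSize=" + block.getCompressedSize());
        }
    }
}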
Example #1
Source File: ParquetReader.java, from paraflow, Apache License 2.0
public ParquetReader(MessageType fileSchema,
                     MessageType requestedSchema,
                     List<BlockMetaData> blocks,
                     ParquetDataSource dataSource,
                     TypeManager typeManager)
{
    this.fileSchema = fileSchema;
    this.requestedSchema = requestedSchema;
    this.blocks = blocks;
    this.dataSource = dataSource;
    this.typeManager = typeManager;
    initializeColumnReaders();
}
Example #2
Source File: ParquetMetadataStat.java, from rainbow, Apache License 2.0
/**
 * Get the blocks (row groups) of the parquet files.
 * @return the list of row-group metadata
 */
public List<BlockMetaData> getBlocks()
{
    if (this.blockMetaDataList == null || this.blockMetaDataList.size() == 0) {
        this.blockMetaDataList = new ArrayList<BlockMetaData>();
        for (ParquetFileMetadata meta : this.fileMetaDataList) {
            this.blockMetaDataList.addAll(meta.getBlocks());
        }
    }
    return this.blockMetaDataList;
}
Example #3
Source File: ParquetMetadataStat.java, from rainbow, Apache License 2.0
/**
 * Get the total compressed size of the parquet files.
 * @return the total compressed size in bytes
 */
@Override
public long getTotalSize()
{
    long size = 0;
    for (BlockMetaData meta : this.getBlocks()) {
        size += meta.getCompressedSize();
    }
    return size;
}
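Building on Example #3, the following is a small sketch (the method getCompressionRatio is not part of the rainbow project, and it assumes the same imports as the examples above) that combines getTotalByteSize() and getCompressedSize() to estimate an overall compression ratio across row groups.

/**
 * A sketch, not part of the original project, that reports the overall
 * compression ratio across row groups using the same BlockMetaData
 * accessors as the examples above.
 */
public static double getCompressionRatio(List<BlockMetaData> blocks)
{
    long uncompressed = 0;
    long compressed = 0;
    for (BlockMetaData meta : blocks) {
        uncompressed += meta.getTotalByteSize(); // uncompressed size of the row group
        compressed += meta.getCompressedSize();  // compressed size on disk
    }
    return compressed == 0 ? 0.0 : (double) uncompressed / compressed;
}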
Example #4
Source File: TestParquetMetadataStat.java, from rainbow, Apache License 2.0
@Test
public void testGetStat() throws IOException, MetadataException
{
    //ParquetMetadataStat stat = new ParquetMetadataStat("10.172.96.77", 9000, "/tmp/hive-root/hive_2015-02-04_10-57-36_131_1404874572956637570-1/_tmp.-ext-10002");
    ParquetMetadataStat stat = new ParquetMetadataStat("192.168.124.15", 9000, "/msra/parquet_test");

    double[] columnSize = stat.getAvgColumnChunkSize();
    List<String> names = stat.getFieldNames();
    int i = 0;
    double total = 0;
    for (double size : columnSize) {
        //System.out.println(names.get(i) + "\t" + size);
        total += size;
        i++;
    }
    System.out.println(total / 1024 / 1024 + "\t" + stat.getRowGroupCount() + "\t" + stat.getFileCount());

    for (BlockMetaData bm : stat.getBlocks()) {
        System.out.println(bm.getCompressedSize() + ", " + bm.getTotalByteSize() + ", " + bm.getRowCount());
    }

    List<ParquetFileMetadata> metaDatas = stat.getFileMetaData();
    System.out.println(metaDatas.get(0).getFileMetaData().getCreatedBy());

    Map<String, String> keyValueMetaData = metaDatas.get(0).getFileMetaData().getKeyValueMetaData();
    for (String key : keyValueMetaData.keySet()) {
        System.out.println(key + "=" + keyValueMetaData.get(key));
    }

    for (int j = 0; j < names.size(); ++j) {
        System.out.println(names.get(j) + "\t" + columnSize[j] + "\n");
    }
}
Example #5
Source File: MetadataUtils.java, from parquet-tools, Apache License 2.0
public static void showDetails(PrettyPrintWriter out, ParquetMetadata meta)
{
    showDetails(out, meta.getFileMetaData());

    long i = 1;
    for (BlockMetaData bmeta : meta.getBlocks()) {
        out.println();
        showDetails(out, bmeta, i++);
    }
}
Example #6
Source File: MetadataUtils.java, from parquet-tools, Apache License 2.0
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num)
{
    long rows = meta.getRowCount();
    long tbs = meta.getTotalByteSize();

    out.format("row group%s: RC:%d TS:%d%n", (num == null ? "" : " " + num), rows, tbs);
    out.rule('-');
    showDetails(out, meta.getColumns());
}
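Example #6 ends by delegating to a per-column overload, showDetails(out, meta.getColumns()), which is not reproduced on this page. The sketch below is a simplified stand-in, not the actual parquet-tools implementation, showing what such an overload can report with standard ColumnChunkMetaData accessors; it assumes the same imports as the surrounding examples.

/**
 * Simplified stand-in (not the real parquet-tools code) for the per-column
 * overload: prints one line per column chunk in the row group.
 */
private static void showDetails(PrettyPrintWriter out, List<ColumnChunkMetaData> columns)
{
    for (ColumnChunkMetaData column : columns) {
        out.format("%s: VC:%d SZ:%d/%d codec:%s%n",
                column.getPath().toDotString(),    // dotted column path
                column.getValueCount(),            // number of values in the chunk
                column.getTotalSize(),             // compressed size in bytes
                column.getTotalUncompressedSize(), // uncompressed size in bytes
                column.getCodec());                // compression codec
    }
}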
Example #7
Source File: ParquetMetadataReader.java, from paraflow, Apache License 2.0
public static ParquetMetadata readFooter(FileSystem fileSystem, Path file)
        throws IOException
{
    FileStatus fileStatus = fileSystem.getFileStatus(file);
    try (FSDataInputStream inputStream = fileSystem.open(file)) {
        // Parquet File Layout:
        //
        // MAGIC
        // variable: Data
        // variable: Metadata
        // 4 bytes: MetadataLength
        // MAGIC
        long length = fileStatus.getLen();
        validateParquet(length >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
        long metadataLengthIndex = length - PARQUET_METADATA_LENGTH - MAGIC.length;

        inputStream.seek(metadataLengthIndex);
        int metadataLength = readIntLittleEndian(inputStream);

        byte[] magic = new byte[MAGIC.length];
        inputStream.readFully(magic);
        validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

        long metadataIndex = metadataLengthIndex - metadataLength;
        validateParquet(
                metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
                "Corrupted Parquet file: %s metadata index: %s out of range",
                file,
                metadataIndex);
        inputStream.seek(metadataIndex);
        FileMetaData fileMetaData = readFileMetaData(inputStream);
        List<SchemaElement> schema = fileMetaData.getSchema();
        validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

        MessageType messageType = readParquetSchema(schema);
        List<BlockMetaData> blocks = new ArrayList<>();
        List<RowGroup> rowGroups = fileMetaData.getRow_groups();
        if (rowGroups != null) {
            for (RowGroup rowGroup : rowGroups) {
                BlockMetaData blockMetaData = new BlockMetaData();
                blockMetaData.setRowCount(rowGroup.getNum_rows());
                blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
                List<ColumnChunk> columns = rowGroup.getColumns();
                validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
                String filePath = columns.get(0).getFile_path();
                for (ColumnChunk columnChunk : columns) {
                    validateParquet(
                            (filePath == null && columnChunk.getFile_path() == null)
                                    || (filePath != null && filePath.equals(columnChunk.getFile_path())),
                            "all column chunks of the same row group must be in the same file");
                    ColumnMetaData metaData = columnChunk.meta_data;
                    String[] path = metaData.path_in_schema.toArray(new String[metaData.path_in_schema.size()]);
                    ColumnPath columnPath = ColumnPath.get(path);
                    ColumnChunkMetaData column = ColumnChunkMetaData.get(
                            columnPath,
                            messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName(),
                            CompressionCodecName.fromParquet(metaData.codec),
                            readEncodings(metaData.encodings),
                            readStats(metaData.statistics, messageType.getType(columnPath.toArray()).asPrimitiveType().getPrimitiveTypeName()),
                            metaData.data_page_offset,
                            metaData.dictionary_page_offset,
                            metaData.num_values,
                            metaData.total_compressed_size,
                            metaData.total_uncompressed_size);
                    blockMetaData.addColumn(column);
                }
                blockMetaData.setPath(filePath);
                blocks.add(blockMetaData);
            }
        }

        Map<String, String> keyValueMetaData = new HashMap<>();
        List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
        if (keyValueList != null) {
            for (KeyValue keyValue : keyValueList) {
                keyValueMetaData.put(keyValue.key, keyValue.value);
            }
        }
        return new ParquetMetadata(new parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
    }
}
Example #8
Source File: ParaflowPageSourceProvider.java, from paraflow, Apache License 2.0
private Optional<ConnectorPageSource> createParaflowPageSource(
        Path path,
        long start,
        long length,
        List<ParaflowColumnHandle> columns)
{
    Optional<FileSystem> fileSystemOptional = fsFactory.getFileSystem();
    FileSystem fileSystem;
    ParquetDataSource dataSource;
    if (fileSystemOptional.isPresent()) {
        fileSystem = fileSystemOptional.get();
    }
    else {
        throw new RuntimeException("Could not find filesystem for path " + path);
    }
    try {
        dataSource = buildHdfsParquetDataSource(fileSystem, path, start, length);
        // default length is file size, which means whole file is a split
        length = dataSource.getSize();
        ParquetMetadata parquetMetadata = ParquetMetadataReader.readFooter(fileSystem, path);
        FileMetaData fileMetaData = parquetMetadata.getFileMetaData();
        MessageType fileSchema = fileMetaData.getSchema();
        List<Type> fields = columns.stream()
                .filter(column -> column.getColType() != ParaflowColumnHandle.ColumnType.NOTVALID)
                .map(column -> getParquetType(column, fileSchema))
                .filter(Objects::nonNull)
                .collect(Collectors.toList());
        MessageType requestedSchema = new MessageType(fileSchema.getName(), fields);

        List<BlockMetaData> blocks = new ArrayList<>();
        for (BlockMetaData block : parquetMetadata.getBlocks()) {
            long firstDataPage = block.getColumns().get(0).getFirstDataPageOffset();
            if (firstDataPage >= start && firstDataPage < start + length) {
                blocks.add(block);
            }
        }

        ParquetReader parquetReader = new ParquetReader(
                fileSchema,
                requestedSchema,
                blocks,
                dataSource,
                typeManager);
        return Optional.of(new ParaflowPageSource(
                parquetReader,
                dataSource,
                fileSchema,
                requestedSchema,
                length,
                columns,
                typeManager));
    }
    catch (IOException e) {
        log.error(e);
        return Optional.empty();
    }
}
Example #9
Source File: ParquetFileMetadata.java, from rainbow, Apache License 2.0
public List<BlockMetaData> getBlocks()
{
    return this.metaData.getBlocks();
}
Example #10
Source File: TestSplitCombine.java, from spork, Apache License 2.0
@Test
public void test11() throws IOException, InterruptedException
{
    // verify locations in order
    ArrayList<InputSplit> rawSplits = new ArrayList<InputSplit>();

    // first split is a ParquetInputSplit
    rawSplits.add(new ParquetInputSplit(new Path("path1"), 0, 100,
            new String[] { "l1", "l2", "l3" },
            new ArrayList<BlockMetaData>(), "", "",
            new HashMap<String, String>(), new HashMap<String, String>()));
    // second split is a FileSplit
    rawSplits.add(new FileSplit(new Path("path2"), 0, 400,
            new String[] { "l5", "l6", "l1" }));

    List<InputSplit> result = pigInputFormat.getPigSplits(rawSplits, 0, ok, null, true, conf);

    // pig combines the two into one PigSplit
    Assert.assertEquals(result.size(), 1);

    for (InputSplit split : result) {
        PigSplit pigSplit = (PigSplit) split;

        // write to a byte array output stream
        ByteArrayOutputStream outputStream = new ByteArrayOutputStream();
        DataOutput out = new DataOutputStream(outputStream);
        pigSplit.write(out);

        // restore the pig split from the byte array
        ByteArrayInputStream inputStream = new ByteArrayInputStream(outputStream.toByteArray());
        DataInput in = new DataInputStream(inputStream);
        PigSplit anotherSplit = new PigSplit();
        anotherSplit.setConf(conf);
        anotherSplit.readFields(in);

        Assert.assertEquals(500, anotherSplit.getLength());
        Assert.assertEquals(2, anotherSplit.getNumPaths());
        Assert.assertEquals("parquet.hadoop.ParquetInputSplit",
                anotherSplit.getWrappedSplit(0).getClass().getName());
        Assert.assertEquals("org.apache.hadoop.mapreduce.lib.input.FileSplit",
                anotherSplit.getWrappedSplit(1).getClass().getName());
    }
}
Example #11
Source File: MetadataUtils.java, from parquet-tools, Apache License 2.0
public static void showDetails(PrettyPrintWriter out, BlockMetaData meta)
{
    showDetails(out, meta, null);
}