org.apache.parquet.format.FileMetaData Java Examples
The following examples show how to use
org.apache.parquet.format.FileMetaData.
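org.apache.parquet.format.FileMetaData is the Thrift-generated class that represents a Parquet file footer. As a quick orientation before the project examples, here is a minimal sketch (not taken from any of the projects below; the class name and values are made up) that builds a FileMetaData with the constructor used in Example #4, serializes it with Util.writeFileMetaData, and reads it back with Util.readFileMetaData. The empty schema and row-group lists are placeholders for brevity.

import java.io.ByteArrayInputStream;
import java.io.ByteArrayOutputStream;
import java.util.ArrayList;
import java.util.Collections;
import java.util.List;

import org.apache.parquet.format.FileMetaData;
import org.apache.parquet.format.RowGroup;
import org.apache.parquet.format.SchemaElement;
import org.apache.parquet.format.Util;

public class FileMetaDataRoundTrip {
  public static void main(String[] args) throws Exception {
    // Placeholder lists; a real footer would carry the full schema and row groups.
    List<SchemaElement> schema = Collections.emptyList();
    List<RowGroup> rowGroups = new ArrayList<>();

    // Constructor arguments: version, schema, num_rows, row_groups (as in Example #4).
    FileMetaData fileMetaData = new FileMetaData(1, schema, 0, rowGroups);
    fileMetaData.setCreated_by("example writer");

    // Serialize the Thrift footer to bytes and read it back (as in Examples #5 and #7).
    ByteArrayOutputStream out = new ByteArrayOutputStream();
    Util.writeFileMetaData(fileMetaData, out);
    FileMetaData roundTripped = Util.readFileMetaData(new ByteArrayInputStream(out.toByteArray()));

    System.out.println("version = " + roundTripped.getVersion());
  }
}

The per-project examples below exercise the same constructors, setters, and Util helpers against real metadata.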
Example #1
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Verifies that the splits end up being a partition of the row groups:
 * each row group is found exactly once.
 * @param md the file metadata to split
 * @param splitWidth the width of each split
 */
private void verifyAllFilters(FileMetaData md, long splitWidth) {
  Set<Long> offsetsFound = new TreeSet<Long>();
  for (long start = 0; start < fileSize(md); start += splitWidth) {
    FileMetaData filtered = filter(md, start, start + splitWidth);
    for (RowGroup rg : filtered.getRow_groups()) {
      long o = getOffset(rg);
      if (offsetsFound.contains(o)) {
        fail("found the offset twice: " + o);
      } else {
        offsetsFound.add(o);
      }
    }
  }
  if (offsetsFound.size() != md.row_groups.size()) {
    fail("missing row groups, " + "found: " + offsetsFound + "\nexpected " + md.getRow_groups());
  }
}
Example #2
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
public FileMetaData toParquetMetadata(int currentVersion, ParquetMetadata parquetMetadata) {
  List<BlockMetaData> blocks = parquetMetadata.getBlocks();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long numRows = 0;
  for (BlockMetaData block : blocks) {
    numRows += block.getRowCount();
    addRowGroup(parquetMetadata, rowGroups, block);
  }
  FileMetaData fileMetaData = new FileMetaData(
      currentVersion,
      toParquetSchema(parquetMetadata.getFileMetaData().getSchema()),
      numRows,
      rowGroups);

  Set<Entry<String, String>> keyValues = parquetMetadata.getFileMetaData().getKeyValueMetaData().entrySet();
  for (Entry<String, String> keyValue : keyValues) {
    addKeyValue(fileMetaData, keyValue.getKey(), keyValue.getValue());
  }

  fileMetaData.setCreated_by(parquetMetadata.getFileMetaData().getCreatedBy());
  fileMetaData.setColumn_orders(getColumnOrders(parquetMetadata.getFileMetaData().getSchema()));
  return fileMetaData;
}
Example #3
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
static FileMetaData filterFileMetaDataByMidpoint(FileMetaData metaData, RangeMetadataFilter filter) {
  List<RowGroup> rowGroups = metaData.getRow_groups();
  List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
  for (RowGroup rowGroup : rowGroups) {
    long totalSize = 0;
    long startIndex = getOffset(rowGroup.getColumns().get(0));
    for (ColumnChunk col : rowGroup.getColumns()) {
      totalSize += col.getMeta_data().getTotal_compressed_size();
    }
    long midPoint = startIndex + totalSize / 2;
    if (filter.contains(midPoint)) {
      newRowGroups.add(rowGroup);
    }
  }
  metaData.setRow_groups(newRowGroups);
  return metaData;
}
Example #4
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private FileMetaData metadata(long... sizes) {
  List<SchemaElement> schema = emptyList();
  List<RowGroup> rowGroups = new ArrayList<RowGroup>();
  long offset = 0;
  for (long size : sizes) {
    ColumnChunk columnChunk = new ColumnChunk(offset);
    columnChunk.setMeta_data(new ColumnMetaData(
        INT32,
        Collections.<org.apache.parquet.format.Encoding>emptyList(),
        Collections.<String>emptyList(),
        UNCOMPRESSED, 10L, size * 2, size, offset));
    rowGroups.add(new RowGroup(Arrays.asList(columnChunk), size, 1));
    offset += size;
  }
  return new FileMetaData(1, schema, sizes.length, rowGroups);
}
Example #5
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testParquetMetadataConverterWithoutDictionary() throws IOException {
  ParquetMetadata parquetMetaData = createParquetMetaData(null, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be false
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertFalse(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream = new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata pmd2 = converter.fromParquetMetadata(fmd2);

  long dicOffsetConverted = pmd2.getBlocks().get(0).getColumns().get(0).getDictionaryPageOffset();
  Assert.assertEquals(0, dicOffsetConverted);
}
Example #6
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // First element in collection is default `root` element. We skip it to maintain key in `a` format instead of `root`.`a`,
  // and thus to avoid the need to cut it out again when comparing with SchemaPath string representation.
  if (iter.hasNext()) {
    iter.next();
  }
  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
Example #7
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test
public void testParquetMetadataConverterWithDictionary() throws IOException {
  ParquetMetadata parquetMetaData = createParquetMetaData(Encoding.PLAIN_DICTIONARY, Encoding.PLAIN);

  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  FileMetaData fmd1 = converter.toParquetMetadata(1, parquetMetaData);

  // Flag should be true
  fmd1.row_groups.forEach(rowGroup -> rowGroup.columns.forEach(column -> {
    assertTrue(column.meta_data.isSetDictionary_page_offset());
  }));

  ByteArrayOutputStream metaDataOutputStream = new ByteArrayOutputStream();
  Util.writeFileMetaData(fmd1, metaDataOutputStream);
  ByteArrayInputStream metaDataInputStream = new ByteArrayInputStream(metaDataOutputStream.toByteArray());
  FileMetaData fmd2 = Util.readFileMetaData(metaDataInputStream);
  ParquetMetadata parquetMetaDataConverted = converter.fromParquetMetadata(fmd2);

  long dicOffsetOriginal = parquetMetaData.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();
  long dicOffsetConverted = parquetMetaDataConverted.getBlocks().get(0).getColumns().get(0)
      .getDictionaryPageOffset();

  Assert.assertEquals(dicOffsetOriginal, dicOffsetConverted);
}
Example #8
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding, Encoding dataEncoding) {
  MessageType schema = parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
      new org.apache.parquet.hadoop.metadata.FileMetaData(schema, new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  EncodingStats.Builder builder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    builder.addDictEncoding(dicEncoding).build();
  }
  builder.addDataEncoding(dataEncoding);
  EncodingStats es = builder.build();
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.INT32;
  ColumnPath p = ColumnPath.get("col");
  CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
  blockMetaData.addColumn(md);
  blockMetaDataList.add(blockMetaData);
  return new ParquetMetadata(fileMetaData, blockMetaDataList);
}
Example #9
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
private long fileSize(FileMetaData md) {
  long size = 0;
  for (RowGroup rg : md.getRow_groups()) {
    size += rg.total_byte_size;
  }
  return size;
}
Example #10
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
private void verifyMD(FileMetaData md, long... offsets) {
  assertEquals(offsets.length, md.row_groups.size());
  for (int i = 0; i < offsets.length; i++) {
    long offset = offsets[i];
    RowGroup rowGroup = md.getRow_groups().get(i);
    assertEquals(offset, getOffset(rowGroup));
  }
}
Example #11
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
static FileMetaData filterFileMetaDataByStart(FileMetaData metaData, OffsetMetadataFilter filter) {
  List<RowGroup> rowGroups = metaData.getRow_groups();
  List<RowGroup> newRowGroups = new ArrayList<RowGroup>();
  for (RowGroup rowGroup : rowGroups) {
    long startIndex = getOffset(rowGroup.getColumns().get(0));
    if (filter.contains(startIndex)) {
      newRowGroups.add(rowGroup);
    }
  }
  metaData.setRow_groups(newRowGroups);
  return metaData;
}
Example #12
Source File: ParquetReaderUtility.java From dremio-oss with Apache License 2.0 | 5 votes |
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
Example #13
Source File: ParquetWriter.java From presto with Apache License 2.0 | 5 votes |
static Slice getFooter(List<RowGroup> rowGroups, MessageType messageType)
    throws IOException
{
  FileMetaData fileMetaData = new FileMetaData();
  fileMetaData.setVersion(1);
  fileMetaData.setSchema(MessageTypeConverter.toParquetSchema(messageType));
  long totalRows = rowGroups.stream().mapToLong(RowGroup::getNum_rows).sum();
  fileMetaData.setNum_rows(totalRows);
  fileMetaData.setRow_groups(ImmutableList.copyOf(rowGroups));

  DynamicSliceOutput dynamicSliceOutput = new DynamicSliceOutput(40);
  Util.writeFileMetaData(fileMetaData, dynamicSliceOutput);
  return dynamicSliceOutput.slice();
}
Example #14
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 4 votes |
public ParquetMetadata fromParquetMetadata(FileMetaData parquetMetadata) throws IOException {
  MessageType messageType = fromParquetSchema(parquetMetadata.getSchema(), parquetMetadata.getColumn_orders());
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();
  List<RowGroup> row_groups = parquetMetadata.getRow_groups();
  if (row_groups != null) {
    for (RowGroup rowGroup : row_groups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      List<ColumnChunk> columns = rowGroup.getColumns();
      String filePath = columns.get(0).getFile_path();
      for (ColumnChunk columnChunk : columns) {
        if ((filePath == null && columnChunk.getFile_path() != null)
            || (filePath != null && !filePath.equals(columnChunk.getFile_path()))) {
          throw new ParquetDecodingException("all column chunks of the same row group must be in the same file for now");
        }
        ColumnMetaData metaData = columnChunk.meta_data;
        ColumnPath path = getPath(metaData);
        ColumnChunkMetaData column = ColumnChunkMetaData.get(
            path,
            messageType.getType(path.toArray()).asPrimitiveType(),
            fromFormatCodec(metaData.codec),
            convertEncodingStats(metaData.getEncoding_stats()),
            fromFormatEncodings(metaData.encodings),
            fromParquetStatistics(
                parquetMetadata.getCreated_by(),
                metaData.statistics,
                messageType.getType(path.toArray()).asPrimitiveType()),
            metaData.data_page_offset,
            metaData.dictionary_page_offset,
            metaData.num_values,
            metaData.total_compressed_size,
            metaData.total_uncompressed_size);
        column.setColumnIndexReference(toColumnIndexReference(columnChunk));
        column.setOffsetIndexReference(toOffsetIndexReference(columnChunk));
        column.setBloomFilterOffset(metaData.bloom_filter_offset);
        // TODO
        // index_page_offset
        // key_value_metadata
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }
  Map<String, String> keyValueMetaData = new HashMap<String, String>();
  List<KeyValue> key_value_metadata = parquetMetadata.getKey_value_metadata();
  if (key_value_metadata != null) {
    for (KeyValue keyValue : key_value_metadata) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  return new ParquetMetadata(
      new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, parquetMetadata.getCreated_by()),
      blocks);
}
Example #15
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 4 votes |
private FileMetaData filter(FileMetaData md, long start, long end) {
  return filterFileMetaDataByMidpoint(new FileMetaData(md),
      new ParquetMetadataConverter.RangeMetadataFilter(start, end));
}
Example #16
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 4 votes |
private FileMetaData find(FileMetaData md, Long... blockStart) {
  return filterFileMetaDataByStart(new FileMetaData(md),
      new ParquetMetadataConverter.OffsetMetadataFilter(
          Sets.newHashSet((Long[]) blockStart)));
}
Example #17
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 4 votes |
private FileMetaData find(FileMetaData md, long blockStart) {
  return filterFileMetaDataByStart(new FileMetaData(md),
      new ParquetMetadataConverter.OffsetMetadataFilter(
          Sets.newHashSet(blockStart)));
}
Example #18
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0 | 4 votes |
private static void addKeyValue(FileMetaData fileMetaData, String key, String value) {
  KeyValue keyValue = new KeyValue(key);
  keyValue.value = value;
  fileMetaData.addToKey_value_metadata(keyValue);
}
Example #19
Source File: MetadataReader.java From presto with Apache License 2.0 | 4 votes |
public static ParquetMetadata readFooter(FSDataInputStream inputStream, Path file, long fileSize)
    throws IOException
{
  // Parquet File Layout:
  //
  // MAGIC
  // variable: Data
  // variable: Metadata
  // 4 bytes: MetadataLength
  // MAGIC

  validateParquet(fileSize >= MAGIC.length + PARQUET_METADATA_LENGTH + MAGIC.length, "%s is not a valid Parquet File", file);
  long metadataLengthIndex = fileSize - PARQUET_METADATA_LENGTH - MAGIC.length;

  InputStream footerStream = readFully(inputStream, metadataLengthIndex, PARQUET_METADATA_LENGTH + MAGIC.length);
  int metadataLength = readIntLittleEndian(footerStream);

  byte[] magic = new byte[MAGIC.length];
  footerStream.read(magic);
  validateParquet(Arrays.equals(MAGIC, magic), "Not valid Parquet file: %s expected magic number: %s got: %s", file, Arrays.toString(MAGIC), Arrays.toString(magic));

  long metadataIndex = metadataLengthIndex - metadataLength;
  validateParquet(
      metadataIndex >= MAGIC.length && metadataIndex < metadataLengthIndex,
      "Corrupted Parquet file: %s metadata index: %s out of range",
      file,
      metadataIndex);
  InputStream metadataStream = readFully(inputStream, metadataIndex, metadataLength);
  FileMetaData fileMetaData = readFileMetaData(metadataStream);
  List<SchemaElement> schema = fileMetaData.getSchema();
  validateParquet(!schema.isEmpty(), "Empty Parquet schema in file: %s", file);

  MessageType messageType = readParquetSchema(schema);
  List<BlockMetaData> blocks = new ArrayList<>();
  List<RowGroup> rowGroups = fileMetaData.getRow_groups();
  if (rowGroups != null) {
    for (RowGroup rowGroup : rowGroups) {
      BlockMetaData blockMetaData = new BlockMetaData();
      blockMetaData.setRowCount(rowGroup.getNum_rows());
      blockMetaData.setTotalByteSize(rowGroup.getTotal_byte_size());
      List<ColumnChunk> columns = rowGroup.getColumns();
      validateParquet(!columns.isEmpty(), "No columns in row group: %s", rowGroup);
      String filePath = columns.get(0).getFile_path();
      for (ColumnChunk columnChunk : columns) {
        validateParquet(
            (filePath == null && columnChunk.getFile_path() == null)
                || (filePath != null && filePath.equals(columnChunk.getFile_path())),
            "all column chunks of the same row group must be in the same file");
        ColumnMetaData metaData = columnChunk.meta_data;
        String[] path = metaData.path_in_schema.stream()
            .map(value -> value.toLowerCase(Locale.ENGLISH))
            .toArray(String[]::new);
        ColumnPath columnPath = ColumnPath.get(path);
        PrimitiveType primitiveType = messageType.getType(columnPath.toArray()).asPrimitiveType();
        ColumnChunkMetaData column = ColumnChunkMetaData.get(
            columnPath,
            primitiveType,
            CompressionCodecName.fromParquet(metaData.codec),
            PARQUET_METADATA_CONVERTER.convertEncodingStats(metaData.encoding_stats),
            readEncodings(metaData.encodings),
            readStats(Optional.ofNullable(fileMetaData.getCreated_by()), Optional.ofNullable(metaData.statistics), primitiveType),
            metaData.data_page_offset,
            metaData.dictionary_page_offset,
            metaData.num_values,
            metaData.total_compressed_size,
            metaData.total_uncompressed_size);
        blockMetaData.addColumn(column);
      }
      blockMetaData.setPath(filePath);
      blocks.add(blockMetaData);
    }
  }

  Map<String, String> keyValueMetaData = new HashMap<>();
  List<KeyValue> keyValueList = fileMetaData.getKey_value_metadata();
  if (keyValueList != null) {
    for (KeyValue keyValue : keyValueList) {
      keyValueMetaData.put(keyValue.key, keyValue.value);
    }
  }
  return new ParquetMetadata(new org.apache.parquet.hadoop.metadata.FileMetaData(messageType, keyValueMetaData, fileMetaData.getCreated_by()), blocks);
}