org.apache.parquet.format.converter.ParquetMetadataConverter Java Examples
The following examples show how to use
org.apache.parquet.format.converter.ParquetMetadataConverter.
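Most of the examples below share one basic pattern: pass a ParquetMetadataConverter.MetadataFilter (usually ParquetMetadataConverter.NO_FILTER) when reading a file footer, then work with the returned ParquetMetadata. The following is a minimal sketch of that pattern, assuming a local Parquet file; the path and class name are placeholders and are not taken from any of the projects above.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.schema.MessageType;

public class FooterExample {
  public static void main(String[] args) throws Exception {
    // Hypothetical input path; substitute a real Parquet file.
    Path path = new Path("/tmp/example.parquet");
    // NO_FILTER reads the complete footer; ParquetMetadataConverter.range(start, end)
    // can be used instead to keep only the row groups that overlap a split.
    ParquetMetadata footer = ParquetFileReader.readFooter(
        new Configuration(), path, ParquetMetadataConverter.NO_FILTER);
    MessageType schema = footer.getFileMetaData().getSchema();
    System.out.println(schema);
  }
}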
Example #1
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0 | 6 votes |
public ParquetFileReader(InputFile file, ParquetReadOptions options) throws IOException {
  this.converter = new ParquetMetadataConverter(options);
  this.file = file;
  this.f = file.newStream();
  this.options = options;
  try {
    this.footer = readFooter(file, options, f, converter);
  } catch (Exception e) {
    // In case that reading footer throws an exception in the constructor, the new stream
    // should be closed. Otherwise, there's no way to close this outside.
    f.close();
    throw e;
  }
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #2
Source File: ParquetUtils.java From incubator-pinot with Apache License 2.0 | 6 votes |
/**
 * Returns the schema for the given Parquet file path.
 */
public static Schema getParquetSchema(Path path) throws IOException {
  ParquetMetadata footer = ParquetFileReader.readFooter(getConfiguration(), path, ParquetMetadataConverter.NO_FILTER);
  Map<String, String> metaData = footer.getFileMetaData().getKeyValueMetaData();
  String schemaString = metaData.get("parquet.avro.schema");
  if (schemaString == null) {
    // Try the older property
    schemaString = metaData.get("avro.schema");
  }
  if (schemaString != null) {
    return new Schema.Parser().parse(schemaString);
  } else {
    return new AvroSchemaConverter().convert(footer.getFileMetaData().getSchema());
  }
}
Example #3
Source File: SchemaCommand.java From parquet-mr with Apache License 2.0 | 6 votes |
private String getParquetSchema(String source) throws IOException {
  Formats.Format format;
  try (SeekableInput in = openSeekable(source)) {
    format = Formats.detectFormat((InputStream) in);
    in.seek(0);
    switch (format) {
      case PARQUET:
        return new ParquetFileReader(
            getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER)
            .getFileMetaData().getSchema().toString();
      default:
        throw new IllegalArgumentException(String.format(
            "Could not get a Parquet schema for format %s: %s", format, source));
    }
  }
}
Example #4
Source File: TestStatistics.java From parquet-mr with Apache License 2.0 | 6 votes |
@Override
public void test() throws IOException {
  Configuration configuration = new Configuration();
  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration,
      super.fsPath, ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader reader = new ParquetFileReader(configuration,
      metadata.getFileMetaData(),
      super.fsPath,
      metadata.getBlocks(),
      metadata.getFileMetaData().getSchema().getColumns());

  PageStatsValidator validator = new PageStatsValidator();

  PageReadStore pageReadStore;
  while ((pageReadStore = reader.readNextRowGroup()) != null) {
    validator.validate(metadata.getFileMetaData().getSchema(), pageReadStore);
  }
}
Example #5
Source File: CompressionConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private Statistics convertStatistics(String createdBy, PrimitiveType type,
    org.apache.parquet.format.Statistics pageStatistics, ColumnIndex columnIndex,
    int pageIndex, ParquetMetadataConverter converter) throws IOException {
  if (columnIndex != null) {
    if (columnIndex.getNullPages() == null) {
      throw new IOException("columnIndex has null variable 'nullPages' which indicates corrupted data for type: "
          + type.getName());
    }
    if (pageIndex > columnIndex.getNullPages().size()) {
      throw new IOException("There are more pages " + pageIndex
          + " found in the column than in the columnIndex " + columnIndex.getNullPages().size());
    }
    org.apache.parquet.column.statistics.Statistics.Builder statsBuilder =
        org.apache.parquet.column.statistics.Statistics.getBuilderForReading(type);
    statsBuilder.withNumNulls(columnIndex.getNullCounts().get(pageIndex));
    if (!columnIndex.getNullPages().get(pageIndex)) {
      statsBuilder.withMin(columnIndex.getMinValues().get(pageIndex).array().clone());
      statsBuilder.withMax(columnIndex.getMaxValues().get(pageIndex).array().clone());
    }
    return statsBuilder.build();
  } else if (pageStatistics != null) {
    return converter.fromParquetStatistics(createdBy, pageStatistics, type);
  } else {
    return null;
  }
}
Example #6
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Example #7
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
private static void serializeBloomFilters(
    List<Map<String, BloomFilter>> bloomFilters,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: bloom filters", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    Map<String, BloomFilter> blockBloomFilters = bloomFilters.get(bIndex);
    if (blockBloomFilters.isEmpty()) continue;
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      BloomFilter bloomFilter = blockBloomFilters.get(column.getPath().toDotString());
      if (bloomFilter == null) {
        continue;
      }
      long offset = out.getPos();
      column.setBloomFilterOffset(offset);
      Util.writeBloomFilterHeader(ParquetMetadataConverter.toBloomFilterHeader(bloomFilter), out);
      bloomFilter.writeTo(out);
    }
  }
}
Example #8
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
private static void serializeOffsetIndexes(
    List<List<OffsetIndex>> offsetIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: offset indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<OffsetIndex> blockOffsetIndexes = offsetIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      OffsetIndex offsetIndex = blockOffsetIndexes.get(cIndex);
      if (offsetIndex == null) {
        continue;
      }
      ColumnChunkMetaData column = columns.get(cIndex);
      long offset = out.getPos();
      Util.writeOffsetIndex(ParquetMetadataConverter.toParquetOffsetIndex(offsetIndex), out);
      column.setOffsetIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
Example #9
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
private static void serializeColumnIndexes(
    List<List<ColumnIndex>> columnIndexes,
    List<BlockMetaData> blocks,
    PositionOutputStream out) throws IOException {
  LOG.debug("{}: column indexes", out.getPos());
  for (int bIndex = 0, bSize = blocks.size(); bIndex < bSize; ++bIndex) {
    List<ColumnChunkMetaData> columns = blocks.get(bIndex).getColumns();
    List<ColumnIndex> blockColumnIndexes = columnIndexes.get(bIndex);
    for (int cIndex = 0, cSize = columns.size(); cIndex < cSize; ++cIndex) {
      ColumnChunkMetaData column = columns.get(cIndex);
      org.apache.parquet.format.ColumnIndex columnIndex = ParquetMetadataConverter
          .toParquetColumnIndex(column.getPrimitiveType(), blockColumnIndexes.get(cIndex));
      if (columnIndex == null) {
        continue;
      }
      long offset = out.getPos();
      Util.writeColumnIndex(columnIndex, out);
      column.setColumnIndexReference(new IndexReference(offset, (int) (out.getPos() - offset)));
    }
  }
}
Example #10
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * FOR TESTING ONLY. This supports testing block padding behavior on the local FS.
 *
 * @param configuration Hadoop configuration
 * @param schema the schema of the data
 * @param file the file to write to
 * @param rowAndBlockSize the row group size
 * @param maxPaddingSize the maximum padding
 * @throws IOException if the file can not be created
 */
ParquetFileWriter(Configuration configuration, MessageType schema,
                  Path file, long rowAndBlockSize, int maxPaddingSize)
    throws IOException {
  FileSystem fs = file.getFileSystem(configuration);
  this.schema = schema;
  this.alignment = PaddingAlignment.get(
      rowAndBlockSize, rowAndBlockSize, maxPaddingSize);
  this.out = HadoopStreams.wrap(
      fs.create(file, true, 8192, fs.getDefaultReplication(file), rowAndBlockSize));
  this.encodingStatsBuilder = new EncodingStats.Builder();
  // no truncation is needed for testing
  this.columnIndexTruncateLength = Integer.MAX_VALUE;
  this.pageWriteChecksumEnabled = ParquetOutputFormat.getPageWriteChecksumEnabled(configuration);
  this.crc = pageWriteChecksumEnabled ? new CRC32() : null;
  this.metadataConverter = new ParquetMetadataConverter(ParquetProperties.DEFAULT_STATISTICS_TRUNCATE_LENGTH);
}
Example #11
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // First element in collection is default `root` element. We skip it to maintain key in `a` format instead of `root`.`a`,
  // and thus to avoid the need to cut it out again when comparing with SchemaPath string representation
  if (iter.hasNext()) {
    iter.next();
  }

  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
Example #12
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * @param conf the Hadoop Configuration
 * @param file Path to a parquet file
 * @param footer a {@link ParquetMetadata} footer already read from the file
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(Configuration conf, Path file, ParquetMetadata footer) throws IOException {
  this.converter = new ParquetMetadataConverter(conf);
  this.file = HadoopInputFile.fromPath(file, conf);
  this.f = this.file.newStream();
  this.options = HadoopReadOptions.builder(conf).build();
  this.footer = footer;
  this.fileMetaData = footer.getFileMetaData();
  this.blocks = filterRowGroups(footer.getBlocks());
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : footer.getFileMetaData().getSchema().getColumns()) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #13
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #14
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testPruneNestedParentColumn() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove parent column. All of it's children will be removed.
  String cargs[] = {inputFile, outputFile, "Links"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 3);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Name");
  assertEquals(fields.get(2).getName(), "Gender");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Links");
  validateColumns(inputFile, prunePaths);
}
Example #15
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0 | 6 votes |
@Test
public void testPruneMultiColumns() throws Exception {
  // Create Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String cargs[] = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema are not changed for the columns not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data are not changed for the columns not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
Example #16
Source File: PrimitiveColumnWriter.java From presto with Apache License 2.0 | 6 votes |
private ColumnMetaData getColumnMetaData() {
  checkState(getDataStreamsCalled);

  ColumnMetaData columnMetaData = new ColumnMetaData(
      ParquetTypeConverter.getType(columnDescriptor.getPrimitiveType().getPrimitiveTypeName()),
      encodings.stream().map(parquetMetadataConverter::getEncoding).collect(toImmutableList()),
      ImmutableList.copyOf(columnDescriptor.getPath()),
      compressionCodec.getParquetCompressionCodec(),
      totalRows,
      totalUnCompressedSize,
      totalCompressedSize,
      -1);
  columnMetaData.setStatistics(ParquetMetadataConverter.toParquetStatistics(columnStatistics));
  return columnMetaData;
}
Example #17
Source File: TestParquetReader.java From dremio-oss with Apache License 2.0 | 6 votes |
@Test
public void testArrowSchemaOldInFooter() throws Exception {
  URL badparquet = getClass().getResource("/types.parquet");

  Path filePathBad = Path.of(badparquet.toURI());
  ParquetMetadata parquetMetadataBad = SingletonParquetFooterCache.readFooter(localFs, filePathBad,
      ParquetMetadataConverter.NO_FILTER, ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadataBad = parquetMetadataBad.getFileMetaData().getKeyValueMetaData();

  // should have DREMIO_ARROW_SCHEMA field, but no DREMIO_ARROW_SCHEMA_2_1
  assertTrue(metadataBad.containsKey(DREMIO_ARROW_SCHEMA));
  assertFalse(metadataBad.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  try {
    DremioArrowSchema.fromMetaData(metadataBad);
    fail("Should not be able to process arrow schema");
  } catch (Exception e) {
    // ok
  }
}
Example #18
Source File: TestParquetReader.java From dremio-oss with Apache License 2.0 | 6 votes |
@Test
public void testArrowSchema210InFooter() throws Exception {
  URL parquet210 = getClass().getResource("/dremio-region-210.parquet");
  Path filePath210 = Path.of(parquet210.toURI());
  ParquetMetadata parquetMetadata210 = SingletonParquetFooterCache.readFooter(localFs, filePath210,
      ParquetMetadataConverter.NO_FILTER, ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadata210 = parquetMetadata210.getFileMetaData().getKeyValueMetaData();

  // should not have DREMIO_ARROW_SCHEMA field, but should have DREMIO_ARROW_SCHEMA_2_1
  assertFalse(metadata210.containsKey(DREMIO_ARROW_SCHEMA));
  assertTrue(metadata210.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  Schema schema210 = DremioArrowSchema.fromMetaData(metadata210);

  assertNotNull(schema210);
}
Example #19
Source File: ParquetResolverTest.java From pxf with Apache License 2.0 | 6 votes |
@SuppressWarnings("deprecation") private List<Group> readParquetFile(String file, long expectedSize, MessageType schema) throws IOException { List<Group> result = new ArrayList<>(); String parquetFile = Objects.requireNonNull(getClass().getClassLoader().getResource("parquet/" + file)).getPath(); Path path = new Path(parquetFile); ParquetFileReader fileReader = new ParquetFileReader(new Configuration(), path, ParquetMetadataConverter.NO_FILTER); PageReadStore rowGroup; while ((rowGroup = fileReader.readNextRowGroup()) != null) { MessageColumnIO columnIO = new ColumnIOFactory().getColumnIO(schema); RecordReader<Group> recordReader = columnIO.getRecordReader(rowGroup, new GroupRecordConverter(schema)); long rowCount = rowGroup.getRowCount(); for (long i = 0; i < rowCount; i++) { result.add(recordReader.read()); } } fileReader.close(); assertEquals(expectedSize, result.size()); return result; }
Example #20
Source File: TestParquetReader.java From dremio-oss with Apache License 2.0 | 6 votes |
@Test
public void testArrowSchema205InFooter() throws Exception {
  URL parquet205 = getClass().getResource("/dremio-region-205.parquet");
  Path filePath = Path.of(parquet205.toURI());
  ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(localFs, filePath,
      ParquetMetadataConverter.NO_FILTER, ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  Map<String, String> metadata = parquetMetadata.getFileMetaData().getKeyValueMetaData();

  // should have DREMIO_ARROW_SCHEMA field, but no DREMIO_ARROW_SCHEMA_2_1
  assertTrue(metadata.containsKey(DREMIO_ARROW_SCHEMA));
  assertFalse(metadata.containsKey(DREMIO_ARROW_SCHEMA_2_1));

  Schema schema = DremioArrowSchema.fromMetaData(metadata);

  assertNotNull(schema);
}
Example #21
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0 | 5 votes |
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  long footerIndex = out.getPos();
  ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(parquetMetadata, out);
  LOG.debug("{}: footer length = {}", out.getPos(), (out.getPos() - footerIndex));
  BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
  out.write(MAGIC);
}
Example #22
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0 | 5 votes |
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {

  final long then = System.nanoTime();
  ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
      fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
  ParquetReadOptions parquetReadOptions = HadoopReadOptions
      .builder(configuration)
      .withMetadataFilter(filter)
      .build();
  HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
  try (ParquetFileReader parquetFileReader =
           ParquetFileReader.open(inputFile, parquetReadOptions)) {
    FileMetaData metadata = parquetFileReader.getFileMetaData();
    if (LOG.isDebugEnabled()) {
      LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
          context.getTransactionId(), context.getSegmentId(), parquetFile.getName(),
          parquetFileReader.getRecordCount(), parquetFileReader.getRowGroups().size());
    }
    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
    LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
        context.getSegmentId(), millis);
    return metadata.getSchema();
  } catch (Exception e) {
    throw new IOException(e);
  }
}
Example #23
Source File: ColumnSizeCommand.java From parquet-mr with Apache License 2.0 | 5 votes |
public Map<String, Long> getColumnSizeInBytes(Path inputFile) throws IOException {
  Map<String, Long> colSizes = new HashMap<>();
  ParquetMetadata pmd = ParquetFileReader.readFooter(new Configuration(), inputFile, ParquetMetadataConverter.NO_FILTER);

  for (BlockMetaData block : pmd.getBlocks()) {
    for (ColumnChunkMetaData column : block.getColumns()) {
      String colName = column.getPath().toDotString();
      colSizes.put(colName, column.getTotalSize() + colSizes.getOrDefault(colName, 0L));
    }
  }

  return colSizes;
}
Example #24
Source File: ParquetMetadataCommand.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() >= 1,
      "A Parquet file is required.");
  Preconditions.checkArgument(targets.size() == 1,
      "Cannot process multiple Parquet files.");

  String source = targets.get(0);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), qualifiedPath(source), ParquetMetadataConverter.NO_FILTER);

  console.info("\nFile path: {}", source);
  console.info("Created by: {}", footer.getFileMetaData().getCreatedBy());

  Map<String, String> kv = footer.getFileMetaData().getKeyValueMetaData();
  if (kv != null && !kv.isEmpty()) {
    console.info("Properties:");
    String format = " %" + maxSize(kv.keySet()) + "s: %s";
    for (Map.Entry<String, String> entry : kv.entrySet()) {
      console.info(String.format(format, entry.getKey(), entry.getValue()));
    }
  } else {
    console.info("Properties: (none)");
  }

  MessageType schema = footer.getFileMetaData().getSchema();
  console.info("Schema:\n{}", schema);

  List<BlockMetaData> rowGroups = footer.getBlocks();
  for (int index = 0, n = rowGroups.size(); index < n; index += 1) {
    printRowGroup(console, index, rowGroups.get(index), schema);
  }

  console.info("");

  return 0;
}
Example #25
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0 | 5 votes |
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
Example #26
Source File: FileEncodingsIT.java From parquet-mr with Apache License 2.0 | 5 votes |
private static List<PageReadStore> readBlocksFromFile(Path file) throws IOException {
  List<PageReadStore> rowGroups = new ArrayList<PageReadStore>();

  ParquetMetadata metadata = ParquetFileReader.readFooter(configuration, file,
      ParquetMetadataConverter.NO_FILTER);
  ParquetFileReader fileReader = new ParquetFileReader(configuration, metadata.getFileMetaData(), file,
      metadata.getBlocks(), metadata.getFileMetaData().getSchema().getColumns());

  PageReadStore group;
  while ((group = fileReader.readNextRowGroup()) != null) {
    rowGroups.add(group);
  }

  return rowGroups;
}
Example #27
Source File: TestMergeMetadataFiles.java From parquet-mr with Apache License 2.0 | 5 votes |
@Test
public void testMergeMetadataFiles() throws Exception {
  WrittenFileInfo info = writeFiles(false);

  ParquetMetadata commonMeta1 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata commonMeta2 = ParquetFileReader.readFooter(info.conf, info.commonMetaPath2, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta1 = ParquetFileReader.readFooter(info.conf, info.metaPath1, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata meta2 = ParquetFileReader.readFooter(info.conf, info.metaPath2, ParquetMetadataConverter.NO_FILTER);

  assertTrue(commonMeta1.getBlocks().isEmpty());
  assertTrue(commonMeta2.getBlocks().isEmpty());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), commonMeta2.getFileMetaData().getSchema());

  assertFalse(meta1.getBlocks().isEmpty());
  assertFalse(meta2.getBlocks().isEmpty());
  assertEquals(meta1.getFileMetaData().getSchema(), meta2.getFileMetaData().getSchema());

  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), commonMeta2.getFileMetaData().getKeyValueMetaData());
  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), meta2.getFileMetaData().getKeyValueMetaData());

  // test file serialization
  Path mergedOut = new Path(new File(temp.getRoot(), "merged_meta").getAbsolutePath());
  Path mergedCommonOut = new Path(new File(temp.getRoot(), "merged_common_meta").getAbsolutePath());
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.metaPath1, info.metaPath2), mergedOut, info.conf);
  ParquetFileWriter.writeMergedMetadataFile(Arrays.asList(info.commonMetaPath1, info.commonMetaPath2), mergedCommonOut, info.conf);

  ParquetMetadata mergedMeta = ParquetFileReader.readFooter(info.conf, mergedOut, ParquetMetadataConverter.NO_FILTER);
  ParquetMetadata mergedCommonMeta = ParquetFileReader.readFooter(info.conf, mergedCommonOut, ParquetMetadataConverter.NO_FILTER);

  // ideally we'd assert equality here, but BlockMetaData and it's references don't implement equals
  assertEquals(meta1.getBlocks().size() + meta2.getBlocks().size(), mergedMeta.getBlocks().size());
  assertTrue(mergedCommonMeta.getBlocks().isEmpty());

  assertEquals(meta1.getFileMetaData().getSchema(), mergedMeta.getFileMetaData().getSchema());
  assertEquals(commonMeta1.getFileMetaData().getSchema(), mergedCommonMeta.getFileMetaData().getSchema());

  assertEquals(meta1.getFileMetaData().getKeyValueMetaData(), mergedMeta.getFileMetaData().getKeyValueMetaData());
  assertEquals(commonMeta1.getFileMetaData().getKeyValueMetaData(), mergedCommonMeta.getFileMetaData().getKeyValueMetaData());
}
Example #28
Source File: ParquetReadOptions.java From parquet-mr with Apache License 2.0 | 5 votes |
ParquetReadOptions(boolean useSignedStringMinMax,
                   boolean useStatsFilter,
                   boolean useDictionaryFilter,
                   boolean useRecordFilter,
                   boolean useColumnIndexFilter,
                   boolean usePageChecksumVerification,
                   boolean useBloomFilter,
                   FilterCompat.Filter recordFilter,
                   ParquetMetadataConverter.MetadataFilter metadataFilter,
                   CompressionCodecFactory codecFactory,
                   ByteBufferAllocator allocator,
                   int maxAllocationSize,
                   Map<String, String> properties) {
  this.useSignedStringMinMax = useSignedStringMinMax;
  this.useStatsFilter = useStatsFilter;
  this.useDictionaryFilter = useDictionaryFilter;
  this.useRecordFilter = useRecordFilter;
  this.useColumnIndexFilter = useColumnIndexFilter;
  this.usePageChecksumVerification = usePageChecksumVerification;
  this.useBloomFilter = useBloomFilter;
  this.recordFilter = recordFilter;
  this.metadataFilter = metadataFilter;
  this.codecFactory = codecFactory;
  this.allocator = allocator;
  this.maxAllocationSize = maxAllocationSize;
  this.properties = Collections.unmodifiableMap(properties);
}
Example #29
Source File: LocalDictionariesReader.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Return dictionary per row group for all binary columns in given parquet file.
 * @param fs filesystem object.
 * @param filePath parquet file to scan
 * @return pair of dictionaries found for binary fields and list of binary fields which are not dictionary encoded.
 * @throws IOException
 */
public static Pair<Map<ColumnDescriptor, Dictionary>, Set<ColumnDescriptor>> readDictionaries(FileSystem fs, Path filePath, CompressionCodecFactory codecFactory) throws IOException {
  // Passing the max footer length is not required in this case as the parquet reader would already have failed.
  final ParquetMetadata parquetMetadata = SingletonParquetFooterCache.readFooter(fs, filePath, ParquetMetadataConverter.NO_FILTER,
      ExecConstants.PARQUET_MAX_FOOTER_LEN_VALIDATOR.getDefault().getNumVal());
  if (parquetMetadata.getBlocks().size() > 1) {
    throw new IOException(
        format("Global dictionaries can only be built on a parquet file with a single row group, found %d row groups for file %s",
            parquetMetadata.getBlocks().size(), filePath));
  }
  final BlockMetaData rowGroupMetadata = parquetMetadata.getBlocks().get(0);
  final Map<ColumnPath, ColumnDescriptor> columnDescriptorMap = Maps.newHashMap();

  for (ColumnDescriptor columnDescriptor : parquetMetadata.getFileMetaData().getSchema().getColumns()) {
    columnDescriptorMap.put(ColumnPath.get(columnDescriptor.getPath()), columnDescriptor);
  }

  final Set<ColumnDescriptor> columnsToSkip = Sets.newHashSet(); // columns which are found in parquet file but are not dictionary encoded
  final Map<ColumnDescriptor, Dictionary> dictionaries = Maps.newHashMap();
  try (final FSInputStream in = fs.open(filePath)) {
    for (ColumnChunkMetaData columnChunkMetaData : rowGroupMetadata.getColumns()) {
      if (isBinaryType(columnChunkMetaData.getType())) {
        final ColumnDescriptor column = columnDescriptorMap.get(columnChunkMetaData.getPath());
        // if first page is dictionary encoded then load dictionary, otherwise skip this column.
        final PageHeaderWithOffset pageHeader = columnChunkMetaData.getPageHeaders().get(0);
        if (PageType.DICTIONARY_PAGE == pageHeader.getPageHeader().getType()) {
          dictionaries.put(column, readDictionary(in, column, pageHeader, codecFactory.getDecompressor(columnChunkMetaData.getCodec())));
        } else {
          columnsToSkip.add(column);
        }
      }
    }
  }
  return new ImmutablePair<>(dictionaries, columnsToSkip);
}
Example #30
Source File: TableSchemaResolver.java From hudi with Apache License 2.0 | 5 votes |
/**
 * Read the parquet schema from a parquet File.
 */
public MessageType readSchemaFromBaseFile(Path parquetFilePath) throws IOException {
  LOG.info("Reading schema from " + parquetFilePath);
  FileSystem fs = metaClient.getRawFs();
  if (!fs.exists(parquetFilePath)) {
    throw new IllegalArgumentException(
        "Failed to read schema from data file " + parquetFilePath + ". File does not exist.");
  }
  ParquetMetadata fileFooter =
      ParquetFileReader.readFooter(fs.getConf(), parquetFilePath, ParquetMetadataConverter.NO_FILTER);
  return fileFooter.getFileMetaData().getSchema();
}