org.apache.parquet.hadoop.metadata.BlockMetaData#getTotalByteSize

Source File: MetadataUtils.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a one-line summary of a row group, a horizontal rule, and then the details of the
 * row group's columns.
 *
 * <p>The summary line carries the values of {@code getRowCount()} (label {@code RC}),
 * {@code getTotalByteSize()} (label {@code TS}) and {@code getStartingPos()} (label
 * {@code OFFSET}).
 *
 * @param out  writer receiving the formatted output
 * @param meta row group metadata to describe
 * @param num  number appended after "row group", or {@code null} to print no number
 */
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  long rowCount = meta.getRowCount();
  long totalBytes = meta.getTotalByteSize();
  long startPos = meta.getStartingPos();

  // The number is optional; when present it is separated from "row group" by one space.
  String label;
  if (num == null) {
    label = "";
  } else {
    label = " " + num;
  }

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", label, rowCount, totalBytes, startPos);
  out.rule('-');
  showDetails(out, meta.getColumns());
}

Source File: MetadataUtils.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Prints the header line for one row group ({@code RC} = row count, {@code TS} = total byte
 * size, {@code OFFSET} = starting position, each read from the matching getter on
 * {@code meta}), draws a rule of {@code '-'}, and delegates to the column-list overload of
 * {@code showDetails}.
 *
 * @param out  destination writer
 * @param meta metadata of the row group being printed
 * @param num  optional row group number; {@code null} omits it from the header
 */
private static void showDetails(PrettyPrintWriter out, BlockMetaData meta, Long num) {
  final long rc = meta.getRowCount();
  final long ts = meta.getTotalByteSize();
  final long startingPos = meta.getStartingPos();
  final String suffix = (num != null) ? " " + num : "";

  out.format("row group%s: RC:%d TS:%d OFFSET:%d%n", suffix, rc, ts, startingPos);
  out.rule('-');
  showDetails(out, meta.getColumns());
}

Source File: SizeCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Prints the summed row-group size of every file matched by the first positional argument.
 *
 * <p>For each matched file, the row groups listed in its footer(s) are summed. With option
 * {@code u} the per-row-group value is {@code getTotalByteSize()}, otherwise
 * {@code getCompressedSize()}. Option {@code d} additionally prints one line per matched
 * file, and option {@code p} renders sizes through {@code getPrettySize} instead of as a raw
 * byte count. A final "Total Size" line is always printed.
 *
 * @param options parsed command line; its first positional argument is the input path or glob
 * @throws java.io.FileNotFoundException if the file system reports no match at all
 *         ({@code globStatus} returned {@code null})
 * @throws Exception anything raised by the superclass, the file system or the footer reader
 */
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);

  String[] args = options.getArgs();
  String input = args[0];
  out = new PrintWriter(Main.out, true);
  inputPath = new Path(input);
  conf = new Configuration();
  inputFileStatuses = inputPath.getFileSystem(conf).globStatus(inputPath);
  if (inputFileStatuses == null) {
    // globStatus may return null rather than an empty array (Hadoop documents this for a
    // non-glob path that does not exist); fail with a clear message instead of an NPE below.
    throw new java.io.FileNotFoundException("No such file or directory: " + input);
  }

  // The options do not change while iterating, so look them up once.
  final boolean uncompressed = options.hasOption('u');
  final boolean detailed = options.hasOption('d');
  final boolean pretty = options.hasOption('p');

  long size = 0;
  for (FileStatus fs : inputFileStatuses) {
    long fileSize = 0;
    for (Footer f : ParquetFileReader.readFooters(conf, fs, false)) {
      for (BlockMetaData b : f.getParquetMetadata().getBlocks()) {
        fileSize += uncompressed ? b.getTotalByteSize() : b.getCompressedSize();
      }
    }
    // The grand total is the sum of the per-file subtotals.
    size += fileSize;

    if (detailed) {
      if (pretty) {
        out.format("%s: %s\n", fs.getPath().getName(), getPrettySize(fileSize));
      } else {
        out.format("%s: %d bytes\n", fs.getPath().getName(), fileSize);
      }
    }
  }

  if (pretty) {
    out.format("Total Size: %s", getPrettySize(size));
  } else {
    out.format("Total Size: %d bytes", size);
  }
  out.println();
}

Source File: ParquetInputFormat.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a {@code ParquetInputSplit} for the row groups held by this object.
 *
 * <p>The split's length is the sum of {@code getTotalSize()} over every column chunk whose
 * path is contained in the requested schema. Its end is the last row group's starting
 * position plus that row group's {@code getTotalByteSize()}. The start offset and hosts come
 * from {@code hdfsBlock}, which is defined outside this method.
 *
 * @param fileStatus          status of the file being split; only its path is used
 * @param requestedSchema     textual message type, parsed with {@code MessageTypeParser}
 * @param readSupportMetadata not read by this method
 * @return the split describing this group of row groups
 * @throws IOException declared by the signature
 */
public ParquetInputSplit getParquetInputSplit(FileStatus fileStatus, String requestedSchema, Map<String, String> readSupportMetadata) throws IOException {
  final MessageType projection = MessageTypeParser.parseMessageType(requestedSchema);

  // Only column chunks selected by the projection count towards the split length.
  long projectedBytes = 0;
  for (BlockMetaData rowGroup : this.getRowGroups()) {
    for (ColumnChunkMetaData chunk : rowGroup.getColumns()) {
      if (!projection.containsPath(chunk.getPath().toArray())) {
        continue;
      }
      projectedBytes += chunk.getTotalSize();
    }
  }

  final BlockMetaData tail = this.getRowGroups().get(this.getRowGroupCount() - 1);
  final long splitEnd = tail.getStartingPos() + tail.getTotalByteSize();

  // Record where each row group starts, in list order.
  final long[] startOffsets = new long[this.getRowGroupCount()];
  int slot = 0;
  while (slot < startOffsets.length) {
    startOffsets[slot] = this.getRowGroups().get(slot).getStartingPos();
    slot++;
  }

  return new ParquetInputSplit(
          fileStatus.getPath(),
          hdfsBlock.getOffset(),
          splitEnd,
          projectedBytes,
          hdfsBlock.getHosts(),
          startOffsets
  );
}

Source File: TestParquetWriterAppendBlocks.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Appends {@code file1} and {@code file2} into a single new file and checks that the merged
 * footer lists the inputs' row groups in order: same count, same row count, same compressed
 * and total sizes, equivalent columns, and each row group starting where the previous one
 * is computed to end.
 */
@Test
public void testMergedMetadata() throws IOException {
  Path merged = newTemp();
  ParquetFileWriter appender = new ParquetFileWriter(CONF, FILE_SCHEMA, merged);
  appender.start();
  appender.appendFile(CONF, file1);
  appender.appendFile(CONF, file2);
  appender.end(EMPTY_METADATA);

  ParquetMetadata mergedFooter = ParquetFileReader.readFooter(CONF, merged, NO_FILTER);
  ParquetMetadata firstFooter = ParquetFileReader.readFooter(CONF, file1, NO_FILTER);
  ParquetMetadata secondFooter = ParquetFileReader.readFooter(CONF, file2, NO_FILTER);

  // Row groups of file1 followed by those of file2, consumed front to back below.
  LinkedList<BlockMetaData> remaining = new LinkedList<BlockMetaData>(firstFooter.getBlocks());
  remaining.addAll(secondFooter.getBlocks());

  Assert.assertEquals("Combined should have the right number of row groups",
      remaining.size(),
      mergedFooter.getBlocks().size());

  // First row group is expected at offset 4 (presumably right after the file's magic bytes).
  long expectedStart = 4;
  for (BlockMetaData actual : mergedFooter.getBlocks()) {
    BlockMetaData wanted = remaining.removeFirst();

    Assert.assertEquals("Row count should match",
        wanted.getRowCount(), actual.getRowCount());
    Assert.assertEquals("Compressed size should match",
        wanted.getCompressedSize(), actual.getCompressedSize());
    Assert.assertEquals("Total size should match",
        wanted.getTotalByteSize(), actual.getTotalByteSize());
    Assert.assertEquals("Start pos should be at the last row group's end",
        expectedStart, actual.getStartingPos());
    assertColumnsEquivalent(wanted.getColumns(), actual.getColumns());

    // NOTE(review): the end position is derived from getTotalByteSize(); this presumably
    // relies on the input files being written uncompressed - confirm against the fixtures.
    expectedStart = actual.getStartingPos() + actual.getTotalByteSize();
  }
}

Source File: ParquetMetadataCommand.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Logs a header for one row group, a column-heading line, and then one line per column chunk.
 *
 * <p>The header shows the index, the row count, the compressed size divided by the row count,
 * the starting position, the compressed size and, when the row group has a path, that path.
 * The division is done in {@code float}, so a row count of zero does not throw.
 *
 * @param console  logger that receives every line
 * @param index    number printed after "Row group"
 * @param rowGroup metadata of the row group to print
 * @param schema   passed through to {@code printColumnChunk} for each column chunk
 */
private void printRowGroup(Logger console, int index, BlockMetaData rowGroup, MessageType schema) {
  long start = rowGroup.getStartingPos();
  long rowCount = rowGroup.getRowCount();
  long compressedSize = rowGroup.getCompressedSize();
  String filePath = rowGroup.getPath();

  console.info(String.format("\nRow group %d:  count: %d  %s records  start: %d  total: %s%s\n%s",
      index, rowCount,
      humanReadable(((float) compressedSize) / rowCount),
      start, humanReadable(compressedSize),
      filePath != null ? " path: " + filePath : "",
      new TextStringBuilder(80).appendPadding(80, '-')));

  // Width of the first table column, derived from the dotted column paths via maxSize.
  int size = maxSize(Iterables.transform(rowGroup.getColumns(),
      new Function<ColumnChunkMetaData, String>() {
        @Override
        public String apply(@Nullable ColumnChunkMetaData input) {
          return input == null ? "" : input.getPath().toDotString();
        }
      }));

  console.info(String.format("%-" + size + "s  %-9s %-9s %-9s %-10s %-7s %s",
      "", "type", "encodings", "count", "avg size", "nulls", "min / max"));
  for (ColumnChunkMetaData column : rowGroup.getColumns()) {
    printColumnChunk(console, size, column, schema);
  }
}

Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Converts one {@code BlockMetaData} into a format-level {@code RowGroup} and appends it to
 * {@code rowGroups}.
 *
 * <p>Each column chunk of the block becomes a {@code ColumnChunk} carrying a
 * {@code ColumnMetaData}; optional pieces (dictionary page offset, statistics, encoding
 * stats, column/offset index references) are copied only when the source provides them.
 * The fields {@code index_page_offset} and {@code key_value_metadata} are not populated.
 *
 * @param parquetMetadata not read by this method
 * @param rowGroups       list that receives the converted row group
 * @param block           source row group metadata
 */
private void addRowGroup(ParquetMetadata parquetMetadata, List<RowGroup> rowGroups, BlockMetaData block) {
  List<ColumnChunk> formatChunks = new ArrayList<ColumnChunk>();

  for (ColumnChunkMetaData source : block.getColumns()) {
    // NOTE(review): the original author marked the choice of this offset as still to verify.
    ColumnChunk chunk = new ColumnChunk(source.getFirstDataPageOffset());
    // Every chunk is given the block's path; chunks are in the same file for now.
    chunk.file_path = block.getPath();

    ColumnMetaData chunkMeta = new ColumnMetaData(
        getType(source.getType()),
        toFormatEncodings(source.getEncodings()),
        Arrays.asList(source.getPath().toArray()),
        toFormatCodec(source.getCodec()),
        source.getValueCount(),
        source.getTotalUncompressedSize(),
        source.getTotalSize(),
        source.getFirstDataPageOffset());
    chunk.meta_data = chunkMeta;

    // The dictionary page offset is written only when encoding stats report dictionary pages.
    boolean hasEncodingStats = source.getEncodingStats() != null;
    if (hasEncodingStats && source.getEncodingStats().hasDictionaryPages()) {
      chunkMeta.setDictionary_page_offset(source.getDictionaryPageOffset());
    }
    chunkMeta.setBloom_filter_offset(source.getBloomFilterOffset());
    if (!source.getStatistics().isEmpty()) {
      chunkMeta.setStatistics(toParquetStatistics(source.getStatistics(), this.statisticsTruncateLength));
    }
    if (source.getEncodingStats() != null) {
      chunkMeta.setEncoding_stats(convertEncodingStats(source.getEncodingStats()));
    }

    // Index references are optional; copy offset and length when present.
    IndexReference columnIndex = source.getColumnIndexReference();
    if (columnIndex != null) {
      chunk.setColumn_index_offset(columnIndex.getOffset());
      chunk.setColumn_index_length(columnIndex.getLength());
    }
    IndexReference offsetIndex = source.getOffsetIndexReference();
    if (offsetIndex != null) {
      chunk.setOffset_index_offset(offsetIndex.getOffset());
      chunk.setOffset_index_length(offsetIndex.getLength());
    }

    formatChunks.add(chunk);
  }

  rowGroups.add(new RowGroup(formatChunks, block.getTotalByteSize(), block.getRowCount()));
}

Java Code Examples for org.apache.parquet.hadoop.metadata.BlockMetaData#getTotalByteSize()