org.apache.parquet.hadoop.metadata.FileMetaData Java Examples
The following examples show how to use org.apache.parquet.hadoop.metadata.FileMetaData.
Each snippet is taken from an open-source project; the source file, project, and license are noted above each example.
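FileMetaData carries the file-level footer information of a Parquet file: the message schema, the application-supplied key/value metadata, and the created-by string of the writer that produced the file. Before the project-sourced examples, here is a minimal sketch of reading it from an existing file (the class name and path argument are illustrative; it assumes a Hadoop-accessible Parquet file):

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.hadoop.util.HadoopInputFile;

public class ShowFileMetaData {
  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Path path = new Path(args[0]);  // e.g. file:///tmp/data.parquet

    // open the file, parse the footer, and print the file-level metadata
    try (ParquetFileReader reader =
             ParquetFileReader.open(HadoopInputFile.fromPath(path, conf))) {
      FileMetaData meta = reader.getFooter().getFileMetaData();
      System.out.println("created by: " + meta.getCreatedBy());
      System.out.println("schema:     " + meta.getSchema());
      System.out.println("extra:      " + meta.getKeyValueMetaData());
    }
  }
}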
Example #1
Source File: HiveProtoParquetWriterWithOffsetTest.java From garmadon with Apache License 2.0
@Before
public void setup() throws IOException {
  protoParquetWriterWithOffset = mock(ProtoParquetWriterWithOffset.class);
  hiveClient = mock(HiveClient.class);

  when(protoParquetWriterWithOffset.getEventName()).thenReturn(eventName);
  when(protoParquetWriterWithOffset.getFinalHdfsDir()).thenReturn(finalPath);

  ProtoParquetWriter<Message> writerMock = mock(ProtoParquetWriter.class);
  when(protoParquetWriterWithOffset.getWriter()).thenReturn(writerMock);

  ParquetMetadata parquetMetadata = mock(ParquetMetadata.class);
  when(writerMock.getFooter()).thenReturn(parquetMetadata);

  PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL, PrimitiveType.PrimitiveTypeName.BINARY, "app_id");
  schema = new MessageType("fs", appId);
  FileMetaData fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "test");
  when(parquetMetadata.getFileMetaData()).thenReturn(fileMetaData);

  when(protoParquetWriterWithOffset.getDayStartTime()).thenReturn(LocalDateTime.of(2019, 9, 10, 10, 10, 10));
}
Example #2
Source File: ParquetHdfsFileSink.java From components with Apache License 2.0
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
  FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
  List<Path> sourceFiles = new ArrayList<>();
  for (FileStatus sourceStatus : sourceStatuses) {
    sourceFiles.add(sourceStatus.getPath());
  }
  FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
  ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(),
      new Path(targetFile), ParquetFileWriter.Mode.CREATE);
  writer.start();
  for (Path input : sourceFiles) {
    writer.appendFile(fs.getConf(), input);
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
Example #3
Source File: InternalParquetRecordReader.java From tajo with Apache License 2.0
public void initialize(FileMetaData parquetFileMetadata,
                       Path file, List<BlockMetaData> blocks, Configuration configuration)
    throws IOException {
  // initialize a ReadContext for this file
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.fileSchema = parquetFileMetadata.getSchema();
  this.file = file;
  this.columnCount = requestedSchema.getPaths().size();
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  List<ColumnDescriptor> columns = requestedSchema.getColumns();
  reader = new ParquetFileReader(configuration, parquetFileMetadata, file, blocks, columns);
  for (BlockMetaData block : blocks) {
    total += block.getRowCount();
  }
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  LOG.info("RecordReader initialized will read a total of " + total + " records.");
}
Example #4
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0
public static void showDetails(PrettyPrintWriter out, FileMetaData meta) {
  out.format("creator: %s%n", meta.getCreatedBy());

  Map<String,String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String,String> entry : meta.getKeyValueMetaData().entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }

  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema());
}
Example #5
Source File: MetadataUtils.java From parquet-mr with Apache License 2.0
static void showDetails(PrettyPrintWriter out, FileMetaData meta, boolean showOriginalTypes) {
  out.format("creator: %s%n", meta.getCreatedBy());

  Map<String,String> extra = meta.getKeyValueMetaData();
  if (extra != null) {
    for (Map.Entry<String,String> entry : meta.getKeyValueMetaData().entrySet()) {
      out.print("extra: ");
      out.incrementTabLevel();
      out.format("%s = %s%n", entry.getKey(), entry.getValue());
      out.decrementTabLevel();
    }
  }

  out.println();
  out.format("file schema: %s%n", meta.getSchema().getName());
  out.rule('-');
  showDetails(out, meta.getSchema(), showOriginalTypes);
}
Example #6
Source File: ParquetRecordReader.java From parquet-mr with Apache License 2.0
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
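For context: PARQUET-246 tracked a writer bug in early parquet-mr releases where the DELTA_BYTE_ARRAY encoder did not reset its state between pages, so a page could only be decoded after the pages before it in the same column chunk. Reading such a chunk from the middle of a file split could silently produce corrupt values, which is why the check above (keyed off the footer's created-by string) refuses to split affected files.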
Example #7
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * Given a list of metadata files, merge them into a single ParquetMetadata
 * Requires that the schemas be compatible, and the extraMetadata be exactly equal.
 * @param files a list of files to merge metadata from
 * @param conf a configuration
 * @return merged parquet metadata for the files
 * @throws IOException if there is an error while writing
 * @deprecated metadata files are not recommended and will be removed in 2.0.0
 */
@Deprecated
public static ParquetMetadata mergeMetadataFiles(List<Path> files, Configuration conf) throws IOException {
  Preconditions.checkArgument(!files.isEmpty(), "Cannot merge an empty list of metadata");

  GlobalMetaData globalMetaData = null;
  List<BlockMetaData> blocks = new ArrayList<BlockMetaData>();

  for (Path p : files) {
    ParquetMetadata pmd = ParquetFileReader.readFooter(conf, p, ParquetMetadataConverter.NO_FILTER);
    FileMetaData fmd = pmd.getFileMetaData();
    globalMetaData = mergeInto(fmd, globalMetaData, true);
    blocks.addAll(pmd.getBlocks());
  }

  // collapse GlobalMetaData into a single FileMetaData, which will throw if they are not compatible
  return new ParquetMetadata(globalMetaData.merge(), blocks);
}
Example #8
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) throws IOException {
  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(
      configuration, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(configuration, total);
  this.filterRecords = configuration.getBoolean(RECORD_FILTERING_ENABLED, true);
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #9
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
/**
 * @param configuration the Hadoop conf
 * @param fileMetaData fileMetaData for parquet file
 * @param filePath Path for the parquet file
 * @param blocks the blocks to read
 * @param columns the columns to read (their path)
 * @throws IOException if the file can not be opened
 * @deprecated will be removed in 2.0.0.
 */
@Deprecated
public ParquetFileReader(
    Configuration configuration, FileMetaData fileMetaData,
    Path filePath, List<BlockMetaData> blocks, List<ColumnDescriptor> columns) throws IOException {
  this.converter = new ParquetMetadataConverter(configuration);
  this.file = HadoopInputFile.fromPath(filePath, configuration);
  this.fileMetaData = fileMetaData;
  this.f = file.newStream();
  this.options = HadoopReadOptions.builder(configuration).build();
  this.blocks = filterRowGroups(blocks);
  this.blockIndexStores = listWithNulls(this.blocks.size());
  this.blockRowRanges = listWithNulls(this.blocks.size());
  for (ColumnDescriptor col : columns) {
    paths.put(ColumnPath.get(col.getPath()), col);
  }
  this.crc = options.usePageChecksumVerification() ? new CRC32() : null;
}
Example #10
Source File: CheckParquet251Command.java From parquet-mr with Apache License 2.0
private String check(String file) throws IOException {
  Path path = qualifiedPath(file);
  ParquetMetadata footer = ParquetFileReader.readFooter(
      getConf(), path, ParquetMetadataConverter.NO_FILTER);

  FileMetaData meta = footer.getFileMetaData();
  String createdBy = meta.getCreatedBy();
  if (CorruptStatistics.shouldIgnoreStatistics(createdBy, BINARY)) {
    // create fake metadata that will read corrupt stats and return them
    FileMetaData fakeMeta = new FileMetaData(
        meta.getSchema(), meta.getKeyValueMetaData(), Version.FULL_VERSION);

    // get just the binary columns
    List<ColumnDescriptor> columns = Lists.newArrayList();
    Iterables.addAll(columns, Iterables.filter(
        meta.getSchema().getColumns(),
        new Predicate<ColumnDescriptor>() {
          @Override
          public boolean apply(@Nullable ColumnDescriptor input) {
            return input != null && input.getType() == BINARY;
          }
        }));

    // now check to see if the data is actually corrupt
    ParquetFileReader reader = new ParquetFileReader(getConf(),
        fakeMeta, path, footer.getBlocks(), columns);

    try {
      PageStatsValidator validator = new PageStatsValidator();
      for (PageReadStore pages = reader.readNextRowGroup(); pages != null;
           pages = reader.readNextRowGroup()) {
        validator.validate(columns, pages);
      }
    } catch (BadStatsException e) {
      return e.getMessage();
    }
  }

  return null;
}
Example #11
Source File: ParquetRecordReader.java From flink with Apache License 2.0
public void initialize(ParquetFileReader reader, Configuration configuration) {
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  // real schema of parquet file
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(
      configuration, toSetMultiMap(fileMetadata), readSchema));

  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.recordMaterializer = readSupport.prepareForRead(
      configuration, fileMetadata, readSchema, readContext);
  this.numTotalRecords = reader.getRecordCount();
}
Example #12
Source File: ParquetRecordReaderWrapper.java From parquet-mr with Apache License 2.0
/**
 * gets a ParquetInputSplit corresponding to a split given by Hive
 *
 * @param oldSplit The split given by Hive
 * @param conf The JobConf of the Hive job
 * @return a ParquetInputSplit corresponding to the oldSplit
 * @throws IOException if the config cannot be enhanced or if the footer cannot be read from the file
 */
protected ParquetInputSplit getSplit(
    final InputSplit oldSplit,
    final JobConf conf) throws IOException {
  if (oldSplit instanceof FileSplit) {
    FileSplit fileSplit = (FileSplit) oldSplit;
    final long splitStart = fileSplit.getStart();
    final long splitLength = fileSplit.getLength();
    final Path finalPath = fileSplit.getPath();
    final JobConf cloneJob = hiveBinding.pushProjectionsAndFilters(conf, finalPath.getParent());

    final ParquetMetadata parquetMetadata = ParquetFileReader.readFooter(cloneJob, finalPath, SKIP_ROW_GROUPS);
    final FileMetaData fileMetaData = parquetMetadata.getFileMetaData();

    final ReadContext readContext = new DataWritableReadSupport()
        .init(cloneJob, fileMetaData.getKeyValueMetaData(), fileMetaData.getSchema());
    schemaSize = MessageTypeParser.parseMessageType(
        readContext.getReadSupportMetadata().get(DataWritableReadSupport.HIVE_SCHEMA_KEY)
        ).getFieldCount();

    return new ParquetInputSplit(
        finalPath,
        splitStart,
        splitStart + splitLength,
        splitLength,
        fileSplit.getLocations(),
        null);
  } else {
    throw new IllegalArgumentException("Unknown split type: " + oldSplit);
  }
}
Example #13
Source File: TestInputFormat.java From parquet-mr with Apache License 2.0
@Before
public void setUp() {
  blocks = new ArrayList<BlockMetaData>();
  for (int i = 0; i < 10; i++) {
    blocks.add(newBlock(i * 10, 10));
  }
  schema = MessageTypeParser.parseMessageType("message doc { required binary foo; }");
  fileMetaData = new FileMetaData(schema, new HashMap<String, String>(), "parquet-mr");
}
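The fixtures in Examples #1 and #13 construct a FileMetaData directly from its three constructor arguments: a MessageType schema, a key/value metadata map, and a created-by string. A minimal standalone sketch of the same construction, using the Types builder instead of the schema parser (the class name and values here are illustrative, not taken from the projects above):

import java.util.Collections;

import org.apache.parquet.hadoop.metadata.FileMetaData;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

public class FileMetaDataSketch {
  public static void main(String[] args) {
    // equivalent to MessageTypeParser.parseMessageType("message doc { required binary foo; }")
    MessageType schema = Types.buildMessage()
        .required(PrimitiveTypeName.BINARY).named("foo")
        .named("doc");

    // schema + application key/value metadata + created-by string
    FileMetaData meta = new FileMetaData(schema, Collections.<String, String>emptyMap(), "test");
    System.out.println(meta.getSchema());
    System.out.println(meta.getCreatedBy());
  }
}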
Example #14
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata,
    boolean strict) {
  MessageType schema = null;
  Map<String, Set<String>> newKeyValues = new HashMap<String, Set<String>>();
  Set<String> createdBy = new HashSet<String>();
  if (mergedMetadata != null) {
    schema = mergedMetadata.getSchema();
    newKeyValues.putAll(mergedMetadata.getKeyValueMetaData());
    createdBy.addAll(mergedMetadata.getCreatedBy());
  }
  if ((schema == null && toMerge.getSchema() != null)
      || (schema != null && !schema.equals(toMerge.getSchema()))) {
    schema = mergeInto(toMerge.getSchema(), schema, strict);
  }
  for (Entry<String, String> entry : toMerge.getKeyValueMetaData().entrySet()) {
    Set<String> values = newKeyValues.get(entry.getKey());
    if (values == null) {
      values = new LinkedHashSet<String>();
      newKeyValues.put(entry.getKey(), values);
    }
    values.add(entry.getValue());
  }
  createdBy.add(toMerge.getCreatedBy());
  return new GlobalMetaData(
      schema,
      newKeyValues,
      createdBy);
}
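Note that each key's values are accumulated into a LinkedHashSet rather than overwritten, so differing values from different files survive side by side in the GlobalMetaData; it is GlobalMetaData.merge() (used in Example #7) that later collapses them back into a single FileMetaData, throwing if they are not compatible.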
Example #15
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
private static void serializeFooter(ParquetMetadata footer, PositionOutputStream out) throws IOException {
  long footerIndex = out.getPos();
  ParquetMetadataConverter metadataConverter = new ParquetMetadataConverter();
  org.apache.parquet.format.FileMetaData parquetMetadata = metadataConverter.toParquetMetadata(CURRENT_VERSION, footer);
  writeFileMetaData(parquetMetadata, out);
  LOG.debug("{}: footer length = {}", out.getPos(), (out.getPos() - footerIndex));
  BytesUtils.writeIntLittleEndian(out, (int) (out.getPos() - footerIndex));
  out.write(MAGIC);
}
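This also documents the physical layout of the end of a Parquet file: the serialized file metadata, followed by a 4-byte little-endian footer length and the trailing "PAR1" magic bytes.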
Example #16
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * ends a file once all blocks have been written.
 * closes the file.
 * @param extraMetaData the extra meta data to write in the footer
 * @throws IOException if there is an error while writing
 */
public void end(Map<String, String> extraMetaData) throws IOException {
  state = state.end();
  serializeColumnIndexes(columnIndexes, blocks, out);
  serializeOffsetIndexes(offsetIndexes, blocks, out);
  serializeBloomFilters(bloomFilters, blocks, out);
  LOG.debug("{}: end", out.getPos());
  this.footer = new ParquetMetadata(new FileMetaData(schema, extraMetaData, Version.FULL_VERSION), blocks);
  serializeFooter(footer, out);
  out.close();
}
Example #17
Source File: InternalParquetRecordReader.java From parquet-mr with Apache License 2.0
public void initialize(ParquetFileReader reader, ParquetReadOptions options) {
  // copy custom configuration to the Configuration passed to the ReadSupport
  Configuration conf = new Configuration();
  if (options instanceof HadoopReadOptions) {
    conf = ((HadoopReadOptions) options).getConf();
  }
  for (String property : options.getPropertyNames()) {
    conf.set(property, options.getProperty(property));
  }

  // initialize a ReadContext for this file
  this.reader = reader;
  FileMetaData parquetFileMetadata = reader.getFooter().getFileMetaData();
  this.fileSchema = parquetFileMetadata.getSchema();
  Map<String, String> fileMetadata = parquetFileMetadata.getKeyValueMetaData();
  ReadSupport.ReadContext readContext = readSupport.init(new InitContext(conf, toSetMultiMap(fileMetadata), fileSchema));
  this.columnIOFactory = new ColumnIOFactory(parquetFileMetadata.getCreatedBy());
  this.requestedSchema = readContext.getRequestedSchema();
  this.columnCount = requestedSchema.getPaths().size();
  // Setting the projection schema before running any filtering (e.g. getting filtered record count)
  // because projection impacts filtering
  reader.setRequestedSchema(requestedSchema);
  this.recordConverter = readSupport.prepareForRead(conf, fileMetadata, fileSchema, readContext);
  this.strictTypeChecking = options.isEnabled(STRICT_TYPE_CHECKING, true);
  this.total = reader.getFilteredRecordCount();
  this.unmaterializableRecordCounter = new UnmaterializableRecordCounter(options, total);
  this.filterRecords = options.useRecordFilter();
  LOG.info("RecordReader initialized will read a total of {} records.", total);
}
Example #18
Source File: MergeCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));

  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);

  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
      mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input : inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n", input,
          input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }

    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }

  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
        "Although the size of the merged file is bigger, it STILL contains small row groups, " +
        "thus you don't have the advantage of big row groups, " +
        "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
Example #19
Source File: PruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  List<String> args = options.getArgList();
  Path inputFile = new Path(args.get(0));
  Path outputFile = new Path(args.get(1));
  List<String> cols = args.subList(2, args.size());

  Set<ColumnPath> prunePaths = convertToColumnPaths(cols);

  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
  FileMetaData metaData = pmd.getFileMetaData();
  MessageType schema = metaData.getSchema();
  List<String> paths = new ArrayList<>();
  getPaths(schema, paths, null);

  for (String col : cols) {
    if (!paths.contains(col)) {
      LOG.warn("Input column name {} doesn't show up in the schema of file {}", col, inputFile.getName());
    }
  }

  ParquetFileWriter writer = new ParquetFileWriter(conf,
      pruneColumnsInSchema(schema, prunePaths), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  writer.appendFile(HadoopInputFile.fromPath(inputFile, conf));
  writer.end(metaData.getKeyValueMetaData());
}
Example #20
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Reads the original schema from the parquet file.
 *
 * @param parquetFile the path to the parquet file
 * @param fileSplit the file split we are accessing
 * @return the original schema from the parquet file
 * @throws IOException when there's an IOException while reading the schema
 */
private MessageType getSchema(Path parquetFile, FileSplit fileSplit) throws IOException {
  final long then = System.nanoTime();
  ParquetMetadataConverter.MetadataFilter filter = ParquetMetadataConverter.range(
      fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength());
  ParquetReadOptions parquetReadOptions = HadoopReadOptions
      .builder(configuration)
      .withMetadataFilter(filter)
      .build();
  HadoopInputFile inputFile = HadoopInputFile.fromPath(parquetFile, configuration);
  try (ParquetFileReader parquetFileReader =
           ParquetFileReader.open(inputFile, parquetReadOptions)) {
    FileMetaData metadata = parquetFileReader.getFileMetaData();
    if (LOG.isDebugEnabled()) {
      LOG.debug("{}-{}: Reading file {} with {} records in {} RowGroups",
          context.getTransactionId(), context.getSegmentId(), parquetFile.getName(),
          parquetFileReader.getRecordCount(), parquetFileReader.getRowGroups().size());
    }

    final long millis = TimeUnit.NANOSECONDS.toMillis(System.nanoTime() - then);
    LOG.debug("{}-{}: Read schema in {} ms", context.getTransactionId(),
        context.getSegmentId(), millis);
    return metadata.getSchema();
  } catch (Exception e) {
    throw new IOException(e);
  }
}
Example #21
Source File: ParquetFileReader.java From parquet-mr with Apache License 2.0
public FileMetaData getFileMetaData() {
  if (fileMetaData != null) {
    return fileMetaData;
  }
  return getFooter().getFileMetaData();
}
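In other words, a reader constructed with explicit metadata (as in the deprecated constructor of Example #9) returns it directly, and only otherwise falls back to the metadata parsed from the file footer.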
Example #22
Source File: ColumnIndexValidator.java From parquet-mr with Apache License 2.0
public static List<ContractViolation> checkContractViolations(InputFile file) throws IOException {
  List<ContractViolation> violations = new ArrayList<>();
  try (ParquetFileReader reader = ParquetFileReader.open(file)) {
    FileMetaData meta = reader.getFooter().getFileMetaData();
    MessageType schema = meta.getSchema();
    List<ColumnDescriptor> columns = schema.getColumns();

    List<BlockMetaData> blocks = reader.getFooter().getBlocks();
    int rowGroupNumber = 0;
    PageReadStore rowGroup = reader.readNextRowGroup();
    while (rowGroup != null) {
      ColumnReadStore columnReadStore = new ColumnReadStoreImpl(rowGroup,
          new DummyRecordConverter(schema).getRootConverter(), schema, null);
      List<ColumnChunkMetaData> columnChunks = blocks.get(rowGroupNumber).getColumns();
      assert (columnChunks.size() == columns.size());
      for (int columnNumber = 0; columnNumber < columns.size(); ++columnNumber) {
        ColumnDescriptor column = columns.get(columnNumber);
        ColumnChunkMetaData columnChunk = columnChunks.get(columnNumber);
        ColumnIndex columnIndex = reader.readColumnIndex(columnChunk);
        if (columnIndex == null) {
          continue;
        }
        ColumnPath columnPath = columnChunk.getPath();
        OffsetIndex offsetIndex = reader.readOffsetIndex(columnChunk);
        List<ByteBuffer> minValues = columnIndex.getMinValues();
        List<ByteBuffer> maxValues = columnIndex.getMaxValues();
        BoundaryOrder boundaryOrder = columnIndex.getBoundaryOrder();
        List<Long> nullCounts = columnIndex.getNullCounts();
        List<Boolean> nullPages = columnIndex.getNullPages();
        long rowNumber = 0;
        ColumnReader columnReader = columnReadStore.getColumnReader(column);
        ByteBuffer prevMinValue = null;
        ByteBuffer prevMaxValue = null;
        for (int pageNumber = 0; pageNumber < offsetIndex.getPageCount(); ++pageNumber) {
          boolean isNullPage = nullPages.get(pageNumber);
          ByteBuffer minValue = minValues.get(pageNumber);
          ByteBuffer maxValue = maxValues.get(pageNumber);
          PageValidator pageValidator = new PageValidator(
              column.getPrimitiveType(),
              rowGroupNumber, columnNumber, columnPath, pageNumber,
              violations, columnReader,
              minValue, maxValue,
              prevMinValue, prevMaxValue,
              boundaryOrder,
              nullCounts.get(pageNumber), isNullPage);
          if (!isNullPage) {
            prevMinValue = minValue;
            prevMaxValue = maxValue;
          }
          long lastRowNumberInPage = offsetIndex.getLastRowIndex(pageNumber, rowGroup.getRowCount());
          while (rowNumber <= lastRowNumberInPage) {
            pageValidator.validateValuesBelongingToRow();
            ++rowNumber;
          }
          pageValidator.finishPage();
        }
      }
      rowGroup = reader.readNextRowGroup();
      rowGroupNumber++;
    }
  }
  return violations;
}
Example #23
Source File: MergeCommand.java From parquet-mr with Apache License 2.0
private FileMetaData mergedMetadata(List<Path> inputFiles) throws IOException {
  return ParquetFileWriter.mergeMetadataFiles(inputFiles, conf).getFileMetaData();
}
Example #24
Source File: ParquetFileWriter.java From parquet-mr with Apache License 2.0
/**
 * Will return the result of merging toMerge into mergedMetadata
 * @param toMerge the metadata toMerge
 * @param mergedMetadata the reference metadata to merge into
 * @return the result of the merge
 */
static GlobalMetaData mergeInto(
    FileMetaData toMerge,
    GlobalMetaData mergedMetadata) {
  return mergeInto(toMerge, mergedMetadata, true);
}