org.apache.parquet.hadoop.ParquetFileWriter Java Examples
The following examples show how to use
org.apache.parquet.hadoop.ParquetFileWriter.
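Before the individual examples, here is a minimal sketch of the low-level lifecycle that most of them follow: construct a ParquetFileWriter, call start(), copy or write data, then call end() with the file's key/value metadata. This is an illustrative sketch only; the class name and the way the paths are obtained are placeholders and are not taken from any of the projects listed below.

// Minimal sketch (illustrative, not from any of the projects below): copy a Parquet file
// by appending its row groups to a new file written with ParquetFileWriter.
import java.io.IOException;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.format.converter.ParquetMetadataConverter;
import org.apache.parquet.hadoop.ParquetFileReader;
import org.apache.parquet.hadoop.ParquetFileWriter;
import org.apache.parquet.hadoop.metadata.ParquetMetadata;
import org.apache.parquet.hadoop.util.HadoopInputFile;
import org.apache.parquet.schema.MessageType;

public class ParquetFileWriterSketch {
  public static void copy(Configuration conf, Path inPath, Path outPath) throws IOException {
    // Read the source footer to reuse its schema and key/value metadata.
    ParquetMetadata footer = ParquetFileReader.readFooter(conf, inPath, ParquetMetadataConverter.NO_FILTER);
    MessageType schema = footer.getFileMetaData().getSchema();

    ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
    writer.start();                                              // writes the leading magic bytes
    writer.appendFile(HadoopInputFile.fromPath(inPath, conf));   // copies row groups without re-encoding
    writer.end(footer.getFileMetaData().getKeyValueMetaData());  // writes the footer and closes the file
  }
}

The higher-level examples below drive the same lifecycle indirectly through ParquetWriter builders (for example withWriteMode(ParquetFileWriter.Mode.OVERWRITE)), while the lower-level ones call startBlock/startColumn/endColumn/endBlock on ParquetFileWriter directly.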
Example #1
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(input != null && output != null,
      "Both input and output parquet file paths are required.");
  Preconditions.checkArgument(codec != null,
      "The codec cannot be null");

  Path inPath = new Path(input);
  Path outPath = new Path(output);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(getConf(), inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(getConf(), schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, getConf()), HadoopReadOptions.builder(getConf()).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
  return 0;
}
Example #2
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0
/**
 * Map full schema paths in format `a`.`b`.`c` to respective SchemaElement objects.
 *
 * @param footer Parquet file metadata
 * @return schema full path to SchemaElement map
 */
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  Map<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);

  Iterator<SchemaElement> iter = fileMetaData.getSchema().iterator();

  // First element in collection is default `root` element. We skip it to maintain key in `a` format instead of `root`.`a`,
  // and thus to avoid the need to cut it out again when comparing with SchemaPath string representation
  if (iter.hasNext()) {
    iter.next();
  }
  while (iter.hasNext()) {
    addSchemaElementMapping(iter, new StringBuilder(), schemaElements);
  }
  return schemaElements;
}
Example #3
Source File: ParquetFormatPlugin.java From Bats with Apache License 2.0
boolean isDirReadable(DrillFileSystem fs, FileStatus dir) {
  Path p = new Path(dir.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
  try {
    if (fs.exists(p)) {
      return true;
    } else {
      if (metaDataFileExists(fs, dir)) {
        return true;
      }
      List<FileStatus> statuses = DrillFileSystemUtil.listFiles(fs, dir.getPath(), false);
      return !statuses.isEmpty() && super.isFileReadable(fs, statuses.get(0));
    }
  } catch (IOException e) {
    logger.info("Failure while attempting to check for Parquet metadata file.", e);
    return false;
  }
}
Example #4
Source File: ConvertCsvToParquetFileExpressionProcessor.java From vividus with Apache License 2.0
private void write(File file, String avroSchemaPath, List<Map<String, String>> data) throws IOException {
  Schema schema = new Parser().parse(ResourceUtils.loadResource(avroSchemaPath));
  try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
      .<GenericRecord>builder(new Path(file.toURI()))
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withDataModel(GenericData.get())
      .withSchema(schema)
      .build()) {
    for (Map<String, String> map : data) {
      GenericRecord record = new GenericData.Record(schema);
      map.forEach(record::put);
      writer.write(record);
    }
  }
}
Example #5
Source File: CompressionConveterTest.java From parquet-mr with Apache License 2.0
private void convertCompression(Configuration conf, String inputFile, String outputFile, String codec) throws IOException {
  Path inPath = new Path(inputFile);
  Path outPath = new Path(outputFile);
  CompressionCodecName codecName = CompressionCodecName.valueOf(codec);

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #6
Source File: CompressionConverter.java From parquet-mr with Apache License 2.0
public void processBlocks(TransParquetFileReader reader, ParquetFileWriter writer, ParquetMetadata meta,
                          MessageType schema, String createdBy, CompressionCodecName codecName) throws IOException {
  int blockIndex = 0;
  PageReadStore store = reader.readNextRowGroup();
  while (store != null) {
    writer.startBlock(store.getRowCount());
    BlockMetaData blockMetaData = meta.getBlocks().get(blockIndex);
    List<ColumnChunkMetaData> columnsInOrder = blockMetaData.getColumns();
    Map<ColumnPath, ColumnDescriptor> descriptorsMap = schema.getColumns().stream().collect(
        Collectors.toMap(x -> ColumnPath.get(x.getPath()), x -> x));
    for (int i = 0; i < columnsInOrder.size(); i += 1) {
      ColumnChunkMetaData chunk = columnsInOrder.get(i);
      ColumnReadStoreImpl crstore = new ColumnReadStoreImpl(store, new DummyGroupConverter(), schema, createdBy);
      ColumnDescriptor columnDescriptor = descriptorsMap.get(chunk.getPath());
      writer.startColumn(columnDescriptor, crstore.getColumnReader(columnDescriptor).getTotalValueCount(), codecName);
      processChunk(reader, writer, chunk, createdBy, codecName);
      writer.endColumn();
    }
    writer.endBlock();
    store = reader.readNextRowGroup();
    blockIndex++;
  }
}
Example #7
Source File: ParquetFileReaderTest.java From kafka-connect-fs with Apache License 2.0
@Override
protected Path createDataFile(ReaderFsTestConfig fsConfig, Object... args) throws IOException {
  FileSystem fs = fsConfig.getFs();
  File parquetFile = File.createTempFile("test-", "." + getFileExtension());
  try (ParquetWriter writer = AvroParquetWriter.<GenericRecord>builder(new Path(parquetFile.toURI()))
      .withConf(fs.getConf()).withWriteMode(ParquetFileWriter.Mode.OVERWRITE).withSchema(readerSchema).build()) {
    IntStream.range(0, NUM_RECORDS).forEach(index -> {
      GenericRecord datum = new GenericData.Record(readerSchema);
      datum.put(FIELD_INDEX, index);
      String uuid = UUID.randomUUID().toString();
      datum.put(FIELD_NAME, String.format("%d_name_%s", index, uuid));
      datum.put(FIELD_SURNAME, String.format("%d_surname_%s", index, uuid));
      try {
        fsConfig.offsetsByIndex().put(index, (long) index);
        writer.write(datum);
      } catch (IOException ioe) {
        throw new RuntimeException(ioe);
      }
    });
  }
  Path path = new Path(new Path(fsConfig.getFsUri()), parquetFile.getName());
  fs.moveFromLocalFile(new Path(parquetFile.getAbsolutePath()), path);
  return path;
}
Example #8
Source File: TransCompressionCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  super.execute(options);
  List<String> args = options.getArgList();
  Path inPath = new Path(args.get(0));
  Path outPath = new Path(args.get(1));
  CompressionCodecName codecName = CompressionCodecName.valueOf(args.get(2));

  ParquetMetadata metaData = ParquetFileReader.readFooter(conf, inPath, NO_FILTER);
  MessageType schema = metaData.getFileMetaData().getSchema();
  ParquetFileWriter writer = new ParquetFileWriter(conf, schema, outPath, ParquetFileWriter.Mode.CREATE);
  writer.start();

  try (TransParquetFileReader reader = new TransParquetFileReader(
      HadoopInputFile.fromPath(inPath, conf), HadoopReadOptions.builder(conf).build())) {
    compressionConverter.processBlocks(reader, writer, metaData, schema,
        metaData.getFileMetaData().getCreatedBy(), codecName);
  } finally {
    writer.end(metaData.getFileMetaData().getKeyValueMetaData());
  }
}
Example #9
Source File: ParquetHdfsFileSink.java From components with Apache License 2.0
@Override
protected void mergeOutput(FileSystem fs, String sourceFolder, String targetFile) throws IOException {
  FileStatus[] sourceStatuses = FileSystemUtil.listSubFiles(fs, sourceFolder);
  List<Path> sourceFiles = new ArrayList<>();
  for (FileStatus sourceStatus : sourceStatuses) {
    sourceFiles.add(sourceStatus.getPath());
  }
  FileMetaData mergedMeta = ParquetFileWriter.mergeMetadataFiles(sourceFiles, fs.getConf()).getFileMetaData();
  ParquetFileWriter writer = new ParquetFileWriter(fs.getConf(), mergedMeta.getSchema(), new Path(targetFile),
      ParquetFileWriter.Mode.CREATE);
  writer.start();
  for (Path input : sourceFiles) {
    writer.appendFile(fs.getConf(), input);
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
Example #10
Source File: HoodieParquetWriter.java From hudi with Apache License 2.0
public HoodieParquetWriter(String instantTime, Path file, HoodieParquetConfig parquetConfig,
    Schema schema, SparkTaskContextSupplier sparkTaskContextSupplier) throws IOException {
  super(HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf()),
      ParquetFileWriter.Mode.CREATE, parquetConfig.getWriteSupport(), parquetConfig.getCompressionCodecName(),
      parquetConfig.getBlockSize(), parquetConfig.getPageSize(), parquetConfig.getPageSize(),
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  this.file = HoodieWrapperFileSystem.convertToHoodiePath(file, parquetConfig.getHadoopConf());
  this.fs = (HoodieWrapperFileSystem) this.file.getFileSystem(FSUtils.registerFileSystem(file, parquetConfig.getHadoopConf()));
  // We cannot accurately measure the snappy compressed output file size. We are choosing a
  // conservative 10%
  // TODO - compute this compression ratio dynamically by looking at the bytes written to the
  // stream and the actual file size reported by HDFS
  this.maxFileSize = parquetConfig.getMaxFileSize()
      + Math.round(parquetConfig.getMaxFileSize() * parquetConfig.getCompressionRatio());
  this.writeSupport = parquetConfig.getWriteSupport();
  this.instantTime = instantTime;
  this.sparkTaskContextSupplier = sparkTaskContextSupplier;
}
Example #11
Source File: FooterGatherer.java From Bats with Apache License 2.0
private static void checkMagicBytes(FileStatus status, byte[] data, int offset) throws IOException {
  for (int i = 0, v = offset; i < MAGIC_LENGTH; i++, v++) {
    if (ParquetFileWriter.MAGIC[i] != data[v]) {
      byte[] magic = ArrayUtils.subarray(data, offset, offset + MAGIC_LENGTH);
      throw new IOException(status.getPath() + " is not a Parquet file. expected magic number at tail "
          + Arrays.toString(ParquetFileWriter.MAGIC) + " but found " + Arrays.toString(magic));
    }
  }
}
Example #12
Source File: TestFiltersWithMissingColumns.java From parquet-mr with Apache License 2.0
@Before
public void createDataFile() throws Exception {
  File file = temp.newFile("test.parquet");
  this.path = new Path(file.toString());

  MessageType type = Types.buildMessage()
      .required(INT64).named("id")
      .required(BINARY).as(UTF8).named("data")
      .named("test");

  SimpleGroupFactory factory = new SimpleGroupFactory(type);

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(path)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withType(type)
      .build();

  try {
    for (long i = 0; i < 1000; i += 1) {
      Group g = factory.newGroup();
      g.add(0, i);
      g.add(1, "data-" + i);
      writer.write(g);
    }
  } finally {
    writer.close();
  }
}
Example #13
Source File: PageChecksumDataGenerator.java From parquet-mr with Apache License 2.0
public void generateData(Path outFile, int nRows, boolean writeChecksums,
                         CompressionCodecName compression) throws IOException {
  if (exists(configuration, outFile)) {
    System.out.println("File already exists " + outFile);
    return;
  }

  ParquetWriter<Group> writer = ExampleParquetWriter.builder(outFile)
      .withConf(configuration)
      .withWriteMode(ParquetFileWriter.Mode.OVERWRITE)
      .withCompressionCodec(compression)
      .withDictionaryEncoding(true)
      .withType(SCHEMA)
      .withPageWriteChecksumEnabled(writeChecksums)
      .build();

  GroupFactory groupFactory = new SimpleGroupFactory(SCHEMA);
  Random rand = new Random(42);
  for (int i = 0; i < nRows; i++) {
    Group group = groupFactory.newGroup();
    group
        .append("long_field", (long) i)
        .append("binary_field", randomUUID().toString())
        .addGroup("group")
        // Force dictionary encoding by performing modulo
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100)
        .append("int_field", rand.nextInt() % 100);
    writer.write(group);
  }

  writer.close();
}
Example #14
Source File: MergeCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  // Prepare arguments
  List<String> args = options.getArgList();
  List<Path> inputFiles = getInputFiles(args.subList(0, args.size() - 1));
  Path outputFile = new Path(args.get(args.size() - 1));

  // Merge schema and extraMeta
  FileMetaData mergedMeta = mergedMetadata(inputFiles);
  PrintWriter out = new PrintWriter(Main.out, true);

  // Merge data
  ParquetFileWriter writer = new ParquetFileWriter(conf,
      mergedMeta.getSchema(), outputFile, ParquetFileWriter.Mode.CREATE);
  writer.start();
  boolean tooSmallFilesMerged = false;
  for (Path input : inputFiles) {
    if (input.getFileSystem(conf).getFileStatus(input).getLen() < TOO_SMALL_FILE_THRESHOLD) {
      out.format("Warning: file %s is too small, length: %d\n", input,
          input.getFileSystem(conf).getFileStatus(input).getLen());
      tooSmallFilesMerged = true;
    }

    writer.appendFile(HadoopInputFile.fromPath(input, conf));
  }

  if (tooSmallFilesMerged) {
    out.println("Warning: you merged too small files. " +
        "Although the size of the merged file is bigger, it STILL contains small row groups, thus you don't have the advantage of big row groups, " +
        "which usually leads to bad query performance!");
  }
  writer.end(mergedMeta.getKeyValueMetaData());
}
Example #15
Source File: PruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Override
public void execute(CommandLine options) throws Exception {
  List<String> args = options.getArgList();
  Path inputFile = new Path(args.get(0));
  Path outputFile = new Path(args.get(1));
  List<String> cols = args.subList(2, args.size());

  Set<ColumnPath> prunePaths = convertToColumnPaths(cols);

  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, inputFile, ParquetMetadataConverter.NO_FILTER);
  FileMetaData metaData = pmd.getFileMetaData();
  MessageType schema = metaData.getSchema();
  List<String> paths = new ArrayList<>();
  getPaths(schema, paths, null);

  for (String col : cols) {
    if (!paths.contains(col)) {
      LOG.warn("Input column name {} doesn't show up in the schema of file {}", col, inputFile.getName());
    }
  }

  ParquetFileWriter writer = new ParquetFileWriter(conf,
      pruneColumnsInSchema(schema, prunePaths), outputFile, ParquetFileWriter.Mode.CREATE);

  writer.start();
  writer.appendFile(HadoopInputFile.fromPath(inputFile, conf));
  writer.end(metaData.getKeyValueMetaData());
}
Example #16
Source File: ParquetOutputPlugin.java From embulk-output-parquet with MIT License
private ParquetWriter<PageReader> createWriter(PluginTask task, Schema schema, int processorIndex) {
  // In case of using Frankfurt (eu-central-1) with Signature Version 4 Signing Process
  System.setProperty(SDKGlobalConfiguration.ENABLE_S3_SIGV4_SYSTEM_PROPERTY, task.getSignature());

  final TimestampFormatter[] timestampFormatters = Timestamps.newTimestampColumnFormatters(task, schema, task.getColumnOptions());
  final boolean addUTF8 = task.getAddUTF8();

  final Path path = new Path(buildPath(task, processorIndex));
  final CompressionCodecName codec = CompressionCodecName.valueOf(task.getCompressionCodec());
  final int blockSize = task.getBlockSize();
  final int pageSize = task.getPageSize();
  final Configuration conf = createConfiguration(task.getExtraConfigurations(), task.getConfigFiles());
  final boolean overwrite = task.getOverwrite();

  ParquetWriter<PageReader> writer = null;
  try {
    EmbulkWriterBuilder builder = new EmbulkWriterBuilder(path, schema, timestampFormatters, addUTF8)
        .withCompressionCodec(codec)
        .withRowGroupSize(blockSize)
        .withPageSize(pageSize)
        .withDictionaryPageSize(pageSize)
        .withConf(conf);

    if (overwrite) {
      builder.withWriteMode(ParquetFileWriter.Mode.OVERWRITE);
    }

    writer = builder.build();
  } catch (IOException e) {
    Throwables.propagate(e);
  }

  return writer;
}
Example #17
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0
/**
 * Helper method to create a new {@link ParquetFileWriter} as impersonated user.
 * @throws IOException
 */
private void initRecordWriter() throws IOException {
  this.path = fs.canonicalizePath(partition.qualified(location, prefix + "_" + index + "." + extension));
  parquetFileWriter = new ParquetFileWriter(OutputFile.of(fs, path), checkNotNull(schema), ParquetFileWriter.Mode.CREATE,
      DEFAULT_BLOCK_SIZE, MAX_PADDING_SIZE_DEFAULT, DEFAULT_COLUMN_INDEX_TRUNCATE_LENGTH, true);
  parquetFileWriter.start();
}
Example #18
Source File: SingletonParquetFooterCache.java From dremio-oss with Apache License 2.0
private static void checkMagicBytes(String path, byte[] data, int offset) throws IOException {
  for (int i = 0, v = offset; i < MAGIC_LENGTH; i++, v++) {
    if (ParquetFileWriter.MAGIC[i] != data[v]) {
      byte[] magic = ArrayUtils.subarray(data, offset, offset + MAGIC_LENGTH);
      throw new IOException(path + " is not a Parquet file. expected magic number at tail "
          + Arrays.toString(ParquetFileWriter.MAGIC) + " but found " + Arrays.toString(magic));
    }
  }
}
Example #19
Source File: ParquetReaderUtility.java From dremio-oss with Apache License 2.0
public static Map<String, SchemaElement> getColNameToSchemaElementMapping(ParquetMetadata footer) {
  HashMap<String, SchemaElement> schemaElements = new HashMap<>();
  FileMetaData fileMetaData = new ParquetMetadataConverter().toParquetMetadata(ParquetFileWriter.CURRENT_VERSION, footer);
  for (SchemaElement se : fileMetaData.getSchema()) {
    schemaElements.put(se.getName(), se);
  }
  return schemaElements;
}
Example #20
Source File: DataLoad.java From arvo2parquet with MIT License
private static void extractMetaDataFooter(final Path parquetFilePath) throws IOException {
  try (final ParquetFileReader rdr = ParquetFileReader.open(nioPathToInputFile(parquetFilePath))) {
    final ParquetMetadata footer = rdr.getFooter();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE + "_dup.parquet");
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(footer, out);
    }
  }
}
Example #21
Source File: DataLoad.java From arvo2parquet with MIT License
private static void writeToParquet(@Nonnull final Schema schema,
                                   @Nonnull final Path fileToWrite,
                                   @Nonnull final GenericDataRecordSink sink) throws IOException {
  try (final ParquetWriter<GenericData.Record> writer = createParquetWriterInstance(schema, fileToWrite)) {
    //noinspection StatementWithEmptyBody
    do ; while (sink.accept(writer::write));
    writer.close();
    final Path metaDataOutPath = Paths.get(ParquetFileWriter.PARQUET_METADATA_FILE);
    Files.deleteIfExists(metaDataOutPath);
    try (final PositionOutputStream out = nioPathToOutputFile(metaDataOutPath).createOrOverwrite(0)) {
      serializeFooter(writer.getFooter(), out);
    }
  }
}
Example #22
Source File: ProtoParquetWriterWithOffset.java From garmadon with Apache License 2.0
protected void mergeToFinalPath(Path lastAvailableFinalPath, Path finalPath) throws IOException {
  try (ParquetFileReader reader = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath)) {
    MessageType schema = reader.getFileMetaData().getSchema();
    if (!checkSchemaEquality(schema)) {
      LOGGER.warn("Schema between last available final file ({}) and temp file ({}) are not identical. We can't merge them",
          lastAvailableFinalPath, temporaryHdfsPath);
      moveToFinalPath(temporaryHdfsPath, finalPath);
    } else {
      Path mergedTempFile = new Path(temporaryHdfsPath.toString() + ".merged");

      if (fs.isFile(mergedTempFile)) fs.delete(mergedTempFile, false);

      Map<String, String> existingMetadata = reader.getFileMetaData().getKeyValueMetaData();
      Map<String, String> newMetadata = new HashMap<>(existingMetadata);
      newMetadata.put(LATEST_TIMESTAMP_META_KEY, String.valueOf(latestTimestamp));

      ParquetFileWriter writerPF = new ParquetFileWriter(fs.getConf(), schema, mergedTempFile);
      writerPF.start();

      try (
          ParquetFileReader dest = ParquetFileReader.open(fs.getConf(), lastAvailableFinalPath);
          ParquetFileReader temp = ParquetFileReader.open(fs.getConf(), temporaryHdfsPath)
      ) {
        dest.appendTo(writerPF);
        temp.appendTo(writerPF);
        writerPF.end(newMetadata);
      }

      moveToFinalPath(mergedTempFile, lastAvailableFinalPath);
      try {
        fs.delete(temporaryHdfsPath, false);
        // This file is in a temp folder that should be deleted at exit so we should not throw exception here
      } catch (IOException ignored) {
      }
    }
  }
}
Example #23
Source File: FooterGatherer.java From Bats with Apache License 2.0
/**
 * An updated footer reader that tries to read the entire footer without knowing the length.
 * This should reduce the amount of seek/read roundtrips in most workloads.
 *
 * @param fs
 * @param status
 * @return
 * @throws IOException
 */
public static Footer readFooter(final Configuration config, final FileStatus status) throws IOException {
  final FileSystem fs = status.getPath().getFileSystem(config);
  try (FSDataInputStream file = fs.open(status.getPath())) {

    final long fileLength = status.getLen();
    Preconditions.checkArgument(fileLength >= MIN_FILE_SIZE, "%s is not a Parquet file (too small)", status.getPath());

    int len = (int) Math.min(fileLength, (long) DEFAULT_READ_SIZE);
    byte[] footerBytes = new byte[len];
    readFully(file, fileLength - len, footerBytes, 0, len);

    checkMagicBytes(status, footerBytes, footerBytes.length - ParquetFileWriter.MAGIC.length);
    final int size = BytesUtils.readIntLittleEndian(footerBytes, footerBytes.length - FOOTER_METADATA_SIZE);

    if (size > footerBytes.length - FOOTER_METADATA_SIZE) {
      // if the footer is larger than our initial read, we need to read the rest.
      byte[] origFooterBytes = footerBytes;
      int origFooterRead = origFooterBytes.length - FOOTER_METADATA_SIZE;

      footerBytes = new byte[size];

      readFully(file, fileLength - size - FOOTER_METADATA_SIZE, footerBytes, 0, size - origFooterRead);
      System.arraycopy(origFooterBytes, 0, footerBytes, size - origFooterRead, origFooterRead);
    } else {
      int start = footerBytes.length - (size + FOOTER_METADATA_SIZE);
      footerBytes = ArrayUtils.subarray(footerBytes, start, start + size);
    }

    final ByteArrayInputStream from = new ByteArrayInputStream(footerBytes);
    ParquetMetadata metadata = ParquetFormatPlugin.parquetMetadataConverter.readParquetMetadata(from, NO_FILTER);
    Footer footer = new Footer(status.getPath(), metadata);
    return footer;
  }
}
Example #24
Source File: FooterGatherer.java From Bats with Apache License 2.0
public static List<Footer> getFooters(final Configuration conf, List<FileStatus> statuses, int parallelism) throws IOException {
  final List<TimedCallable<Footer>> readers = new ArrayList<>();
  final List<Footer> foundFooters = new ArrayList<>();
  for (FileStatus status : statuses) {

    if (status.isDirectory()) {
      // first we check for summary file.
      FileSystem fs = status.getPath().getFileSystem(conf);

      final Path summaryPath = new Path(status.getPath(), ParquetFileWriter.PARQUET_METADATA_FILE);
      if (fs.exists(summaryPath)) {
        FileStatus summaryStatus = fs.getFileStatus(summaryPath);
        foundFooters.addAll(ParquetFileReader.readSummaryFile(conf, summaryStatus));
        continue;
      }

      // else we handle as normal file.
      for (FileStatus inStatus : DrillFileSystemUtil.listFiles(fs, status.getPath(), false)) {
        readers.add(new FooterReader(conf, inStatus));
      }
    } else {
      readers.add(new FooterReader(conf, status));
    }
  }
  if (!readers.isEmpty()) {
    foundFooters.addAll(TimedCallable.run("Fetch Parquet Footers", logger, readers, parallelism));
  }

  return foundFooters;
}
Example #25
Source File: MergeCommand.java From parquet-mr with Apache License 2.0
private FileMetaData mergedMetadata(List<Path> inputFiles) throws IOException {
  return ParquetFileWriter.mergeMetadataFiles(inputFiles, conf).getFileMetaData();
}
Example #26
Source File: Parquet.java From iceberg with Apache License 2.0
public WriteBuilder overwrite(boolean enabled) {
  this.writeMode = enabled ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE;
  return this;
}
Example #27
Source File: CompressionConverter.java From parquet-mr with Apache License 2.0
private void processChunk(TransParquetFileReader reader, ParquetFileWriter writer, ColumnChunkMetaData chunk,
                          String createdBy, CompressionCodecName codecName) throws IOException {
  CompressionCodecFactory codecFactory = HadoopCodecs.newFactory(0);
  CompressionCodecFactory.BytesInputDecompressor decompressor = codecFactory.getDecompressor(chunk.getCodec());
  CompressionCodecFactory.BytesInputCompressor compressor = codecFactory.getCompressor(codecName);
  ColumnIndex columnIndex = reader.readColumnIndex(chunk);
  OffsetIndex offsetIndex = reader.readOffsetIndex(chunk);

  reader.setStreamPosition(chunk.getStartingPos());
  DictionaryPage dictionaryPage = null;
  long readValues = 0;
  Statistics statistics = null;
  ParquetMetadataConverter converter = new ParquetMetadataConverter();
  int pageIndex = 0;
  long totalChunkValues = chunk.getValueCount();
  while (readValues < totalChunkValues) {
    PageHeader pageHeader = reader.readPageHeader();
    int compressedPageSize = pageHeader.getCompressed_page_size();
    byte[] pageLoad;
    switch (pageHeader.type) {
      case DICTIONARY_PAGE:
        if (dictionaryPage != null) {
          throw new IOException("has more than one dictionary page in column chunk");
        }
        DictionaryPageHeader dictPageHeader = pageHeader.dictionary_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor,
            pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        writer.writeDictionaryPage(new DictionaryPage(BytesInput.from(pageLoad),
            pageHeader.getUncompressed_page_size(),
            dictPageHeader.getNum_values(),
            converter.getEncoding(dictPageHeader.getEncoding())));
        break;
      case DATA_PAGE:
        DataPageHeader headerV1 = pageHeader.data_page_header;
        pageLoad = translatePageLoad(reader, true, compressor, decompressor,
            pageHeader.getCompressed_page_size(), pageHeader.getUncompressed_page_size());
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV1.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV1.getNum_values();
        if (offsetIndex != null) {
          long rowCount = 1 + offsetIndex.getLastRowIndex(pageIndex, totalChunkValues) - offsetIndex.getFirstRowIndex(pageIndex);
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
              pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad),
              statistics,
              toIntWithCheck(rowCount),
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()));
        } else {
          writer.writeDataPage(toIntWithCheck(headerV1.getNum_values()),
              pageHeader.getUncompressed_page_size(),
              BytesInput.from(pageLoad),
              statistics,
              converter.getEncoding(headerV1.getRepetition_level_encoding()),
              converter.getEncoding(headerV1.getDefinition_level_encoding()),
              converter.getEncoding(headerV1.getEncoding()));
        }
        pageIndex++;
        break;
      case DATA_PAGE_V2:
        DataPageHeaderV2 headerV2 = pageHeader.data_page_header_v2;
        int rlLength = headerV2.getRepetition_levels_byte_length();
        BytesInput rlLevels = readBlockAllocate(rlLength, reader);
        int dlLength = headerV2.getDefinition_levels_byte_length();
        BytesInput dlLevels = readBlockAllocate(dlLength, reader);
        int payLoadLength = pageHeader.getCompressed_page_size() - rlLength - dlLength;
        int rawDataLength = pageHeader.getUncompressed_page_size() - rlLength - dlLength;
        pageLoad = translatePageLoad(reader, headerV2.is_compressed, compressor, decompressor, payLoadLength, rawDataLength);
        statistics = convertStatistics(createdBy, chunk.getPrimitiveType(), headerV2.getStatistics(), columnIndex, pageIndex, converter);
        readValues += headerV2.getNum_values();
        writer.writeDataPageV2(headerV2.getNum_rows(),
            headerV2.getNum_nulls(),
            headerV2.getNum_values(),
            rlLevels,
            dlLevels,
            converter.getEncoding(headerV2.getEncoding()),
            BytesInput.from(pageLoad),
            rawDataLength,
            statistics);
        pageIndex++;
        break;
      default:
        LOG.debug("skipping page of type {} of size {}", pageHeader.getType(), compressedPageSize);
        break;
    }
  }
}
Example #28
Source File: ParquetRecordWriterUtil.java From presto with Apache License 2.0
public static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties, ConnectorSession session)
    throws IOException, ReflectiveOperationException {
  conf.setLong(ParquetOutputFormat.BLOCK_SIZE, getParquetWriterBlockSize(session).toBytes());
  conf.setLong(ParquetOutputFormat.PAGE_SIZE, getParquetWriterPageSize(session).toBytes());

  RecordWriter recordWriter = createParquetWriter(target, conf, properties);

  Object realWriter = REAL_WRITER_FIELD.get(recordWriter);
  Object internalWriter = INTERNAL_WRITER_FIELD.get(realWriter);
  ParquetFileWriter fileWriter = (ParquetFileWriter) FILE_WRITER_FIELD.get(internalWriter);

  return new ExtendedRecordWriter() {
    private long length;

    @Override
    public long getWrittenBytes() {
      return length;
    }

    @Override
    public void write(Writable value) throws IOException {
      recordWriter.write(value);
      length = fileWriter.getPos();
    }

    @Override
    public void close(boolean abort) throws IOException {
      recordWriter.close(abort);
      if (!abort) {
        length = fileWriter.getPos();
      }
    }
  };
}
Example #29
Source File: ConvertCSVCommand.java From parquet-mr with Apache License 2.0
@Override
@SuppressWarnings("unchecked")
public int run() throws IOException {
  Preconditions.checkArgument(targets != null && targets.size() == 1,
      "CSV path is required.");

  if (header != null) {
    // if a header is given on the command line, don't assume one is in the file
    noHeader = true;
  }

  CSVProperties props = new CSVProperties.Builder()
      .delimiter(delimiter)
      .escape(escape)
      .quote(quote)
      .header(header)
      .hasHeader(!noHeader)
      .linesToSkip(linesToSkip)
      .charset(charsetName)
      .build();

  String source = targets.get(0);

  Schema csvSchema;
  if (avroSchemaFile != null) {
    csvSchema = Schemas.fromAvsc(open(avroSchemaFile));
  } else {
    Set<String> required = ImmutableSet.of();
    if (requiredFields != null) {
      required = ImmutableSet.copyOf(requiredFields);
    }

    String filename = new File(source).getName();
    String recordName;
    if (filename.contains(".")) {
      recordName = filename.substring(0, filename.indexOf("."));
    } else {
      recordName = filename;
    }

    csvSchema = AvroCSV.inferNullableSchema(
        recordName, open(source), props, required);
  }

  long count = 0;
  try (AvroCSVReader<Record> reader = new AvroCSVReader<>(
      open(source), props, csvSchema, Record.class, true)) {
    CompressionCodecName codec = Codecs.parquetCodec(compressionCodecName);
    try (ParquetWriter<Record> writer = AvroParquetWriter
        .<Record>builder(qualifiedPath(outputPath))
        .withWriterVersion(v2 ? PARQUET_2_0 : PARQUET_1_0)
        .withWriteMode(overwrite ? ParquetFileWriter.Mode.OVERWRITE : ParquetFileWriter.Mode.CREATE)
        .withCompressionCodec(codec)
        .withDictionaryEncoding(true)
        .withDictionaryPageSize(dictionaryPageSize)
        .withPageSize(pageSize)
        .withRowGroupSize(rowGroupSize)
        .withDataModel(GenericData.get())
        .withConf(getConf())
        .withSchema(csvSchema)
        .build()) {
      for (Record record : reader) {
        writer.write(record);
      }
    } catch (RuntimeException e) {
      throw new RuntimeException("Failed on record " + count, e);
    }
  }

  return 0;
}
Example #30
Source File: ParquetConfig.java From nifi with Apache License 2.0
public ParquetFileWriter.Mode getWriterMode() {
  return writerMode;
}