Java Code Examples for org.apache.parquet.hadoop.metadata.CompressionCodecName#GZIP
The following examples show how to use org.apache.parquet.hadoop.metadata.CompressionCodecName#GZIP. Each example is taken from an open-source project; the originating source file, project, and license are noted above it.
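Before the project-specific examples, the snippet below is a minimal sketch of passing CompressionCodecName.GZIP to a writer through the standard parquet-avro builder API (AvroParquetWriter). The schema, record field, and output path are made up for illustration only; they do not come from any of the examples on this page.

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.avro.AvroParquetWriter;
    import org.apache.parquet.hadoop.ParquetWriter;
    import org.apache.parquet.hadoop.metadata.CompressionCodecName;

    public class GzipParquetSketch {
      public static void main(String[] args) throws Exception {
        // Hypothetical one-field Avro schema and output location, used only for illustration.
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Sample\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}");
        Path outputPath = new Path("/tmp/sample-gzip.parquet");

        // The codec set here applies to every column chunk the writer produces.
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(outputPath)
            .withSchema(schema)
            .withCompressionCodec(CompressionCodecName.GZIP)
            .withConf(new Configuration())
            .build()) {
          GenericRecord record = new GenericData.Record(schema);
          record.put("id", "example-1");
          writer.write(record);
        }
      }
    }

Several of the examples below instead use the older ParquetWriter constructors, which take the codec as a positional argument rather than through the builder.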
Example 1
Source File: HiveTestUtil.java From hudi with Apache License 2.0
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
    throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
  ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
  List<IndexedRecord> testRecords = (isParquetSchemaSimple
      ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
  testRecords.forEach(s -> {
    try {
      writer.write(s);
    } catch (IOException e) {
      fail("IOException while writing test records as parquet" + e.toString());
    }
  });
  writer.close();
}
Example 2
Source File: TestHoodieAvroWriteSupport.java From hudi with Apache License 2.0
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000, BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport,
      CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
Example 3
Source File: TestParquetUtils.java From hudi with Apache License 2.0
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema,
    boolean addPartitionPathField, String partitionPath) throws Exception {
  // Write out a parquet file
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport,
      CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    if (addPartitionPathField) {
      rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
    }
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
Example 4
Source File: ParquetAppender.java From kite with Apache License 2.0
private CompressionCodecName getCompressionCodecName() {
  switch (compressionType) {
    case Snappy:
      return CompressionCodecName.SNAPPY;
    case Lzo:
      return CompressionCodecName.LZO;
    case Deflate:
      return CompressionCodecName.GZIP;
    default:
      throw new IllegalArgumentException(String.format(
          "Unsupported compression format %s. Supported formats: %s",
          compressionType.getName(), Arrays.toString(
              Formats.PARQUET.getSupportedCompressionTypes().toArray())));
  }
}
Example 5
Source File: ParquetFileWriterFactory.java From presto with Apache License 2.0
private static CompressionCodecName getCompression(JobConf configuration) {
  String compressionName = configuration.get(ParquetOutputFormat.COMPRESSION);
  if (compressionName == null) {
    return CompressionCodecName.GZIP;
  }
  return CompressionCodecName.valueOf(compressionName);
}
Example 6
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0
public static String writeParquetFile(String basePath, String partitionPath, String filename,
    List<HoodieRecord> records, Schema schema, BloomFilter filter, boolean createCommitTime)
    throws IOException {
  if (filter == null) {
    filter = BloomFilterFactory
        .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
  }
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  String instantTime = FSUtils.getCommitTime(filename);
  HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
      ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
      HoodieTestUtils.getDefaultHadoopConf(),
      Double.valueOf(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO));
  HoodieParquetWriter writer = new HoodieParquetWriter(instantTime,
      new Path(basePath + "/" + partitionPath + "/" + filename), config, schema,
      new SparkTaskContextSupplier());
  int seqId = 1;
  for (HoodieRecord record : records) {
    GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
    HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, instantTime, "" + seqId++);
    HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
    writer.writeAvro(record.getRecordKey(), avroRecord);
    filter.add(record.getRecordKey());
  }
  writer.close();
  if (createCommitTime) {
    HoodieTestUtils.createMetadataFolder(basePath);
    HoodieTestUtils.createCommitFiles(basePath, instantTime);
  }
  return filename;
}
Example 7
Source File: TestParquetInLining.java From hudi with Apache License 2.0
@Test
public void testSimpleInlineFileSystem() throws IOException {
  Path outerInMemFSPath = getRandomOuterInMemPath();
  Path outerPath = new Path(FILE_SCHEME
      + outerInMemFSPath.toString().substring(outerInMemFSPath.toString().indexOf(':')));
  generatedPath = outerPath;
  ParquetWriter inlineWriter = new AvroParquetWriter(outerInMemFSPath, HoodieTestDataGenerator.AVRO_SCHEMA,
      CompressionCodecName.GZIP, 100 * 1024 * 1024, 1024 * 1024, true, inMemoryConf);
  // write few records
  List<GenericRecord> recordsToWrite = getParquetHoodieRecords();
  for (GenericRecord rec : recordsToWrite) {
    inlineWriter.write(rec);
  }
  inlineWriter.close();
  byte[] inlineBytes = getBytesToInline(outerInMemFSPath);
  long startOffset = generateOuterFile(outerPath, inlineBytes);
  long inlineLength = inlineBytes.length;
  // Generate phantom inline file
  Path inlinePath = getPhantomFile(outerPath, startOffset, inlineLength);
  // instantiate Parquet reader
  ParquetReader inLineReader = AvroParquetReader.builder(inlinePath).withConf(inlineConf).build();
  List<GenericRecord> records = readParquetGenericRecords(inLineReader);
  assertArrayEquals(recordsToWrite.toArray(), records.toArray());
  inLineReader.close();
}
Example 8
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s, 0, 0, 0, 0, 0);
  return md;
}
Example 9
Source File: ParquetRecordWriter.java From Bats with Apache License 2.0
@Override
public void init(Map<String, String> writerOptions) throws IOException {
  this.location = writerOptions.get("location");
  this.prefix = writerOptions.get("prefix");

  fs = FileSystem.get(conf);

  blockSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_BLOCK_SIZE));
  pageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_PAGE_SIZE));
  dictionaryPageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_DICT_PAGE_SIZE));

  String codecName = writerOptions.get(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE).toLowerCase();
  switch (codecName) {
    case "snappy":
      codec = CompressionCodecName.SNAPPY;
      break;
    case "lzo":
      codec = CompressionCodecName.LZO;
      break;
    case "gzip":
      codec = CompressionCodecName.GZIP;
      break;
    case "none":
    case "uncompressed":
      codec = CompressionCodecName.UNCOMPRESSED;
      break;
    default:
      throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  String logicalTypeNameForDecimals =
      writerOptions.get(ExecConstants.PARQUET_WRITER_LOGICAL_TYPE_FOR_DECIMALS).toLowerCase();
  switch (logicalTypeNameForDecimals) {
    case "fixed_len_byte_array":
      logicalTypeForDecimals = PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
      break;
    case "binary":
      logicalTypeForDecimals = PrimitiveTypeName.BINARY;
      break;
    default:
      throw new UnsupportedOperationException(String.format(
          "Unsupported logical type for decimals: %s\n"
              + "Supported types: ['fixed_len_byte_array', 'binary']", logicalTypeNameForDecimals));
  }

  enableDictionary = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING));
  useSingleFSBlock = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_SINGLE_FS_BLOCK));
  usePrimitiveTypesForDecimals =
      Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_PRIMITIVE_TYPES_FOR_DECIMALS));

  if (useSingleFSBlock) {
    // Round up blockSize to multiple of 64K.
    blockSize = (int) ceil((double) blockSize / BLOCKSIZE_MULTIPLE) * BLOCKSIZE_MULTIPLE;
  }
}
Example 10
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0
public ParquetRecordWriter(OperatorContext context, ParquetWriter writer, ParquetFormatConfig config)
    throws OutOfMemoryException {
  this.context = context;
  this.codecAllocator = context.getAllocator().newChildAllocator("ParquetCodecFactory", 0, Long.MAX_VALUE);
  this.columnEncoderAllocator = context.getAllocator().newChildAllocator("ParquetColEncoder", 0, Long.MAX_VALUE);
  this.codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(),
      new ParquetDirectByteBufferAllocator(codecAllocator), pageSize);
  this.extraMetaData.put(DREMIO_VERSION_PROPERTY, DremioVersionInfo.getVersion());
  this.extraMetaData.put(IS_DATE_CORRECT_PROPERTY, "true");
  this.plugin = writer.getFormatPlugin().getFsPlugin();
  this.queryUser = writer.getProps().getUserName();

  FragmentHandle handle = context.getFragmentHandle();
  String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());

  this.location = writer.getLocation();
  this.prefix = fragmentId;
  this.extension = config.outputExtension;
  if (writer.getOptions() != null) {
    this.partitionColumns = writer.getOptions().getPartitionColumns();
    this.isIcebergWriter =
        (writer.getOptions().getIcebergWriterOperation() != WriterOptions.IcebergWriterOperation.NONE);
  } else {
    this.partitionColumns = null;
    this.isIcebergWriter = false;
  }

  if (this.isIcebergWriter && writer.getOptions().getExtendedProperty() != null) {
    initIcebergColumnIDList(writer.getOptions().getExtendedProperty());
  }

  memoryThreshold = (int) context.getOptions().getOption(ExecConstants.PARQUET_MEMORY_THRESHOLD_VALIDATOR);
  blockSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE_VALIDATOR);
  pageSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE_VALIDATOR);
  final String codecName =
      context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE_VALIDATOR).toLowerCase();
  switch (codecName) {
    case "snappy":
      codec = CompressionCodecName.SNAPPY;
      break;
    case "lzo":
      codec = CompressionCodecName.LZO;
      break;
    case "gzip":
      codec = CompressionCodecName.GZIP;
      break;
    case "none":
    case "uncompressed":
      codec = CompressionCodecName.UNCOMPRESSED;
      break;
    default:
      throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  enableDictionary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_VALIDATOR);
  enableDictionaryForBinary =
      context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_BINARY_TYPE_VALIDATOR);
  maxPartitions = context.getOptions().getOption(ExecConstants.PARQUET_MAXIMUM_PARTITIONS_VALIDATOR);
  minRecordsForFlush = context.getOptions().getOption(ExecConstants.PARQUET_MIN_RECORDS_FOR_FLUSH_VALIDATOR);
  parquetFileWriteTimeThresholdMilliSecs =
      (int) context.getOptions().getOption(ExecConstants.PARQUET_WRITE_TIME_THRESHOLD_MILLI_SECS_VALIDATOR);
  parquetFileWriteIoRateThresholdMbps =
      context.getOptions().getOption(ExecConstants.PARQUET_WRITE_IO_RATE_THRESHOLD_MBPS_VALIDATOR);
}