Java Code Examples for org.apache.parquet.hadoop.metadata.CompressionCodecName#GZIP
The following examples show how to use org.apache.parquet.hadoop.metadata.CompressionCodecName#GZIP. Each example is taken from an open-source project; the originating source file, project, and license are noted above it.
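Before the project-specific examples, the snippet below is a minimal sketch of passing CompressionCodecName.GZIP to a writer through the standard parquet-avro builder API (AvroParquetWriter). The schema, record field, and output path are made up for illustration only; they do not come from any of the examples on this page.

    import org.apache.avro.Schema;
    import org.apache.avro.generic.GenericData;
    import org.apache.avro.generic.GenericRecord;
    import org.apache.hadoop.conf.Configuration;
    import org.apache.hadoop.fs.Path;
    import org.apache.parquet.avro.AvroParquetWriter;
    import org.apache.parquet.hadoop.ParquetWriter;
    import org.apache.parquet.hadoop.metadata.CompressionCodecName;

    public class GzipParquetSketch {
      public static void main(String[] args) throws Exception {
        // Hypothetical one-field Avro schema and output location, used only for illustration.
        Schema schema = new Schema.Parser().parse(
            "{\"type\":\"record\",\"name\":\"Sample\",\"fields\":[{\"name\":\"id\",\"type\":\"string\"}]}");
        Path outputPath = new Path("/tmp/sample-gzip.parquet");

        // The codec set here applies to every column chunk the writer produces.
        try (ParquetWriter<GenericRecord> writer = AvroParquetWriter.<GenericRecord>builder(outputPath)
            .withSchema(schema)
            .withCompressionCodec(CompressionCodecName.GZIP)
            .withConf(new Configuration())
            .build()) {
          GenericRecord record = new GenericData.Record(schema);
          record.put("id", "example-1");
          writer.write(record);
        }
      }
    }

Several of the examples below instead use the older ParquetWriter constructors, which take the codec as a positional argument rather than through the builder.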
Example 1
Source File: HiveTestUtil.java From hudi with Apache License 2.0
@SuppressWarnings({"unchecked", "deprecation"})
private static void generateParquetData(Path filePath, boolean isParquetSchemaSimple)
    throws IOException, URISyntaxException {
  Schema schema = getTestDataSchema(isParquetSchemaSimple);
  org.apache.parquet.schema.MessageType parquetSchema = new AvroSchemaConverter().convert(schema);
  BloomFilter filter = BloomFilterFactory.createBloomFilter(1000, 0.0001, -1,
      BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(parquetSchema, schema, filter);
  ParquetWriter writer = new ParquetWriter(filePath, writeSupport, CompressionCodecName.GZIP,
      120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED, ParquetWriter.DEFAULT_IS_VALIDATING_ENABLED,
      ParquetWriter.DEFAULT_WRITER_VERSION, fileSystem.getConf());
  List<IndexedRecord> testRecords = (isParquetSchemaSimple
      ? SchemaTestUtil.generateTestRecords(0, 100)
      : SchemaTestUtil.generateEvolvedTestRecords(100, 100));
  testRecords.forEach(s -> {
    try {
      writer.write(s);
    } catch (IOException e) {
      fail("IOException while writing test records as parquet" + e.toString());
    }
  });
  writer.close();
}
Example 2
Source File: TestHoodieAvroWriteSupport.java From hudi with Apache License 2.0
@Test
public void testAddKey(@TempDir java.nio.file.Path tempDir) throws IOException {
  List<String> rowKeys = new ArrayList<>();
  for (int i = 0; i < 1000; i++) {
    rowKeys.add(UUID.randomUUID().toString());
  }
  String filePath = tempDir.resolve("test.parquet").toAbsolutePath().toString();
  Schema schema = HoodieAvroUtils.getRecordKeySchema();
  BloomFilter filter = BloomFilterFactory.createBloomFilter(
      1000, 0.0001, 10000, BloomFilterTypeCode.SIMPLE.name());
  HoodieAvroWriteSupport writeSupport = new HoodieAvroWriteSupport(
      new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport,
      CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
Example 3
Source File: TestParquetUtils.java From hudi with Apache License 2.0
private void writeParquetFile(String typeCode, String filePath, List<String> rowKeys, Schema schema,
    boolean addPartitionPathField, String partitionPath) throws Exception {
  // Write out a parquet file
  BloomFilter filter = BloomFilterFactory
      .createBloomFilter(1000, 0.0001, 10000, typeCode);
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  ParquetWriter writer = new ParquetWriter(new Path(filePath), writeSupport,
      CompressionCodecName.GZIP, 120 * 1024 * 1024, ParquetWriter.DEFAULT_PAGE_SIZE);
  for (String rowKey : rowKeys) {
    GenericRecord rec = new GenericData.Record(schema);
    rec.put(HoodieRecord.RECORD_KEY_METADATA_FIELD, rowKey);
    if (addPartitionPathField) {
      rec.put(HoodieRecord.PARTITION_PATH_METADATA_FIELD, partitionPath);
    }
    writer.write(rec);
    writeSupport.add(rowKey);
  }
  writer.close();
}
Example 4
Source File: ParquetAppender.java From kite with Apache License 2.0
private CompressionCodecName getCompressionCodecName() {
  switch (compressionType) {
    case Snappy:
      return CompressionCodecName.SNAPPY;
    case Lzo:
      return CompressionCodecName.LZO;
    case Deflate:
      return CompressionCodecName.GZIP;
    default:
      throw new IllegalArgumentException(String.format(
          "Unsupported compression format %s. Supported formats: %s",
          compressionType.getName(), Arrays.toString(
              Formats.PARQUET.getSupportedCompressionTypes().toArray())));
  }
}
Example 5
Source File: ParquetFileWriterFactory.java From presto with Apache License 2.0
private static CompressionCodecName getCompression(JobConf configuration) {
  String compressionName = configuration.get(ParquetOutputFormat.COMPRESSION);
  if (compressionName == null) {
    return CompressionCodecName.GZIP;
  }
  return CompressionCodecName.valueOf(compressionName);
}
Example 6
Source File: HoodieClientTestUtils.java From hudi with Apache License 2.0
public static String writeParquetFile(String basePath, String partitionPath, String filename,
    List<HoodieRecord> records, Schema schema, BloomFilter filter, boolean createCommitTime)
    throws IOException {
  if (filter == null) {
    filter = BloomFilterFactory
        .createBloomFilter(10000, 0.0000001, -1, BloomFilterTypeCode.SIMPLE.name());
  }
  HoodieAvroWriteSupport writeSupport =
      new HoodieAvroWriteSupport(new AvroSchemaConverter().convert(schema), schema, filter);
  String instantTime = FSUtils.getCommitTime(filename);
  HoodieParquetConfig config = new HoodieParquetConfig(writeSupport, CompressionCodecName.GZIP,
      ParquetWriter.DEFAULT_BLOCK_SIZE, ParquetWriter.DEFAULT_PAGE_SIZE, 120 * 1024 * 1024,
      HoodieTestUtils.getDefaultHadoopConf(),
      Double.valueOf(HoodieStorageConfig.DEFAULT_STREAM_COMPRESSION_RATIO));
  HoodieParquetWriter writer = new HoodieParquetWriter(instantTime,
      new Path(basePath + "/" + partitionPath + "/" + filename), config, schema,
      new SparkTaskContextSupplier());
  int seqId = 1;
  for (HoodieRecord record : records) {
    GenericRecord avroRecord = (GenericRecord) record.getData().getInsertValue(schema).get();
    HoodieAvroUtils.addCommitMetadataToRecord(avroRecord, instantTime, "" + seqId++);
    HoodieAvroUtils.addHoodieKeyToRecord(avroRecord, record.getRecordKey(), record.getPartitionPath(), filename);
    writer.writeAvro(record.getRecordKey(), avroRecord);
    filter.add(record.getRecordKey());
  }
  writer.close();
  if (createCommitTime) {
    HoodieTestUtils.createMetadataFolder(basePath);
    HoodieTestUtils.createCommitFiles(basePath, instantTime);
  }
  return filename;
}
Example 7
Source File: TestParquetInLining.java From hudi with Apache License 2.0
@Test
public void testSimpleInlineFileSystem() throws IOException {
  Path outerInMemFSPath = getRandomOuterInMemPath();
  Path outerPath = new Path(FILE_SCHEME
      + outerInMemFSPath.toString().substring(outerInMemFSPath.toString().indexOf(':')));
  generatedPath = outerPath;
  ParquetWriter inlineWriter = new AvroParquetWriter(outerInMemFSPath, HoodieTestDataGenerator.AVRO_SCHEMA,
      CompressionCodecName.GZIP, 100 * 1024 * 1024, 1024 * 1024, true, inMemoryConf);
  // write few records
  List<GenericRecord> recordsToWrite = getParquetHoodieRecords();
  for (GenericRecord rec : recordsToWrite) {
    inlineWriter.write(rec);
  }
  inlineWriter.close();
  byte[] inlineBytes = getBytesToInline(outerInMemFSPath);
  long startOffset = generateOuterFile(outerPath, inlineBytes);
  long inlineLength = inlineBytes.length;
  // Generate phantom inline file
  Path inlinePath = getPhantomFile(outerPath, startOffset, inlineLength);
  // instantiate Parquet reader
  ParquetReader inLineReader = AvroParquetReader.builder(inlinePath).withConf(inlineConf).build();
  List<GenericRecord> records = readParquetGenericRecords(inLineReader);
  assertArrayEquals(recordsToWrite.toArray(), records.toArray());
  inLineReader.close();
}
Example 8
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0
private ColumnChunkMetaData createColumnChunkMetaData() {
  Set<org.apache.parquet.column.Encoding> e = new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.BINARY;
  ColumnPath p = ColumnPath.get("foo");
  CompressionCodecName c = CompressionCodecName.GZIP;
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md = ColumnChunkMetaData.get(p, t, c, e, s, 0, 0, 0, 0, 0);
  return md;
}
Example 9
Source File: ParquetRecordWriter.java From Bats with Apache License 2.0
@Override
public void init(Map<String, String> writerOptions) throws IOException {
  this.location = writerOptions.get("location");
  this.prefix = writerOptions.get("prefix");

  fs = FileSystem.get(conf);

  blockSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_BLOCK_SIZE));
  pageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_PAGE_SIZE));
  dictionaryPageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_DICT_PAGE_SIZE));

  String codecName = writerOptions.get(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE).toLowerCase();
  switch (codecName) {
    case "snappy":
      codec = CompressionCodecName.SNAPPY;
      break;
    case "lzo":
      codec = CompressionCodecName.LZO;
      break;
    case "gzip":
      codec = CompressionCodecName.GZIP;
      break;
    case "none":
    case "uncompressed":
      codec = CompressionCodecName.UNCOMPRESSED;
      break;
    default:
      throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  String logicalTypeNameForDecimals =
      writerOptions.get(ExecConstants.PARQUET_WRITER_LOGICAL_TYPE_FOR_DECIMALS).toLowerCase();
  switch (logicalTypeNameForDecimals) {
    case "fixed_len_byte_array":
      logicalTypeForDecimals = PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
      break;
    case "binary":
      logicalTypeForDecimals = PrimitiveTypeName.BINARY;
      break;
    default:
      throw new UnsupportedOperationException(String.format(
          "Unsupported logical type for decimals: %s\n"
              + "Supported types: ['fixed_len_byte_array', 'binary']", logicalTypeNameForDecimals));
  }

  enableDictionary = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING));
  useSingleFSBlock = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_SINGLE_FS_BLOCK));
  usePrimitiveTypesForDecimals =
      Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_PRIMITIVE_TYPES_FOR_DECIMALS));

  if (useSingleFSBlock) {
    // Round up blockSize to multiple of 64K.
    blockSize = (int) ceil((double) blockSize / BLOCKSIZE_MULTIPLE) * BLOCKSIZE_MULTIPLE;
  }
}
Example 10
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0
public ParquetRecordWriter(OperatorContext context, ParquetWriter writer, ParquetFormatConfig config)
    throws OutOfMemoryException {
  this.context = context;
  this.codecAllocator = context.getAllocator().newChildAllocator("ParquetCodecFactory", 0, Long.MAX_VALUE);
  this.columnEncoderAllocator = context.getAllocator().newChildAllocator("ParquetColEncoder", 0, Long.MAX_VALUE);
  this.codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(),
      new ParquetDirectByteBufferAllocator(codecAllocator), pageSize);
  this.extraMetaData.put(DREMIO_VERSION_PROPERTY, DremioVersionInfo.getVersion());
  this.extraMetaData.put(IS_DATE_CORRECT_PROPERTY, "true");
  this.plugin = writer.getFormatPlugin().getFsPlugin();
  this.queryUser = writer.getProps().getUserName();

  FragmentHandle handle = context.getFragmentHandle();
  String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());

  this.location = writer.getLocation();
  this.prefix = fragmentId;
  this.extension = config.outputExtension;
  if (writer.getOptions() != null) {
    this.partitionColumns = writer.getOptions().getPartitionColumns();
    this.isIcebergWriter =
        (writer.getOptions().getIcebergWriterOperation() != WriterOptions.IcebergWriterOperation.NONE);
  } else {
    this.partitionColumns = null;
    this.isIcebergWriter = false;
  }

  if (this.isIcebergWriter && writer.getOptions().getExtendedProperty() != null) {
    initIcebergColumnIDList(writer.getOptions().getExtendedProperty());
  }

  memoryThreshold = (int) context.getOptions().getOption(ExecConstants.PARQUET_MEMORY_THRESHOLD_VALIDATOR);
  blockSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE_VALIDATOR);
  pageSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE_VALIDATOR);
  final String codecName =
      context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE_VALIDATOR).toLowerCase();
  switch (codecName) {
    case "snappy":
      codec = CompressionCodecName.SNAPPY;
      break;
    case "lzo":
      codec = CompressionCodecName.LZO;
      break;
    case "gzip":
      codec = CompressionCodecName.GZIP;
      break;
    case "none":
    case "uncompressed":
      codec = CompressionCodecName.UNCOMPRESSED;
      break;
    default:
      throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  enableDictionary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_VALIDATOR);
  enableDictionaryForBinary =
      context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_BINARY_TYPE_VALIDATOR);
  maxPartitions = context.getOptions().getOption(ExecConstants.PARQUET_MAXIMUM_PARTITIONS_VALIDATOR);
  minRecordsForFlush = context.getOptions().getOption(ExecConstants.PARQUET_MIN_RECORDS_FOR_FLUSH_VALIDATOR);
  parquetFileWriteTimeThresholdMilliSecs =
      (int) context.getOptions().getOption(ExecConstants.PARQUET_WRITE_TIME_THRESHOLD_MILLI_SECS_VALIDATOR);
  parquetFileWriteIoRateThresholdMbps =
      context.getOptions().getOption(ExecConstants.PARQUET_WRITE_IO_RATE_THRESHOLD_MBPS_VALIDATOR);
}