Java Code Examples for org.apache.parquet.hadoop.ParquetWriter#Builder
The following examples show how to use org.apache.parquet.hadoop.ParquetWriter#Builder.
You can go to the original project or source file by following the links above each example.
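Before diving into the examples, here is a minimal, self-contained sketch of the typical Builder flow, based on parquet-avro's AvroParquetWriter (a concrete ParquetWriter.Builder subclass). The class name BuilderSketch, the target path, and the record source are illustrative assumptions, not code from any project below.

import java.io.IOException;
import org.apache.avro.Schema;
import org.apache.avro.generic.GenericRecord;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.avro.AvroParquetWriter;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;

public class BuilderSketch {
  // Hypothetical helper: writes Avro records through a ParquetWriter
  // obtained from AvroParquetWriter's concrete Builder subclass.
  public static void write(Path target, Schema schema, Iterable<GenericRecord> records) throws IOException {
    try (ParquetWriter<GenericRecord> writer = AvroParquetWriter
        .<GenericRecord>builder(target)                    // concrete subclass supplies the WriteSupport
        .withSchema(schema)                                // Avro schema, converted to a Parquet schema
        .withCompressionCodec(CompressionCodecName.SNAPPY) // optional tuning, as in the examples below
        .build()) {
      for (GenericRecord record : records) {
        writer.write(record);
      }
    }
  }
}

All of the examples that follow vary this same pattern: obtain a concrete builder, apply configuration, then call build().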
Example 1
Source File: AvroParquetConvertMapper.java From datacollector with Apache License 2.0
@Override
protected void initializeWriter(
    Path tempFile,
    Schema avroSchema,
    Configuration conf,
    Context context
) throws IOException {
  ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(tempFile, avroSchema, conf);

  // Parquet writer
  parquetWriter = builder
      .withConf(context.getConfiguration())
      .build();
}
Example 2
Source File: AvroToParquetConverterUtil.java From datacollector with Apache License 2.0
private static ParquetWriter.Builder getParquetWriterBuilder(Path tempFile, Schema avroSchema, Configuration conf) {
  // Parquet Avro pre-1.9 doesn't work with logical types, so in that case we use a custom Builder that injects our
  // own avro schema -> parquet schema generator class (which is a copy of the one provided in PARQUET-358).
  // Additionally, Parquet Avro 1.9.x does not support converting from Avro timestamps (logical types TIMESTAMP_MILLIS
  // and TIMESTAMP_MICROS), so we have to extend the Parquet Avro classes to support timestamp conversion.
  ParquetWriter.Builder builder = null;
  try {
    SemanticVersion parquetVersion = SemanticVersion.parse(Version.VERSION_NUMBER);
    if (parquetVersion.major > 1 || (parquetVersion.major == 1 && parquetVersion.minor >= 9)) {
      if (parquetVersion.major == 1 && parquetVersion.minor >= 9) {
        LOG.debug("Creating AvroParquetWriterBuilder190Int96");
        if (propertyDefined(conf, AvroParquetConstants.TIMEZONE)) {
          String timeZoneId = conf.get(AvroParquetConstants.TIMEZONE);
          builder = new AvroParquetWriterBuilder190Int96(tempFile, timeZoneId).withSchema(avroSchema);
        } else {
          builder = new AvroParquetWriterBuilder190Int96(tempFile).withSchema(avroSchema);
        }
      } else {
        LOG.debug("Creating AvroParquetWriter.builder");
        builder = AvroParquetWriter.builder(tempFile).withSchema(avroSchema);
      }
    } else {
      LOG.debug("Creating AvroParquetWriterBuilder");
      builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
    }
  } catch (SemanticVersion.SemanticVersionParseException e) {
    LOG.warn("Can't parse parquet version string: " + Version.VERSION_NUMBER, e);
    builder = new AvroParquetWriterBuilder(tempFile).withSchema(avroSchema);
  }
  return builder;
}
Example 3
Source File: PhoneBookWriter.java From parquet-mr with Apache License 2.0
public static void write(ParquetWriter.Builder<Group, ?> builder, List<User> users) throws IOException {
  builder.config(GroupWriteSupport.PARQUET_EXAMPLE_SCHEMA, schema.toString());
  try (ParquetWriter<Group> writer = builder.build()) {
    for (User u : users) {
      writer.write(groupFromUser(u));
    }
  }
}
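For context, a caller could hand this helper a Group-based builder such as the one from parquet-mr's ExampleParquetWriter; the output path in this sketch is an assumption, not part of the original file.

import java.io.IOException;
import java.util.List;
import org.apache.hadoop.fs.Path;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;

public class PhoneBookWriterDemo {
  // Hypothetical call site; User is assumed to be PhoneBookWriter's data class
  // and the output path is illustrative.
  public static void writePhoneBook(List<PhoneBookWriter.User> users) throws IOException {
    PhoneBookWriter.write(
        ExampleParquetWriter.builder(new Path("/tmp/phonebook.parquet")),
        users);
  }
}

Note that the helper itself injects the schema through builder.config(...) before calling build(), so the caller does not need to configure GroupWriteSupport.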
Example 4
Source File: WholeFileTransformerProcessor.java From datacollector with Apache License 2.0
/**
 * Convert Avro record to Parquet
 * @param sourceFileName the source Avro file name
 * @param fileReader the {@link org.apache.avro.file.DataFileStream} Avro file reader
 * @param tempParquetFile the {@link java.nio.file.Path} temporary parquet file path
 */
private void writeParquet(String sourceFileName, DataFileStream<GenericRecord> fileReader, Path tempParquetFile) throws StageException {
  long recordCount = 0;
  GenericRecord avroRecord;
  Schema schema = fileReader.getSchema();
  LOG.debug("Start reading input file : {}", sourceFileName);
  try {
    // initialize parquet writer
    Configuration jobConfiguration = new Configuration();
    String compressionCodecName = compressionElEval.eval(variables, jobConfig.avroParquetConfig.compressionCodec, String.class);
    jobConfiguration.set(AvroParquetConstants.COMPRESSION_CODEC_NAME, compressionCodecName);
    jobConfiguration.setInt(AvroParquetConstants.ROW_GROUP_SIZE, jobConfig.avroParquetConfig.rowGroupSize);
    jobConfiguration.setInt(AvroParquetConstants.PAGE_SIZE, jobConfig.avroParquetConfig.pageSize);
    jobConfiguration.setInt(AvroParquetConstants.DICTIONARY_PAGE_SIZE, jobConfig.avroParquetConfig.dictionaryPageSize);
    jobConfiguration.setInt(AvroParquetConstants.MAX_PADDING_SIZE, jobConfig.avroParquetConfig.maxPaddingSize);

    // Parquet writer
    ParquetWriter.Builder builder = AvroToParquetConverterUtil.initializeWriter(
        new org.apache.hadoop.fs.Path(tempParquetFile.toString()),
        schema,
        jobConfiguration
    );
    parquetWriter = builder.build();

    while (fileReader.hasNext()) {
      avroRecord = fileReader.next();
      parquetWriter.write(avroRecord);
      recordCount++;
    }
    parquetWriter.close();
  } catch (IOException ex) {
    throw new TransformerStageCheckedException(Errors.CONVERT_08, sourceFileName, recordCount, ex);
  }
  LOG.debug("Finished writing {} records to {}", recordCount, tempParquetFile.getFileName());
}
Example 5
Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  return builder
      .withPageSize(Integer.MAX_VALUE) // Ensure that only the row count limit is taken into account
      .withPageRowCountLimit(1_000);
}
Example 6
Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  return builder
      .withPageSize(Integer.MAX_VALUE) // Ensure that only the row count limit is taken into account
      .withPageRowCountLimit(10_000);
}
Example 7
Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  return builder
      .withPageSize(Integer.MAX_VALUE) // Ensure that only the row count limit is taken into account
      .withPageRowCountLimit(50_000);
}
Example 8
Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0
@Override
public <T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder) {
  return builder
      .withPageSize(Integer.MAX_VALUE) // Ensure that only the row count limit is taken into account
      .withPageRowCountLimit(100_000);
}
Example 9
Source File: ParquetUtils.java From nifi with Apache License 2.0
public static void applyCommonConfig(final ParquetWriter.Builder<?, ?> builder,
                                     final Configuration conf,
                                     final ParquetConfig parquetConfig) {
  builder.withConf(conf);
  builder.withCompressionCodec(parquetConfig.getCompressionCodec());

  // Optional properties
  if (parquetConfig.getRowGroupSize() != null) {
    builder.withRowGroupSize(parquetConfig.getRowGroupSize());
  }
  if (parquetConfig.getPageSize() != null) {
    builder.withPageSize(parquetConfig.getPageSize());
  }
  if (parquetConfig.getDictionaryPageSize() != null) {
    builder.withDictionaryPageSize(parquetConfig.getDictionaryPageSize());
  }
  if (parquetConfig.getMaxPaddingSize() != null) {
    builder.withMaxPaddingSize(parquetConfig.getMaxPaddingSize());
  }
  if (parquetConfig.getEnableDictionaryEncoding() != null) {
    builder.withDictionaryEncoding(parquetConfig.getEnableDictionaryEncoding());
  }
  if (parquetConfig.getEnableValidation() != null) {
    builder.withValidation(parquetConfig.getEnableValidation());
  }
  if (parquetConfig.getWriterVersion() != null) {
    builder.withWriterVersion(parquetConfig.getWriterVersion());
  }
  if (parquetConfig.getWriterMode() != null) {
    builder.withWriteMode(parquetConfig.getWriterMode());
  }

  applyCommonConfig(conf, parquetConfig);
}
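Note the design here: withConf and withCompressionCodec are always applied, while every other setting is guarded by a null check, so only properties explicitly present in ParquetConfig override the builder's built-in defaults.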
Example 10
Source File: FilteringBenchmarks.java From parquet-mr with Apache License 2.0
<T> ParquetWriter.Builder<T, ?> configureBuilder(ParquetWriter.Builder<T, ?> builder);
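This abstract hook is the counterpart of Examples 5 through 8 above, which override it to pin the page size at Integer.MAX_VALUE and vary only the page row count limit.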