org.apache.parquet.hadoop.ParquetOutputFormat Java Examples
The following examples show how to use
org.apache.parquet.hadoop.ParquetOutputFormat.
Each example is taken from an open-source project; the originating project, source file, and license are noted above each snippet.
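Before the project examples, here is a minimal sketch (not taken from any of the projects below) of how ParquetOutputFormat is typically wired into a MapReduce job. The schema string, output path, and the class name ParquetJobSetup are placeholder assumptions; GroupWriteSupport is the example write support bundled with parquet-mr, and the tuning calls mirror the setters used throughout the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetJobSetup {
  public static Job configureJob(Configuration conf) throws Exception {
    // Register the schema used by GroupWriteSupport (placeholder schema).
    GroupWriteSupport.setSchema(
        MessageTypeParser.parseMessageType("message example { required binary name; }"), conf);

    Job job = Job.getInstance(conf, "parquet-output-example");
    job.setOutputFormatClass(ParquetOutputFormat.class);

    // Tell ParquetOutputFormat how to turn records into Parquet pages.
    ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);

    // Common tuning knobs, all optional; defaults come from ParquetOutputFormat.
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024); // row group size in bytes
    ParquetOutputFormat.setPageSize(job, 1024 * 1024);        // page size in bytes
    ParquetOutputFormat.setEnableDictionary(job, true);

    FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out")); // placeholder path
    return job;
  }
}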
Example #1
Source File: CompressionConfigUtil.java From presto with Apache License 2.0 | 6 votes |
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec)
{
    boolean compression = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compression);
    config.setBoolean("mapred.output.compress", compression);
    config.setBoolean(FileOutputFormat.COMPRESS, compression);

    // For ORC
    OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name());

    // For RCFile and Text
    if (compressionCodec.getCodec().isPresent()) {
        config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
        config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
    }
    else {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }

    // For Parquet
    config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name());

    // For SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
Example #2
Source File: TestColumnIndexes.java From parquet-mr with Apache License 2.0 | 6 votes |
public Path write(Path directory) throws IOException {
  Path file = new Path(directory, "testColumnIndexes_" + this + ".parquet");
  Random random = new Random(seed);
  int recordCount = random.nextInt(MAX_TOTAL_ROWS) + 1;
  List<Supplier<?>> generators = buildGenerators(recordCount, random);
  Configuration conf = new Configuration();
  ParquetOutputFormat.setColumnIndexTruncateLength(conf, columnIndexTruncateLength);
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withType(SCHEMA)
      .withPageRowCountLimit(pageRowCountLimit)
      .withConf(conf)
      .build()) {
    for (int i = 0; i < recordCount; i++) {
      writer.write(createGroup(generators, random));
    }
  }
  return file;
}
Example #3
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 6 votes |
private static CompressionCodecName getCodec(JobConf conf) {
    CompressionCodecName codec;

    if (ParquetOutputFormat.isCompressionSet(conf)) { // explicit parquet config
        codec = ParquetOutputFormat.getCompression(conf);
    } else if (getCompressOutput(conf)) { // from hadoop config
        // find the right codec
        Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
        LOG.info("Compression set through hadoop codec: " + codecClass.getName());
        codec = CompressionCodecName.fromCompressionCodec(codecClass);
    } else {
        codec = CompressionCodecName.UNCOMPRESSED;
    }

    LOG.info("Compression: " + codec.name());
    return codec;
}
Example #4
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
public PentahoApacheOutputFormat( NamedCluster namedCluster ) {
  logger.info( "We are initializing parquet output format" );

  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();

    if ( namedCluster != null ) {
      // if named cluster is not defined, no need to add cluster resource configs
      BiConsumer<InputStream, String> consumer = ( is, filename ) -> conf.addResource( is, filename );
      ShimConfigsLoader.addConfigsAsResources( namedCluster, consumer );
    }

    job = Job.getInstance( conf );

    job.getConfiguration().set( ParquetOutputFormat.ENABLE_JOB_SUMMARY, "false" );
    ParquetOutputFormat.setEnableDictionary( job, false );
  } );
}
Example #5
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
@Override
public void setVersion( VERSION version ) throws Exception {
  inClassloader( () -> {
    ParquetProperties.WriterVersion writerVersion;
    switch ( version ) {
      case VERSION_1_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_1_0;
        break;
      case VERSION_2_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
      default:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
    }
    job.getConfiguration().set( ParquetOutputFormat.WRITER_VERSION, writerVersion.toString() );
  } );
}
Example #6
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    CompressionCodecName codec;
    switch ( comp ) {
      case SNAPPY:
        codec = CompressionCodecName.SNAPPY;
        break;
      case GZIP:
        codec = CompressionCodecName.GZIP;
        break;
      case LZO:
        codec = CompressionCodecName.LZO;
        break;
      default:
        codec = CompressionCodecName.UNCOMPRESSED;
        break;
    }
    ParquetOutputFormat.setCompression( job, codec );
  } );
}
Example #7
Source File: IcebergFileWriterFactory.java From presto with Apache License 2.0 | 5 votes |
private IcebergFileWriter createParquetWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session)
{
    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(joining(",")));
    properties.setProperty(IOConstants.COLUMNS_TYPES, columns.stream()
            .map(column -> toHiveType(column.getType()).getHiveTypeName().toString())
            .collect(joining(":")));

    setParquetSchema(jobConf, convert(icebergSchema, "table"));
    jobConf.set(ParquetOutputFormat.COMPRESSION, getCompressionCodec(session).getParquetCompressionCodec().name());

    return new IcebergRecordFileWriter(
            outputPath,
            columns.stream()
                    .map(IcebergColumnHandle::getName)
                    .collect(toImmutableList()),
            fromHiveStorageFormat(HiveStorageFormat.PARQUET),
            properties,
            HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
            jobConf,
            typeManager,
            session);
}
Example #8
Source File: MapredParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper(
    ParquetOutputFormat<ArrayWritable> realOutputFormat,
    JobConf jobConf,
    String finalOutPath,
    Progressable progress
    ) throws IOException {
  return new ParquetRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress);
}
Example #9
Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0 | 5 votes |
@SuppressWarnings("rawtypes") @Override public void sinkConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf); jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema); ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class); }
Example #10
Source File: ParquetScroogeScheme.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override
public void sinkConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  ParquetOutputFormat.setWriteSupportClass(jobConf, ScroogeWriteSupport.class);
  ScroogeWriteSupport.setScroogeClass(jobConf, this.config.getKlass());
}
Example #11
Source File: CodecConfigTest.java From parquet-mr with Apache License 2.0 | 5 votes |
public void shouldUseParquetFlagToSetCodec(String codecNameStr, CompressionCodecName expectedCodec) throws IOException {
  //Test mapreduce API
  Job job = new Job();
  Configuration conf = job.getConfiguration();
  conf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
  TaskAttemptContext task = ContextUtil.newTaskAttemptContext(conf,
      new TaskAttemptID(new TaskID(new JobID("test", 1), false, 1), 1));
  Assert.assertEquals(CodecConfig.from(task).getCodec(), expectedCodec);

  //Test mapred API
  JobConf jobConf = new JobConf();
  jobConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
  Assert.assertEquals(CodecConfig.from(jobConf).getCodec(), expectedCodec);
}
Example #12
Source File: DeprecatedParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat,
    FileSystem fs, JobConf conf, String name, Progressable progress) throws IOException {
  CompressionCodecName codec = getCodec(conf);
  String extension = codec.getExtension() + ".parquet";
  Path file = getDefaultWorkFile(conf, name, extension);

  try {
    realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
Example #13
Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0 | 5 votes |
@SuppressWarnings("rawtypes") @Override public void sinkConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf); jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema); ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class); }
Example #14
Source File: ParquetRowDataWriterTest.java From flink with Apache License 2.0 | 5 votes |
@Test
public void testCompression() throws IOException {
  Configuration conf = new Configuration();
  conf.set(ParquetOutputFormat.COMPRESSION, "GZIP");
  innerTest(conf, true);
  innerTest(conf, false);
}
Example #15
Source File: ParquetAppender.java From tajo with Apache License 2.0 | 5 votes |
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param workDir The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta,
                       Path workDir) throws IOException {
  super(conf, taskAttemptId, schema, meta, workDir);
  this.blockSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.BLOCK_SIZE, StorageConstants.PARQUET_DEFAULT_BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.PAGE_SIZE, StorageConstants.PARQUET_DEFAULT_PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getProperty(ParquetOutputFormat.COMPRESSION, StorageConstants.PARQUET_DEFAULT_COMPRESSION_CODEC_NAME));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.ENABLE_DICTIONARY, StorageConstants.PARQUET_DEFAULT_IS_DICTIONARY_ENABLED));
  this.validating = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.VALIDATION, StorageConstants.PARQUET_DEFAULT_IS_VALIDATION_ENABLED));
}
Example #16
Source File: ParquetHdfsFileSink.java From components with Apache License 2.0 | 5 votes |
@Override
protected void configure(Job job, KV<Void, IndexedRecord> sample) {
    super.configure(job, sample);
    IndexedRecord record = (IndexedRecord) sample.getValue();
    AvroWriteSupport.setSchema(job.getConfiguration(), record.getSchema());
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
}
Example #17
Source File: ParquetFileWriterFactory.java From presto with Apache License 2.0 | 5 votes |
private static CompressionCodecName getCompression(JobConf configuration)
{
    String compressionName = configuration.get(ParquetOutputFormat.COMPRESSION);
    if (compressionName == null) {
        return CompressionCodecName.GZIP;
    }
    return CompressionCodecName.valueOf(compressionName);
}
Example #18
Source File: ParquetRecordWriterUtil.java From presto with Apache License 2.0 | 5 votes |
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    if (conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null) {
        List<String> columnNames = Splitter.on(',').splitToList(properties.getProperty(IOConstants.COLUMNS));
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(properties.getProperty(IOConstants.COLUMNS_TYPES));

        MessageType schema = HiveSchemaConverter.convert(columnNames, columnTypes);
        setParquetSchema(conf, schema);
    }

    ParquetOutputFormat<ParquetHiveRecord> outputFormat = new ParquetOutputFormat<>(new DataWritableWriteSupport());

    return new ParquetRecordWriterWrapper(outputFormat, conf, target.toString(), Reporter.NULL, properties);
}
Example #19
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void enableDictionary( boolean useDictionary ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setEnableDictionary( job, useDictionary ) );
}
Example #20
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void setRowGroupSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setBlockSize( job, size ) );
}
Example #21
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setEnableDictionary(Configuration configuration, boolean enableDictionary) {
    configuration.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, enableDictionary);
}
Example #22
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setCompression(Configuration configuration, CompressionCodecName compression) {
    configuration.set(ParquetOutputFormat.COMPRESSION, compression.name());
}
Example #23
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void setDataPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setPageSize( job, size ) );
}
Example #24
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setPageSize(Configuration configuration, int pageSize) {
    configuration.setInt(ParquetOutputFormat.PAGE_SIZE, pageSize);
}
Example #25
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setBlockSize(Configuration configuration, int blockSize) {
    configuration.setInt(ParquetOutputFormat.BLOCK_SIZE, blockSize);
}
Example #26
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setWriteSupportClass(Configuration configuration, Class<?> writeSupportClass) {
    configuration.set(ParquetOutputFormat.WRITE_SUPPORT_CLASS, writeSupportClass.getName());
}
Example #27
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void setDictionaryPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setDictionaryPageSize( job, size ) );
}
Example #28
Source File: ParquetStorer.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * {@inheritDoc}
 */
@Override
public OutputFormat<Void, Tuple> getOutputFormat() throws IOException {
  Schema pigSchema = getSchema();
  return new ParquetOutputFormat<Tuple>(new TupleWriteSupport(pigSchema));
}
Example #29
Source File: TestMapredParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Test public void testConstructorWithFormat() { new MapredParquetOutputFormat((ParquetOutputFormat<ArrayWritable>) mock(ParquetOutputFormat.class)); }
Example #30
Source File: MapredParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
public MapredParquetOutputFormat(final OutputFormat<Void, ArrayWritable> mapreduceOutputFormat) {
  realOutputFormat = (ParquetOutputFormat<ArrayWritable>) mapreduceOutputFormat;
}