org.apache.parquet.hadoop.ParquetOutputFormat Java Examples
The following examples show how to use
org.apache.parquet.hadoop.ParquetOutputFormat.
Each example is taken from an open-source project; the originating project, source file, and license are noted above each snippet.
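Before the project examples, here is a minimal sketch (not taken from any of the projects below) of how ParquetOutputFormat is typically wired into a MapReduce job. The schema string, output path, and the class name ParquetJobSetup are placeholder assumptions; GroupWriteSupport is the example write support bundled with parquet-mr, and the tuning calls mirror the setters used throughout the examples.

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.hadoop.metadata.CompressionCodecName;
import org.apache.parquet.schema.MessageTypeParser;

public class ParquetJobSetup {
  public static Job configureJob(Configuration conf) throws Exception {
    // Register the schema used by GroupWriteSupport (placeholder schema).
    GroupWriteSupport.setSchema(
        MessageTypeParser.parseMessageType("message example { required binary name; }"), conf);

    Job job = Job.getInstance(conf, "parquet-output-example");
    job.setOutputFormatClass(ParquetOutputFormat.class);

    // Tell ParquetOutputFormat how to turn records into Parquet pages.
    ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);

    // Common tuning knobs, all optional; defaults come from ParquetOutputFormat.
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
    ParquetOutputFormat.setBlockSize(job, 128 * 1024 * 1024); // row group size in bytes
    ParquetOutputFormat.setPageSize(job, 1024 * 1024);        // page size in bytes
    ParquetOutputFormat.setEnableDictionary(job, true);

    FileOutputFormat.setOutputPath(job, new Path("/tmp/parquet-out")); // placeholder path
    return job;
  }
}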
Example #1
Source File: CompressionConfigUtil.java From presto with Apache License 2.0 | 6 votes |
public static void configureCompression(Configuration config, HiveCompressionCodec compressionCodec)
{
    boolean compression = compressionCodec != HiveCompressionCodec.NONE;
    config.setBoolean(COMPRESSRESULT.varname, compression);
    config.setBoolean("mapred.output.compress", compression);
    config.setBoolean(FileOutputFormat.COMPRESS, compression);

    // For ORC
    OrcConf.COMPRESS.setString(config, compressionCodec.getOrcCompressionKind().name());

    // For RCFile and Text
    if (compressionCodec.getCodec().isPresent()) {
        config.set("mapred.output.compression.codec", compressionCodec.getCodec().get().getName());
        config.set(FileOutputFormat.COMPRESS_CODEC, compressionCodec.getCodec().get().getName());
    }
    else {
        config.unset("mapred.output.compression.codec");
        config.unset(FileOutputFormat.COMPRESS_CODEC);
    }

    // For Parquet
    config.set(ParquetOutputFormat.COMPRESSION, compressionCodec.getParquetCompressionCodec().name());

    // For SequenceFile
    config.set(FileOutputFormat.COMPRESS_TYPE, BLOCK.toString());
}
Example #2
Source File: TestColumnIndexes.java From parquet-mr with Apache License 2.0 | 6 votes |
public Path write(Path directory) throws IOException {
  Path file = new Path(directory, "testColumnIndexes_" + this + ".parquet");
  Random random = new Random(seed);
  int recordCount = random.nextInt(MAX_TOTAL_ROWS) + 1;
  List<Supplier<?>> generators = buildGenerators(recordCount, random);
  Configuration conf = new Configuration();
  ParquetOutputFormat.setColumnIndexTruncateLength(conf, columnIndexTruncateLength);
  try (ParquetWriter<Group> writer = ExampleParquetWriter.builder(file)
      .withType(SCHEMA)
      .withPageRowCountLimit(pageRowCountLimit)
      .withConf(conf)
      .build()) {
    for (int i = 0; i < recordCount; i++) {
      writer.write(createGroup(generators, random));
    }
  }
  return file;
}
Example #3
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 6 votes |
private static CompressionCodecName getCodec(JobConf conf) {
    CompressionCodecName codec;

    if (ParquetOutputFormat.isCompressionSet(conf)) { // explicit parquet config
        codec = ParquetOutputFormat.getCompression(conf);
    } else if (getCompressOutput(conf)) { // from hadoop config
        // find the right codec
        Class<?> codecClass = getOutputCompressorClass(conf, DefaultCodec.class);
        LOG.info("Compression set through hadoop codec: " + codecClass.getName());
        codec = CompressionCodecName.fromCompressionCodec(codecClass);
    } else {
        codec = CompressionCodecName.UNCOMPRESSED;
    }

    LOG.info("Compression: " + codec.name());
    return codec;
}
Example #4
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
public PentahoApacheOutputFormat( NamedCluster namedCluster ) {
  logger.info( "We are initializing parquet output format" );

  inClassloader( () -> {
    ConfigurationProxy conf = new ConfigurationProxy();

    if ( namedCluster != null ) {
      // if named cluster is not defined, no need to add cluster resource configs
      BiConsumer<InputStream, String> consumer = ( is, filename ) -> conf.addResource( is, filename );
      ShimConfigsLoader.addConfigsAsResources( namedCluster, consumer );
    }

    job = Job.getInstance( conf );

    job.getConfiguration().set( ParquetOutputFormat.ENABLE_JOB_SUMMARY, "false" );
    ParquetOutputFormat.setEnableDictionary( job, false );
  } );
}
Example #5
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
@Override
public void setVersion( VERSION version ) throws Exception {
  inClassloader( () -> {
    ParquetProperties.WriterVersion writerVersion;
    switch ( version ) {
      case VERSION_1_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_1_0;
        break;
      case VERSION_2_0:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
      default:
        writerVersion = ParquetProperties.WriterVersion.PARQUET_2_0;
        break;
    }
    job.getConfiguration().set( ParquetOutputFormat.WRITER_VERSION, writerVersion.toString() );
  } );
}
Example #6
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
@Override
public void setCompression( COMPRESSION comp ) throws Exception {
  inClassloader( () -> {
    CompressionCodecName codec;
    switch ( comp ) {
      case SNAPPY:
        codec = CompressionCodecName.SNAPPY;
        break;
      case GZIP:
        codec = CompressionCodecName.GZIP;
        break;
      case LZO:
        codec = CompressionCodecName.LZO;
        break;
      default:
        codec = CompressionCodecName.UNCOMPRESSED;
        break;
    }
    ParquetOutputFormat.setCompression( job, codec );
  } );
}
Example #7
Source File: IcebergFileWriterFactory.java From presto with Apache License 2.0 | 5 votes |
private IcebergFileWriter createParquetWriter(
        Path outputPath,
        Schema icebergSchema,
        List<IcebergColumnHandle> columns,
        JobConf jobConf,
        ConnectorSession session)
{
    Properties properties = new Properties();
    properties.setProperty(IOConstants.COLUMNS, columns.stream()
            .map(IcebergColumnHandle::getName)
            .collect(joining(",")));
    properties.setProperty(IOConstants.COLUMNS_TYPES, columns.stream()
            .map(column -> toHiveType(column.getType()).getHiveTypeName().toString())
            .collect(joining(":")));

    setParquetSchema(jobConf, convert(icebergSchema, "table"));
    jobConf.set(ParquetOutputFormat.COMPRESSION, getCompressionCodec(session).getParquetCompressionCodec().name());

    return new IcebergRecordFileWriter(
            outputPath,
            columns.stream()
                    .map(IcebergColumnHandle::getName)
                    .collect(toImmutableList()),
            fromHiveStorageFormat(HiveStorageFormat.PARQUET),
            properties,
            HiveStorageFormat.PARQUET.getEstimatedWriterSystemMemoryUsage(),
            jobConf,
            typeManager,
            session);
}
Example #8
Source File: MapredParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
protected ParquetRecordWriterWrapper getParquerRecordWriterWrapper(
    ParquetOutputFormat<ArrayWritable> realOutputFormat,
    JobConf jobConf,
    String finalOutPath,
    Progressable progress
    ) throws IOException {
  return new ParquetRecordWriterWrapper(realOutputFormat, jobConf, finalOutPath.toString(), progress);
}
Example #9
Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0 | 5 votes |
@SuppressWarnings("rawtypes") @Override public void sinkConfInit(FlowProcess<? extends JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf); jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema); ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class); }
Example #10
Source File: ParquetScroogeScheme.java From parquet-mr with Apache License 2.0 | 5 votes |
@Override
public void sinkConfInit(FlowProcess<JobConf> fp,
    Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) {
  DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf);
  ParquetOutputFormat.setWriteSupportClass(jobConf, ScroogeWriteSupport.class);
  ScroogeWriteSupport.setScroogeClass(jobConf, this.config.getKlass());
}
Example #11
Source File: CodecConfigTest.java From parquet-mr with Apache License 2.0 | 5 votes |
public void shouldUseParquetFlagToSetCodec(String codecNameStr, CompressionCodecName expectedCodec) throws IOException {
  //Test mapreduce API
  Job job = new Job();
  Configuration conf = job.getConfiguration();
  conf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
  TaskAttemptContext task = ContextUtil.newTaskAttemptContext(conf,
      new TaskAttemptID(new TaskID(new JobID("test", 1), false, 1), 1));
  Assert.assertEquals(CodecConfig.from(task).getCodec(), expectedCodec);

  //Test mapred API
  JobConf jobConf = new JobConf();
  jobConf.set(ParquetOutputFormat.COMPRESSION, codecNameStr);
  Assert.assertEquals(CodecConfig.from(jobConf).getCodec(), expectedCodec);
}
Example #12
Source File: DeprecatedParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 5 votes |
public RecordWriterWrapper(ParquetOutputFormat<V> realOutputFormat,
    FileSystem fs, JobConf conf, String name, Progressable progress) throws IOException {
  CompressionCodecName codec = getCodec(conf);
  String extension = codec.getExtension() + ".parquet";
  Path file = getDefaultWorkFile(conf, name, extension);

  try {
    realWriter = (ParquetRecordWriter<V>) realOutputFormat.getRecordWriter(conf, file, codec);
  } catch (InterruptedException e) {
    Thread.interrupted();
    throw new IOException(e);
  }
}
Example #13
Source File: ParquetTupleScheme.java From parquet-mr with Apache License 2.0 | 5 votes |
@SuppressWarnings("rawtypes") @Override public void sinkConfInit(FlowProcess<JobConf> fp, Tap<JobConf, RecordReader, OutputCollector> tap, JobConf jobConf) { DeprecatedParquetOutputFormat.setAsOutputFormat(jobConf); jobConf.set(TupleWriteSupport.PARQUET_CASCADING_SCHEMA, parquetSchema); ParquetOutputFormat.setWriteSupportClass(jobConf, TupleWriteSupport.class); }
Example #14
Source File: ParquetRowDataWriterTest.java From flink with Apache License 2.0 | 5 votes |
@Test
public void testCompression() throws IOException {
  Configuration conf = new Configuration();
  conf.set(ParquetOutputFormat.COMPRESSION, "GZIP");
  innerTest(conf, true);
  innerTest(conf, false);
}
Example #15
Source File: ParquetAppender.java From tajo with Apache License 2.0 | 5 votes |
/**
 * Creates a new ParquetAppender.
 *
 * @param conf Configuration properties.
 * @param schema The table schema.
 * @param meta The table metadata.
 * @param workDir The path of the Parquet file to write to.
 */
public ParquetAppender(Configuration conf, TaskAttemptId taskAttemptId, Schema schema, TableMeta meta,
                       Path workDir) throws IOException {
  super(conf, taskAttemptId, schema, meta, workDir);
  this.blockSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.BLOCK_SIZE, StorageConstants.PARQUET_DEFAULT_BLOCK_SIZE));
  this.pageSize = Integer.parseInt(
      meta.getProperty(ParquetOutputFormat.PAGE_SIZE, StorageConstants.PARQUET_DEFAULT_PAGE_SIZE));
  this.compressionCodecName = CompressionCodecName.fromConf(
      meta.getProperty(ParquetOutputFormat.COMPRESSION, StorageConstants.PARQUET_DEFAULT_COMPRESSION_CODEC_NAME));
  this.enableDictionary = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.ENABLE_DICTIONARY, StorageConstants.PARQUET_DEFAULT_IS_DICTIONARY_ENABLED));
  this.validating = Boolean.parseBoolean(
      meta.getProperty(ParquetOutputFormat.VALIDATION, StorageConstants.PARQUET_DEFAULT_IS_VALIDATION_ENABLED));
}
Example #16
Source File: ParquetHdfsFileSink.java From components with Apache License 2.0 | 5 votes |
@Override
protected void configure(Job job, KV<Void, IndexedRecord> sample) {
    super.configure(job, sample);
    IndexedRecord record = (IndexedRecord) sample.getValue();
    AvroWriteSupport.setSchema(job.getConfiguration(), record.getSchema());
    ParquetOutputFormat.setCompression(job, CompressionCodecName.SNAPPY);
}
Example #17
Source File: ParquetFileWriterFactory.java From presto with Apache License 2.0 | 5 votes |
private static CompressionCodecName getCompression(JobConf configuration)
{
    String compressionName = configuration.get(ParquetOutputFormat.COMPRESSION);
    if (compressionName == null) {
        return CompressionCodecName.GZIP;
    }
    return CompressionCodecName.valueOf(compressionName);
}
Example #18
Source File: ParquetRecordWriterUtil.java From presto with Apache License 2.0 | 5 votes |
private static RecordWriter createParquetWriter(Path target, JobConf conf, Properties properties)
        throws IOException
{
    if (conf.get(DataWritableWriteSupport.PARQUET_HIVE_SCHEMA) == null) {
        List<String> columnNames = Splitter.on(',').splitToList(properties.getProperty(IOConstants.COLUMNS));
        List<TypeInfo> columnTypes = getTypeInfosFromTypeString(properties.getProperty(IOConstants.COLUMNS_TYPES));

        MessageType schema = HiveSchemaConverter.convert(columnNames, columnTypes);
        setParquetSchema(conf, schema);
    }

    ParquetOutputFormat<ParquetHiveRecord> outputFormat = new ParquetOutputFormat<>(new DataWritableWriteSupport());

    return new ParquetRecordWriterWrapper(outputFormat, conf, target.toString(), Reporter.NULL, properties);
}
Example #19
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void enableDictionary( boolean useDictionary ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setEnableDictionary( job, useDictionary ) );
}
Example #20
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void setRowGroupSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setBlockSize( job, size ) );
}
Example #21
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setEnableDictionary(Configuration configuration, boolean enableDictionary) {
    configuration.setBoolean(ParquetOutputFormat.ENABLE_DICTIONARY, enableDictionary);
}
Example #22
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setCompression(Configuration configuration, CompressionCodecName compression) {
    configuration.set(ParquetOutputFormat.COMPRESSION, compression.name());
}
Example #23
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void setDataPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setPageSize( job, size ) );
}
Example #24
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setPageSize(Configuration configuration, int pageSize) {
    configuration.setInt(ParquetOutputFormat.PAGE_SIZE, pageSize);
}
Example #25
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setBlockSize(Configuration configuration, int blockSize) {
    configuration.setInt(ParquetOutputFormat.BLOCK_SIZE, blockSize);
}
Example #26
Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
public static void setWriteSupportClass(Configuration configuration, Class<?> writeSupportClass) {
    configuration.set(ParquetOutputFormat.WRITE_SUPPORT_CLASS, writeSupportClass.getName());
}
Example #27
Source File: PentahoApacheOutputFormat.java From pentaho-hadoop-shims with Apache License 2.0 | 4 votes |
@Override
public void setDictionaryPageSize( int size ) throws Exception {
  inClassloader( () -> ParquetOutputFormat.setDictionaryPageSize( job, size ) );
}
Example #28
Source File: ParquetStorer.java From parquet-mr with Apache License 2.0 | 4 votes |
/**
 * {@inheritDoc}
 */
@Override
public OutputFormat<Void, Tuple> getOutputFormat() throws IOException {
  Schema pigSchema = getSchema();
  return new ParquetOutputFormat<Tuple>(new TupleWriteSupport(pigSchema));
}
Example #29
Source File: TestMapredParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked") @Test public void testConstructorWithFormat() { new MapredParquetOutputFormat((ParquetOutputFormat<ArrayWritable>) mock(ParquetOutputFormat.class)); }
Example #30
Source File: MapredParquetOutputFormat.java From parquet-mr with Apache License 2.0 | 4 votes |
public MapredParquetOutputFormat(final OutputFormat<Void, ArrayWritable> mapreduceOutputFormat) {
  realOutputFormat = (ParquetOutputFormat<ArrayWritable>) mapreduceOutputFormat;
}