org.apache.parquet.hadoop.metadata.CompressionCodecName#UNCOMPRESSED

Source File: AsyncPageReader.java From Bats with Apache License 2.0

6 votes

/**
 * Detaches the page buffer from {@code readStatus} and returns it ready for use.
 *
 * <p>When the column chunk's codec is {@code UNCOMPRESSED} the detached buffer is
 * returned as-is and the matching read-bytes counter (dictionary or data page) is
 * bumped by {@code readStatus.bytesRead}. Otherwise the buffer is passed to
 * {@code decompress}, the compressed buffer is released, and the decompressed
 * buffer is returned.
 *
 * @param readStatus holder of the page buffer, page header and byte count
 * @return the page data, decompressed if the column chunk uses a codec
 */
private DrillBuf getDecompressedPageData(ReadStatus readStatus) {
  final DrillBuf rawPage;
  final boolean dictionaryPage;
  // Take the buffer out of readStatus and read its page kind under this object's monitor.
  synchronized (this) {
    rawPage = readStatus.getPageData();
    readStatus.setPageData(null);
    dictionaryPage = readStatus.isDictionaryPage;
  }

  final boolean uncompressed =
      parentColumnReader.columnChunkMetaData.getCodec() == CompressionCodecName.UNCOMPRESSED;
  if (uncompressed) {
    // Only the uncompressed path updates the read-bytes counters in this method.
    if (dictionaryPage) {
      stats.totalDictPageReadBytes.addAndGet(readStatus.bytesRead);
    } else {
      stats.totalDataPageReadBytes.addAndGet(readStatus.bytesRead);
    }
    return rawPage;
  }

  final DrillBuf inflated = decompress(readStatus.getPageHeader(), rawPage);
  synchronized (this) {
    readStatus.setPageData(null);
  }
  // The compressed bytes are no longer needed once decompress has returned.
  rawPage.release();
  return inflated;
}

Source File: ParquetAsTextOutputFormat.java From iow-hadoop-streaming with Apache License 2.0

6 votes

/**
 * Picks the Parquet compression codec for a job.
 *
 * <p>Precedence: an explicit Parquet compression setting wins; otherwise, if output
 * compression is enabled in the job configuration, the configured output compressor
 * class (defaulting to {@code DefaultCodec}) is mapped to a Parquet codec; otherwise
 * the result is {@code UNCOMPRESSED}. The chosen codec is logged before returning.
 *
 * @param conf job configuration to inspect
 * @return the codec selected by the rules above
 */
private static CompressionCodecName getCodec(JobConf conf) {
    final CompressionCodecName selected;

    if (ParquetOutputFormat.isCompressionSet(conf)) {
        // Explicit Parquet setting takes priority over anything else.
        selected = ParquetOutputFormat.getCompression(conf);
    } else if (!getCompressOutput(conf)) {
        // Neither Parquet nor job-level output compression was requested.
        selected = CompressionCodecName.UNCOMPRESSED;
    } else {
        // Translate the job's output compressor class into the Parquet codec enum.
        Class<?> hadoopCodec = getOutputCompressorClass(conf, DefaultCodec.class);
        LOG.info("Compression set through hadoop codec: " + hadoopCodec.getName());
        selected = CompressionCodecName.fromCompressionCodec(hadoopCodec);
    }

    LOG.info("Compression: " + selected.name());
    return selected;
}

Source File: SqlInterpreterTest.java From zeppelin with Apache License 2.0

6 votes

/**
 * Writes {@code values} into a new uncompressed Parquet file with a single required
 * {@code int32_field} column, one row per array element.
 *
 * <p>The writer is opened in a try-with-resources block so it is closed even when a
 * write fails part-way through.
 *
 * @param values  the integers to write, in order
 * @param version the Parquet writer version handed to the writer
 * @return the file that was written
 * @throws IOException if the temp file cannot be created or writing fails
 */
public File createParquetFile(int[] values,
                              ParquetProperties.WriterVersion version) throws IOException {
  File file = File.createTempFile("zeppelin-flink-input", ".par");
  // Only the unique name is wanted; the empty placeholder is removed before the
  // writer creates the real file (presumably the writer will not overwrite — confirm).
  file.delete();
  Path path = new Path(file.getAbsolutePath());
  Configuration conf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType(
          "message test { "
                  + "required int32 int32_field; "
                  + "} ");
  GroupWriteSupport.setSchema(schema, conf);
  SimpleGroupFactory f = new SimpleGroupFactory(schema);

  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(
          path,
          new GroupWriteSupport(),
          CompressionCodecName.UNCOMPRESSED, 1024, 1024, 512, true, false, version, conf)) {
    for (int value : values) {
      writer.write(f.newGroup()
              .append("int32_field", value));
    }
  }
  return file;
}

Source File: TestStatistics.java From parquet-mr with Apache License 2.0

6 votes

/**
 * Writes a Parquet file as described by {@code context}, runs the context's
 * verification, and then deletes {@code context.path}.
 *
 * <p>The writer is managed by try-with-resources so that it is closed even if
 * {@code context.write} throws; on success it is closed before
 * {@code context.test()} runs, as before.
 *
 * @param context supplies the schema, sizes, flags, writer version, target path,
 *                and the write/test callbacks
 * @throws IOException if writing or closing the file fails
 */
public static void writeAndTest(WriteContext context) throws IOException {
  // Create the configuration, and then apply the schema to our configuration.
  Configuration configuration = new Configuration();
  GroupWriteSupport.setSchema(context.schema, configuration);
  GroupWriteSupport groupWriteSupport = new GroupWriteSupport();

  // Create the writer properties
  final int blockSize = context.blockSize;
  final int pageSize = context.pageSize;
  // The dictionary page size deliberately mirrors the data page size.
  final int dictionaryPageSize = pageSize;
  final boolean enableDictionary = context.enableDictionary;
  final boolean enableValidation = context.enableValidation;
  ParquetProperties.WriterVersion writerVersion = context.version;
  CompressionCodecName codec = CompressionCodecName.UNCOMPRESSED;

  try (ParquetWriter<Group> writer = new ParquetWriter<Group>(context.fsPath,
      groupWriteSupport, codec, blockSize, pageSize, dictionaryPageSize,
      enableDictionary, enableValidation, writerVersion, configuration)) {
    context.write(writer);
  }

  context.test();

  context.path.delete();
}

Source File: ParquetAppender.java From kite with Apache License 2.0

5 votes

/**
 * Creates the underlying {@code AvroParquetWriter} for the qualified form of
 * {@code path}.
 *
 * <p>The codec is {@code getCompressionCodecName()} when compression is enabled and
 * {@code UNCOMPRESSED} otherwise.
 *
 * @throws IOException if the writer cannot be created
 */
@Override
public void open() throws IOException {
  final CompressionCodecName codecName = enableCompression
      ? getCompressionCodecName()
      : CompressionCodecName.UNCOMPRESSED;

  avroParquetWriter = new AvroParquetWriter<E>(
      fileSystem.makeQualified(path),
      schema,
      codecName,
      DEFAULT_ROW_GROUP_SIZE,
      ParquetWriter.DEFAULT_PAGE_SIZE,
      ParquetWriter.DEFAULT_IS_DICTIONARY_ENABLED,
      conf);
}

Source File: TestSpecificReadWrite.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes 30000 Volkswagen records followed by a single BMW record, then reads the
 * file back with a {@code make == "BMW"} column filter and expects exactly that one
 * record.
 */
@Test
public void testFilterMatchesFinalBlockOnly() throws IOException {
  // Reserve a unique temp name, then delete the placeholder before the writer opens the path.
  File placeholder = File.createTempFile(getClass().getSimpleName(), ".tmp");
  placeholder.deleteOnExit();
  placeholder.delete();
  Path parquetPath = new Path(placeholder.getPath());

  Car polo = getVwPolo();
  Car passat = getVwPassat();
  Car mini = getBmwMini();

  // Block and page sizes are shrunk by a factor of 128 relative to the defaults.
  try (ParquetWriter<Car> carWriter = new AvroParquetWriter<Car>(
      parquetPath,
      Car.SCHEMA$,
      CompressionCodecName.UNCOMPRESSED,
      DEFAULT_BLOCK_SIZE/128,
      DEFAULT_PAGE_SIZE/128,
      false)) {
    for (int round = 0; round < 10000; round++) {
      carWriter.write(polo);
      carWriter.write(passat);
      carWriter.write(polo);
    }
    // The BMW is written last, after all the Volkswagen rows.
    carWriter.write(mini);
  }

  try (ParquetReader<Car> carReader = new AvroParquetReader<Car>(
      testConf, parquetPath, column("make", equalTo("BMW")))) {
    assertEquals(getBmwMini().toString(), carReader.read().toString());
    assertNull(carReader.read());
  }
}

Source File: ParquetFileTest.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Populates the file returned by {@code parquetFile()} with ten rows built from
 * {@code createSchema()}.
 *
 * <p>Row {@code i} carries {@code 32 + i}, {@code 64L + i}, {@code 1.0f + i},
 * {@code 2.0d + i}, a colour name cycled from {@code COLORS}, and twelve random
 * bytes for the fixed-length field.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  File target = parquetFile();
  Path targetPath = new Path(target.getPath());
  Configuration hadoopConf = new Configuration();

  MessageType rowSchema = createSchema();
  SimpleGroupFactory rows = new SimpleGroupFactory(rowSchema);
  GroupWriteSupport.setSchema(rowSchema, hadoopConf);

  try (ParquetWriter<Group> out = new ParquetWriter<>(
      targetPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      hadoopConf)) {
    for (int rowIndex = 0; rowIndex < 10; rowIndex++) {
      // Fresh random payload for the fixed-length column on every row.
      final byte[] fixedBytes = new byte[12];
      ThreadLocalRandom.current().nextBytes(fixedBytes);

      Group row = rows.newGroup()
          .append(INT32_FIELD, 32 + rowIndex)
          .append(INT64_FIELD, 64L + rowIndex)
          .append(FLOAT_FIELD, 1.0f + rowIndex)
          .append(DOUBLE_FIELD, 2.0d + rowIndex)
          .append(BINARY_FIELD, Binary.fromString(COLORS[rowIndex % COLORS.length]))
          .append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixedBytes));
      out.write(row);
    }
  }
}

Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Builds a minimal {@code ParquetMetadata} containing one block with one
 * {@code INT32} column chunk named {@code col}, whose encoding statistics record
 * the given dictionary and data encodings.
 *
 * @param dicEncoding  dictionary-page encoding to record, or {@code null} to record none
 * @param dataEncoding data-page encoding to record
 * @return metadata wrapping the single-column block
 */
private static ParquetMetadata createParquetMetaData(Encoding dicEncoding,
  Encoding dataEncoding) {
  MessageType schema =
    parseMessageType("message schema { optional int32 col (INT_32); }");
  org.apache.parquet.hadoop.metadata.FileMetaData fileMetaData =
    new org.apache.parquet.hadoop.metadata.FileMetaData(schema,
      new HashMap<String, String>(), null);
  List<BlockMetaData> blockMetaDataList = new ArrayList<BlockMetaData>();
  BlockMetaData blockMetaData = new BlockMetaData();
  EncodingStats.Builder builder = new EncodingStats.Builder();
  if (dicEncoding != null) {
    // Only register the encoding here; the stats object is built once, below,
    // after the data encoding has been added as well.
    builder.addDictEncoding(dicEncoding);
  }
  builder.addDataEncoding(dataEncoding);
  EncodingStats es = builder.build();
  // The plain encodings set is intentionally left empty; only EncodingStats is populated.
  Set<org.apache.parquet.column.Encoding> e =
    new HashSet<org.apache.parquet.column.Encoding>();
  PrimitiveTypeName t = PrimitiveTypeName.INT32;
  ColumnPath p = ColumnPath.get("col");
  CompressionCodecName c = CompressionCodecName.UNCOMPRESSED;
  // NOTE(review): BinaryStatistics is paired with an INT32 column type here — confirm intended.
  BinaryStatistics s = new BinaryStatistics();
  ColumnChunkMetaData md =
    ColumnChunkMetaData.get(p, t, c, es, e, s, 20, 30, 0, 0, 0);
  blockMetaData.addColumn(md);
  blockMetaDataList.add(blockMetaData);
  return new ParquetMetadata(fileMetaData, blockMetaDataList);
}

Source File: TestInputFormat.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a small two-block Parquet file at {@code file} using the low-level
 * {@code ParquetFileWriter} API.
 *
 * <p>The schema has a single binary column {@code a.b}. The first block holds two
 * data pages and the second block holds one; all pages are uncompressed.
 *
 * @param file destination of the Parquet file
 * @throws IOException if writing fails
 */
private void createParquetFile(File file) throws IOException {
  Path target = new Path(file.toURI());
  Configuration hadoopConf = new Configuration();

  MessageType schema = MessageTypeParser.parseMessageType("message m { required group a {required binary b;}}");
  String[] pathToLeaf = {"a", "b"};
  ColumnDescriptor leafColumn = schema.getColumnDescription(pathToLeaf);

  byte[] firstPayload = { 0, 1, 2, 3};
  byte[] secondPayload = { 2, 3, 4, 5};
  CompressionCodecName noCompression = CompressionCodecName.UNCOMPRESSED;

  BinaryStatistics pageStats = new BinaryStatistics();

  ParquetFileWriter fileWriter = new ParquetFileWriter(hadoopConf, schema, target);
  fileWriter.start();

  // First block: two pages, both carrying the first payload.
  fileWriter.startBlock(3);
  fileWriter.startColumn(leafColumn, 5, noCompression);
  fileWriter.writeDataPage(2, 4, BytesInput.from(firstPayload), pageStats, BIT_PACKED, BIT_PACKED, PLAIN);
  fileWriter.writeDataPage(3, 4, BytesInput.from(firstPayload), pageStats, BIT_PACKED, BIT_PACKED, PLAIN);
  fileWriter.endColumn();
  fileWriter.endBlock();

  // Second block: a single page carrying the second payload.
  fileWriter.startBlock(4);
  fileWriter.startColumn(leafColumn, 7, noCompression);
  fileWriter.writeDataPage(7, 4, BytesInput.from(secondPayload), pageStats, BIT_PACKED, BIT_PACKED, PLAIN);
  fileWriter.endColumn();
  fileWriter.endBlock();

  fileWriter.end(new HashMap<String, String>());
}

Source File: CodecConfig.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Determines the compression codec from the current configuration.
 *
 * <p>An explicit Parquet compression setting is preferred; failing that, the Hadoop
 * compression setting is used; failing both, the result is {@code UNCOMPRESSED}.
 * The chosen codec is logged at info level.
 *
 * @return the codec selected by the rules above
 */
public CompressionCodecName getCodec() {
  final Configuration conf = getConfiguration();
  final CompressionCodecName selected;

  if (isParquetCompressionSet(conf)) {
    // Explicit Parquet configuration wins.
    selected = getParquetCompressionCodec(conf);
  } else if (!isHadoopCompressionSet()) {
    LOG.info("Compression set to false");
    selected = CompressionCodecName.UNCOMPRESSED;
  } else {
    // Fall back to what the Hadoop configuration asks for.
    selected = getHadoopCompressionCodec();
  }

  LOG.info("Compression: {}", selected.name());
  return selected;
}

Source File: TestSimpleRecordConverter.java From parquet-mr with Apache License 2.0

5 votes

/**
 * Writes a single-row Parquet file at {@code testFile()} using the schema from
 * {@code createSchema()}.
 *
 * <p>The row holds {@code 32}, {@code 64L}, {@code 1.0f}, {@code 2.0d}, the string
 * {@code "foobar"} and the bytes 1 through 12 for the fixed-length field.
 *
 * @throws IOException if the file cannot be written
 */
private void createTestParquetFile() throws IOException {
  Path targetPath = new Path(testFile().getPath());
  Configuration hadoopConf = new Configuration();

  MessageType rowSchema = createSchema();
  SimpleGroupFactory rows = new SimpleGroupFactory(rowSchema);
  GroupWriteSupport.setSchema(rowSchema, hadoopConf);

  try (ParquetWriter<Group> out = new ParquetWriter<>(
      targetPath,
      new GroupWriteSupport(),
      CompressionCodecName.UNCOMPRESSED,
      1024,
      1024,
      512,
      true,
      false,
      ParquetProperties.WriterVersion.PARQUET_2_0,
      hadoopConf)) {
    final byte[] fixedBytes = new byte[]{1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12 };
    Group row = rows.newGroup()
        .append(INT32_FIELD, 32)
        .append(INT64_FIELD, 64L)
        .append(FLOAT_FIELD, 1.0f)
        .append(DOUBLE_FIELD, 2.0d)
        .append(BINARY_FIELD, Binary.fromString("foobar"))
        .append(FIXED_LEN_BYTE_ARRAY_FIELD, Binary.fromConstantByteArray(fixedBytes));
    out.write(row);
  }
}

Source File: PageReader.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Loads one page into {@code dest}, decompressing it first when the column chunk
 * uses a codec other than {@code UNCOMPRESSED}.
 *
 * <p>Each phase (read, and decompress when applicable) is timed in microseconds and
 * reported through {@code updateStats} together with the stream position captured
 * before that phase.
 *
 * @param pageHeader       header of the page being read, forwarded to {@code updateStats}
 * @param compressedSize   number of bytes to load from the data reader
 * @param uncompressedSize number of bytes of {@code dest} offered to the decompressor
 * @param dest             buffer that receives the page bytes
 * @throws IOException if reading or decompressing fails
 */
public void readPage(PageHeader pageHeader, int compressedSize, int uncompressedSize, ArrowBuf dest) throws IOException {
  final Stopwatch stopwatch = Stopwatch.createUnstarted();
  long phaseStartPos = inputStream.getPos();

  if (parentColumnReader.columnChunkMetaData.getCodec() == CompressionCodecName.UNCOMPRESSED) {
    // No codec: the bytes go straight into the destination buffer.
    stopwatch.start();
    dataReader.loadPage(dest, compressedSize);
    final long readMicros = stopwatch.elapsed(TimeUnit.MICROSECONDS);
    this.updateStats(pageHeader, "Page Read", phaseStartPos, readMicros, compressedSize, uncompressedSize);
    return;
  }

  // Compressed page: stage the raw bytes in a scratch buffer, then decompress into dest.
  final ArrowBuf scratch = allocateTemporaryBuffer(compressedSize);
  try {
    stopwatch.start();
    dataReader.loadPage(scratch, compressedSize);
    final long readMicros = stopwatch.elapsed(TimeUnit.MICROSECONDS);
    stopwatch.reset();
    this.updateStats(pageHeader, "Page Read", phaseStartPos, readMicros, compressedSize, compressedSize);

    phaseStartPos = inputStream.getPos();
    stopwatch.start();
    codecFactory
      .getDecompressor(parentColumnReader.columnChunkMetaData.getCodec())
      .decompress(
        scratch.nioBuffer(0, compressedSize), compressedSize,
        dest.nioBuffer(0, uncompressedSize), uncompressedSize);
    final long decompressMicros = stopwatch.elapsed(TimeUnit.MICROSECONDS);
    this.updateStats(pageHeader, "Decompress", phaseStartPos, decompressMicros, compressedSize, uncompressedSize);
  } finally {
    // The scratch buffer is released whether or not decompression succeeded.
    scratch.release();
  }
}

Source File: DirectCodecFactory.java From parquet-mr with Apache License 2.0

4 votes

/**
 * Reports the codec name for this implementation.
 *
 * @return always {@link CompressionCodecName#UNCOMPRESSED}
 */
@Override
public CompressionCodecName getCodecName() {
  return CompressionCodecName.UNCOMPRESSED;
}

Source File: ParquetFilePOJOReaderTest.java From attic-apex-malhar with Apache License 2.0

4 votes

/**
 * Convenience constructor that delegates to the five-argument constructor, passing
 * {@link CompressionCodecName#UNCOMPRESSED} as the codec and forwarding every other
 * argument unchanged.
 *
 * @param file             forwarded as-is (presumably the output location — confirm)
 * @param schema           forwarded as-is
 * @param klass            forwarded as-is (raw {@code Class}; presumably the POJO type — confirm)
 * @param enableDictionary forwarded as-is
 * @throws IOException propagated from the delegated constructor
 */
public ParquetPOJOWriter(Path file, MessageType schema, Class klass, boolean enableDictionary) throws IOException
{
  this(file, schema, klass, CompressionCodecName.UNCOMPRESSED, enableDictionary);
}

Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0

4 votes

/**
 * Creates a record writer configured from the operator context, the writer
 * definition and the format config.
 *
 * <p>Sets up child allocators and the direct codec factory, records version
 * metadata, captures output location/prefix/extension and partition/Iceberg options,
 * and then reads sizing, compression and dictionary settings from the context
 * options. The compression type option is matched case-insensitively against
 * {@code snappy}, {@code lzo}, {@code gzip}, {@code none} and {@code uncompressed}.
 *
 * @param context operator context supplying the allocator, fragment handle and options
 * @param writer  writer definition supplying location, plugin, user and writer options
 * @param config  format config supplying the output extension
 * @throws OutOfMemoryException declared by this constructor (presumably raised by allocator creation — confirm)
 * @throws UnsupportedOperationException if the compression type option is not one of the known names
 */
public ParquetRecordWriter(OperatorContext context, ParquetWriter writer, ParquetFormatConfig config) throws OutOfMemoryException{
  this.context = context;
  this.codecAllocator = context.getAllocator().newChildAllocator("ParquetCodecFactory", 0, Long.MAX_VALUE);
  this.columnEncoderAllocator = context.getAllocator().newChildAllocator("ParquetColEncoder", 0, Long.MAX_VALUE);
  // NOTE(review): pageSize is read here but only assigned from the options further
  // down in this constructor, so the factory receives whatever the field held before
  // that assignment — confirm this is intended.
  this.codecFactory = CodecFactory.createDirectCodecFactory(new Configuration(),
      new ParquetDirectByteBufferAllocator(codecAllocator), pageSize);
  this.extraMetaData.put(DREMIO_VERSION_PROPERTY, DremioVersionInfo.getVersion());
  this.extraMetaData.put(IS_DATE_CORRECT_PROPERTY, "true");

  this.plugin = writer.getFormatPlugin().getFsPlugin();
  this.queryUser = writer.getProps().getUserName();

  FragmentHandle handle = context.getFragmentHandle();
  String fragmentId = String.format("%d_%d", handle.getMajorFragmentId(), handle.getMinorFragmentId());

  this.location = writer.getLocation();
  this.prefix = fragmentId;
  this.extension = config.outputExtension;
  if (writer.getOptions() != null) {
    this.partitionColumns = writer.getOptions().getPartitionColumns();
    this.isIcebergWriter = (writer.getOptions().getIcebergWriterOperation() != WriterOptions.IcebergWriterOperation.NONE);
  } else {
    this.partitionColumns = null;
    this.isIcebergWriter = false;
  }

  // isIcebergWriter can only be true when writer.getOptions() was non-null above.
  if (this.isIcebergWriter && writer.getOptions().getExtendedProperty() != null) {
    initIcebergColumnIDList(writer.getOptions().getExtendedProperty());
  }

  memoryThreshold = (int) context.getOptions().getOption(ExecConstants.PARQUET_MEMORY_THRESHOLD_VALIDATOR);
  blockSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_BLOCK_SIZE_VALIDATOR);
  pageSize = (int) context.getOptions().getOption(ExecConstants.PARQUET_PAGE_SIZE_VALIDATOR);
  // Locale.ROOT keeps the lowercasing locale-independent; with the default locale a
  // value such as "GZIP" would not match "gzip" under e.g. a Turkish locale.
  final String codecName = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE_VALIDATOR)
      .toLowerCase(java.util.Locale.ROOT);
  switch(codecName) {
  case "snappy":
    codec = CompressionCodecName.SNAPPY;
    break;
  case "lzo":
    codec = CompressionCodecName.LZO;
    break;
  case "gzip":
    codec = CompressionCodecName.GZIP;
    break;
  case "none":
  case "uncompressed":
    codec = CompressionCodecName.UNCOMPRESSED;
    break;
  default:
    throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  enableDictionary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_VALIDATOR);
  enableDictionaryForBinary = context.getOptions().getOption(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING_BINARY_TYPE_VALIDATOR);
  maxPartitions = context.getOptions().getOption(ExecConstants.PARQUET_MAXIMUM_PARTITIONS_VALIDATOR);
  minRecordsForFlush = context.getOptions().getOption(ExecConstants.PARQUET_MIN_RECORDS_FOR_FLUSH_VALIDATOR);
  parquetFileWriteTimeThresholdMilliSecs = (int)context.getOptions().getOption(ExecConstants.PARQUET_WRITE_TIME_THRESHOLD_MILLI_SECS_VALIDATOR);
  parquetFileWriteIoRateThresholdMbps = context.getOptions().getOption(ExecConstants.PARQUET_WRITE_IO_RATE_THRESHOLD_MBPS_VALIDATOR);
}

Source File: ParquetRecordWriter.java From Bats with Apache License 2.0

4 votes

/**
 * Initializes this writer from a string-keyed option map.
 *
 * <p>Reads the output location and prefix, obtains the file system, parses the
 * block/page/dictionary-page sizes, resolves the compression codec and the physical
 * type used for decimals (both matched case-insensitively), and reads the boolean
 * writer flags. When a single file-system block is requested, the block size is
 * rounded up to a multiple of {@code BLOCKSIZE_MULTIPLE}.
 *
 * @param writerOptions option map; the keys read here are expected to be present
 * @throws IOException if the file system cannot be obtained
 * @throws UnsupportedOperationException if the compression type or the logical type
 *         for decimals is not one of the supported names
 */
@Override
public void init(Map<String, String> writerOptions) throws IOException {
  this.location = writerOptions.get("location");
  this.prefix = writerOptions.get("prefix");

  fs = FileSystem.get(conf);
  blockSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_BLOCK_SIZE));
  pageSize = Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_PAGE_SIZE));
  dictionaryPageSize= Integer.parseInt(writerOptions.get(ExecConstants.PARQUET_DICT_PAGE_SIZE));
  // Locale.ROOT keeps the lowercasing locale-independent; with the default locale a
  // value such as "GZIP" would not match "gzip" under e.g. a Turkish locale.
  String codecName = writerOptions.get(ExecConstants.PARQUET_WRITER_COMPRESSION_TYPE)
      .toLowerCase(java.util.Locale.ROOT);
  switch(codecName) {
  case "snappy":
    codec = CompressionCodecName.SNAPPY;
    break;
  case "lzo":
    codec = CompressionCodecName.LZO;
    break;
  case "gzip":
    codec = CompressionCodecName.GZIP;
    break;
  case "none":
  case "uncompressed":
    codec = CompressionCodecName.UNCOMPRESSED;
    break;
  default:
    throw new UnsupportedOperationException(String.format("Unknown compression type: %s", codecName));
  }

  String logicalTypeNameForDecimals = writerOptions.get(ExecConstants.PARQUET_WRITER_LOGICAL_TYPE_FOR_DECIMALS)
      .toLowerCase(java.util.Locale.ROOT);
  switch (logicalTypeNameForDecimals) {
    case "fixed_len_byte_array":
      logicalTypeForDecimals = PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY;
      break;
    case "binary":
      logicalTypeForDecimals = PrimitiveTypeName.BINARY;
      break;
    default:
      // Report the offending decimal type name, not the compression codec name.
      throw new UnsupportedOperationException(
          String.format(
              "Unsupported logical type for decimals: %s\n" +
              "Supported types: ['fixed_len_byte_array', 'binary']", logicalTypeNameForDecimals));
  }

  enableDictionary = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_ENABLE_DICTIONARY_ENCODING));
  useSingleFSBlock = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_SINGLE_FS_BLOCK));
  usePrimitiveTypesForDecimals = Boolean.parseBoolean(writerOptions.get(ExecConstants.PARQUET_WRITER_USE_PRIMITIVE_TYPES_FOR_DECIMALS));

  if (useSingleFSBlock) {
    // Round up blockSize to a multiple of BLOCKSIZE_MULTIPLE.
    blockSize = (int)ceil((double)blockSize/BLOCKSIZE_MULTIPLE) * BLOCKSIZE_MULTIPLE;
  }
}

Source File: PageReader.java From Bats with Apache License 2.0

4 votes

/**
 * Reads the next page from the data reader and returns it as an uncompressed buffer.
 *
 * <p>For an {@code UNCOMPRESSED} column chunk the buffer obtained from the data
 * reader is returned directly. Otherwise a buffer of {@code uncompressedSize} bytes
 * is allocated, the compressed bytes are read and decompressed into it, and the
 * compressed buffer is released. Each phase is timed in nanoseconds and reported via
 * {@code updateStats}.
 *
 * @param pageHeader       header of the page being read
 * @param compressedSize   number of bytes to fetch from the data reader
 * @param uncompressedSize size of the decompressed page
 * @return buffer holding the uncompressed page bytes
 * @throws IOException if reading or decompressing fails
 */
private DrillBuf readPage(PageHeader pageHeader, int compressedSize, int uncompressedSize) throws IOException {
  DrillBuf pageDataBuf = null;
  Stopwatch timer = Stopwatch.createUnstarted();
  long timeToRead;
  long start=dataReader.getPos();
  if (parentColumnReader.columnChunkMetaData.getCodec() == CompressionCodecName.UNCOMPRESSED) {
    timer.start();
    pageDataBuf = dataReader.getNext(compressedSize);
    if (logger.isTraceEnabled()) {
      // Dump the buffer that was just read; the previous code dumped `pageData`,
      // which is not the buffer produced by this call.
      logger.trace("PageReaderTask==> Col: {}  readPos: {}  Uncompressed_size: {}  pageData: {}",
          parentColumnReader.columnChunkMetaData.toString(), dataReader.getPos(),
          pageHeader.getUncompressed_page_size(), ByteBufUtil.hexDump(pageDataBuf));
    }
    timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
    this.updateStats(pageHeader, "Page Read", start, timeToRead, compressedSize, uncompressedSize);
  } else {
    DrillBuf compressedData = null;
    pageDataBuf=allocateTemporaryBuffer(uncompressedSize);

    try {
      timer.start();
      compressedData = dataReader.getNext(compressedSize);
      timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);

      timer.reset();
      this.updateStats(pageHeader, "Page Read", start, timeToRead, compressedSize, compressedSize);
      start = dataReader.getPos();
      timer.start();
      codecFactory.getDecompressor(parentColumnReader.columnChunkMetaData.getCodec())
          .decompress(compressedData.nioBuffer(0, compressedSize), compressedSize,
              pageDataBuf.nioBuffer(0, uncompressedSize), uncompressedSize);
      // The decompressor wrote through an NIO view, so the writer index is set explicitly.
      pageDataBuf.writerIndex(uncompressedSize);
      timeToRead = timer.elapsed(TimeUnit.NANOSECONDS);
      this.updateStats(pageHeader, "Decompress", start, timeToRead, compressedSize, uncompressedSize);
    } finally {
      // compressedData stays null if getNext threw before assigning it.
      if (compressedData != null) {
        compressedData.release();
      }
    }
  }
  return pageDataBuf;
}

Source File: TajoParquetWriter.java From tajo with Apache License 2.0

3 votes

/**
 * Creates a new TajoParquetWriter with default settings by delegating to the
 * five-argument constructor with {@code CompressionCodecName.UNCOMPRESSED},
 * {@code DEFAULT_BLOCK_SIZE} and {@code DEFAULT_PAGE_SIZE}.
 * The default block size is 128 MB. The default page size is 1 MB.
 * Default compression is no compression.
 *
 * @param file The Path of the file to write to.
 * @param schema The Tajo schema of the table.
 * @throws java.io.IOException propagated from the delegated constructor
 */
public TajoParquetWriter(Path file, Schema schema) throws IOException {
  this(file,
       schema,
       CompressionCodecName.UNCOMPRESSED,
       DEFAULT_BLOCK_SIZE,
       DEFAULT_PAGE_SIZE);
}

Source File: ProtoParquetWriter.java From parquet-mr with Apache License 2.0

2 votes

/**
 * Create a new {@link ProtoParquetWriter} with default settings by delegating to
 * the five-argument constructor with {@code CompressionCodecName.UNCOMPRESSED},
 * {@code DEFAULT_BLOCK_SIZE} and {@code DEFAULT_PAGE_SIZE}.
 * The default block size is 50 MB. The default page size is 1 MB.
 * Default compression is no compression. (Inherited from {@link ParquetWriter})
 *
 * @param file The file name to write to.
 * @param protoMessage Protobuf message class
 * @throws IOException if there is an error while writing
 */
public ProtoParquetWriter(Path file, Class<? extends Message> protoMessage) throws IOException {
  this(file, protoMessage, CompressionCodecName.UNCOMPRESSED,
          DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
}

Source File: AvroParquetWriter.java From parquet-mr with Apache License 2.0

2 votes

/**
 * Create a new {@link AvroParquetWriter} with default settings by delegating to
 * the five-argument constructor with {@code CompressionCodecName.UNCOMPRESSED},
 * {@code DEFAULT_BLOCK_SIZE} and {@code DEFAULT_PAGE_SIZE}.
 * The default block size is 50 MB. The default page size is 1 MB.
 * Default compression is no compression. (Inherited from {@link ParquetWriter})
 *
 * @param file The file name to write to.
 * @param avroSchema The schema to write with.
 * @throws IOException if there is an error while writing
 * @deprecated this constructor is annotated {@code @Deprecated}; the replacement is
 *     not visible from here (NOTE(review): presumably a builder-style API — confirm)
 */
@Deprecated
public AvroParquetWriter(Path file, Schema avroSchema) throws IOException {
  this(file, avroSchema, CompressionCodecName.UNCOMPRESSED,
      DEFAULT_BLOCK_SIZE, DEFAULT_PAGE_SIZE);
}

Java Code Examples for org.apache.parquet.hadoop.metadata.CompressionCodecName#UNCOMPRESSED