Java Code Examples for org.apache.parquet.bytes.BytesInput#writeAllTo()

The following examples show how to use org.apache.parquet.bytes.BytesInput#writeAllTo(). Each example is taken from an open-source project; the source file and license are noted above the code.
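Before the project snippets, here is a minimal, self-contained sketch of what writeAllTo() does: it streams the entire contents of a BytesInput into an OutputStream. BytesInput.from(byte[]), size(), and writeAllTo(OutputStream) are part of the parquet-common API; the payload, class name, and output stream below are only illustrative.

import java.io.ByteArrayOutputStream;
import java.io.IOException;
import java.nio.charset.StandardCharsets;

import org.apache.parquet.bytes.BytesInput;

public class WriteAllToSketch {
  public static void main(String[] args) throws IOException {
    // Wrap an existing byte array in a BytesInput.
    byte[] payload = "hello parquet".getBytes(StandardCharsets.UTF_8);
    BytesInput input = BytesInput.from(payload);

    // writeAllTo() streams the whole content into the given OutputStream.
    // A BytesInput is meant to be consumed once, shortly after it is created.
    ByteArrayOutputStream out = new ByteArrayOutputStream((int) input.size());
    input.writeAllTo(out);

    System.out.println("wrote " + out.size() + " bytes");
  }
}

The examples below apply the same call to real sinks: Presto slice outputs, Hadoop compression streams, and Parquet file output streams.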
Example 1
Source File: ParquetDataOutput.java    From presto with Apache License 2.0
static ParquetDataOutput createDataOutput(BytesInput bytesInput)
{
    requireNonNull(bytesInput, "bytesInput is null");
    return new ParquetDataOutput()
    {
        @Override
        public long size()
        {
            return bytesInput.size();
        }

        @Override
        public void writeData(SliceOutput sliceOutput)
        {
            try {
                bytesInput.writeAllTo(sliceOutput);
            }
            catch (IOException e) {
                throw new RuntimeException(e);
            }
        }
    };
}
 
Example 2
Source File: CodecFactory.java    From parquet-mr with Apache License 2.0
@Override
public BytesInput compress(BytesInput bytes) throws IOException {
  final BytesInput compressedBytes;
  if (codec == null) {
    compressedBytes = bytes;
  } else {
    compressedOutBuffer.reset();
    // the compressor may be null (e.g. for non-native gzip); only reset it when present
    if (compressor != null) {
      compressor.reset();
    }
    CompressionOutputStream cos = codec.createOutputStream(compressedOutBuffer, compressor);
    bytes.writeAllTo(cos);
    cos.finish();
    cos.close();
    compressedBytes = BytesInput.from(compressedOutBuffer);
  }
  return compressedBytes;
}
 
Example 3
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
@Override
public void writePage(BytesInput bytes,
                      int valueCount,
                      Statistics statistics,
                      Encoding rlEncoding,
                      Encoding dlEncoding,
                      Encoding valuesEncoding) throws IOException {
  long uncompressedSize = bytes.size();
  // Parquet library creates bad metadata if the uncompressed or compressed size of a page exceeds Integer.MAX_VALUE
  if (uncompressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write page larger than Integer.MAX_VALUE bytes: " +
            uncompressedSize);
  }
  BytesInput compressedBytes = compressor.compress(bytes);
  long compressedSize = compressedBytes.size();
  if (compressedSize > Integer.MAX_VALUE) {
    throw new ParquetEncodingException(
        "Cannot write compressed page larger than Integer.MAX_VALUE bytes: "
            + compressedSize);
  }
  parquetMetadataConverter.writeDataPageHeader(
      (int)uncompressedSize,
      (int)compressedSize,
      valueCount,
      statistics,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      buf);
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;
  this.totalStatistics.mergeStatistics(statistics);
  compressedBytes.writeAllTo(buf);
  rlEncodings.add(rlEncoding);
  dlEncodings.add(dlEncoding);
  dataEncodings.add(valuesEncoding);
}
 
Example 4
Source File: ParquetColumnChunkPageWriteStore.java    From Bats with Apache License 2.0
@Override
public void writePageV2(int rowCount,
                        int nullCount,
                        int valueCount,
                        BytesInput repetitionLevels,
                        BytesInput definitionLevels,
                        Encoding dataEncoding,
                        BytesInput data,
                        Statistics<?> statistics) throws IOException {
  int rlByteLength = toIntWithCheck(repetitionLevels.size());
  int dlByteLength = toIntWithCheck(definitionLevels.size());
  int uncompressedSize = toIntWithCheck(
      data.size() + repetitionLevels.size() + definitionLevels.size()
  );
  BytesInput compressedData = compressor.compress(data);
  int compressedSize = toIntWithCheck(
      compressedData.size() + repetitionLevels.size() + definitionLevels.size()
  );
  parquetMetadataConverter.writeDataPageV2Header(
      uncompressedSize, compressedSize,
      valueCount, nullCount, rowCount,
      statistics,
      dataEncoding,
      rlByteLength,
      dlByteLength,
      buf);
  this.uncompressedLength += uncompressedSize;
  this.compressedLength += compressedSize;
  this.totalValueCount += valueCount;
  this.pageCount += 1;
  this.totalStatistics.mergeStatistics(statistics);

  definitionLevels.writeAllTo(buf);
  compressedData.writeAllTo(buf);

  dataEncodings.add(dataEncoding);
}
 
Example 5
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Writes a single page
 * @param valueCount count of values
 * @param uncompressedPageSize the size of the data once uncompressed
 * @param bytes the compressed data for the page without header
 * @param rlEncoding encoding of the repetition level
 * @param dlEncoding encoding of the definition level
 * @param valuesEncoding encoding of values
 * @throws IOException if there is an error while writing
 */
@Deprecated
public void writeDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  state = state.write();
  // We are unable to build indexes without rowCount so skip them for this column
  offsetIndexBuilder = OffsetIndexBuilder.getNoOpBuilder();
  columnIndexBuilder = ColumnIndexBuilder.getNoOpBuilder();
  long beforeHeader = out.getPos();
  LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
  int compressedPageSize = (int)bytes.size();
  metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      out);
  long headerSize = out.getPos() - beforeHeader;
  this.uncompressedLength += uncompressedPageSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
  bytes.writeAllTo(out);
  encodingStatsBuilder.addDataEncoding(valuesEncoding);
  currentEncodings.add(rlEncoding);
  currentEncodings.add(dlEncoding);
  currentEncodings.add(valuesEncoding);
}
 
Example 6
Source File: TestColumnChunkPageWriteStore.java    From parquet-mr with Apache License 2.0
private int intValue(BytesInput in) throws IOException {
  ByteArrayOutputStream baos = new ByteArrayOutputStream();
  in.writeAllTo(baos);
  LittleEndianDataInputStream os = new LittleEndianDataInputStream(new ByteArrayInputStream(baos.toByteArray()));
  int i = os.readInt();
  os.close();
  return i;
}
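
Note that the ByteArrayOutputStream round trip above could also be expressed with BytesInput#toByteArray(), which collects the same bytes directly; the writeAllTo() form is what this page is about. A sketch of that variant, keeping the same helper signature and imports as the example above:

private int intValue(BytesInput in) throws IOException {
  // toByteArray() gathers the same bytes that writeAllTo() would stream out.
  LittleEndianDataInputStream is =
      new LittleEndianDataInputStream(new ByteArrayInputStream(in.toByteArray()));
  int i = is.readInt();
  is.close();
  return i;
}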
 
Example 7
Source File: TestZstandardCodec.java    From parquet-mr with Apache License 2.0
private BytesInput compress(ZstandardCodec codec, BytesInput bytes) throws IOException {
  ByteArrayOutputStream compressedOutBuffer = new ByteArrayOutputStream((int)bytes.size());
  CompressionOutputStream cos = codec.createOutputStream(compressedOutBuffer, null);
  bytes.writeAllTo(cos);
  cos.close();
  return BytesInput.from(compressedOutBuffer);
}
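
For completeness, the reverse direction looks similar: wrap the compressed bytes in a decompressing stream obtained from the codec and read them back into a BytesInput. This is a hedged sketch rather than code from the project above; it assumes the caller tracks the uncompressed size and that CompressionCodec#createInputStream and BytesInput.from(InputStream, int) behave as in current Hadoop/parquet-mr.

private BytesInput decompress(ZstandardCodec codec, BytesInput compressed, int uncompressedSize) throws IOException {
  // createInputStream() wraps the compressed bytes in a decompressing stream;
  // BytesInput.from(stream, size) then reads uncompressedSize bytes back out,
  // and BytesInput.copy() materializes the result before the stream is closed.
  InputStream is = codec.createInputStream(new ByteArrayInputStream(compressed.toByteArray()));
  BytesInput decompressed = BytesInput.copy(BytesInput.from(is, uncompressedSize));
  is.close();
  return decompressed;
}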
 
Example 8
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
private void innerWriteDataPage(
    int valueCount, int uncompressedPageSize,
    BytesInput bytes,
    Statistics statistics,
    Encoding rlEncoding,
    Encoding dlEncoding,
    Encoding valuesEncoding) throws IOException {
  state = state.write();
  long beforeHeader = out.getPos();
  if (firstPageOffset == -1) {
    firstPageOffset = beforeHeader;
  }
  LOG.debug("{}: write data page: {} values", beforeHeader, valueCount);
  int compressedPageSize = (int) bytes.size();
  if (pageWriteChecksumEnabled) {
    crc.reset();
    crc.update(bytes.toByteArray());
    metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      (int) crc.getValue(),
      out);
  } else {
    metadataConverter.writeDataPageV1Header(
      uncompressedPageSize, compressedPageSize,
      valueCount,
      rlEncoding,
      dlEncoding,
      valuesEncoding,
      out);
  }
  long headerSize = out.getPos() - beforeHeader;
  this.uncompressedLength += uncompressedPageSize + headerSize;
  this.compressedLength += compressedPageSize + headerSize;
  LOG.debug("{}: write data page content {}", out.getPos(), compressedPageSize);
  bytes.writeAllTo(out);

  // Copy the statistics if currentStatistics is not initialized yet, so we keep the correctly typed instance
  if (currentStatistics == null) {
    currentStatistics = statistics.copy();
  } else {
    currentStatistics.mergeStatistics(statistics);
  }

  columnIndexBuilder.add(statistics);

  encodingStatsBuilder.addDataEncoding(valuesEncoding);
  currentEncodings.add(rlEncoding);
  currentEncodings.add(dlEncoding);
  currentEncodings.add(valuesEncoding);
}
 
Example 9
Source File: ParquetFileWriter.java    From parquet-mr with Apache License 2.0
/**
 * Writes a column chunk at once
 * @param descriptor the descriptor of the column
 * @param valueCount the value count in this column
 * @param compressionCodecName the name of the compression codec used for compressing the pages
 * @param dictionaryPage the dictionary page for this column chunk (might be null)
 * @param bytes the encoded pages including page headers to be written as is
 * @param uncompressedTotalPageSize total uncompressed size (without page headers)
 * @param compressedTotalPageSize total compressed size (without page headers)
 * @param totalStats accumulated statistics for the column chunk
 * @param columnIndexBuilder the builder object for the column index
 * @param offsetIndexBuilder the builder object for the offset index
 * @param bloomFilter the bloom filter for this column
 * @param rlEncodings the RL encodings used in this column chunk
 * @param dlEncodings the DL encodings used in this column chunk
 * @param dataEncodings the data encodings used in this column chunk
 * @throws IOException if there is an error while writing
 */
void writeColumnChunk(ColumnDescriptor descriptor,
    long valueCount,
    CompressionCodecName compressionCodecName,
    DictionaryPage dictionaryPage,
    BytesInput bytes,
    long uncompressedTotalPageSize,
    long compressedTotalPageSize,
    Statistics<?> totalStats,
    ColumnIndexBuilder columnIndexBuilder,
    OffsetIndexBuilder offsetIndexBuilder,
    BloomFilter bloomFilter,
    Set<Encoding> rlEncodings,
    Set<Encoding> dlEncodings,
    List<Encoding> dataEncodings) throws IOException {
  startColumn(descriptor, valueCount, compressionCodecName);

  state = state.write();
  if (dictionaryPage != null) {
    writeDictionaryPage(dictionaryPage);
  }

  if (bloomFilter != null) {
    // write the bloom filter only if at least one data page is not dictionary encoded
    boolean isWriteBloomFilter = false;
    for (Encoding encoding : dataEncodings) {
      if (encoding != Encoding.RLE_DICTIONARY) {
        isWriteBloomFilter = true;
        break;
      }
    }
    if (isWriteBloomFilter) {
      currentBloomFilters.put(String.join(".", descriptor.getPath()), bloomFilter);
    }
  }
  LOG.debug("{}: write data pages", out.getPos());
  long headersSize = bytes.size() - compressedTotalPageSize;
  this.uncompressedLength += uncompressedTotalPageSize + headersSize;
  this.compressedLength += compressedTotalPageSize + headersSize;
  LOG.debug("{}: write data pages content", out.getPos());
  firstPageOffset = out.getPos();
  bytes.writeAllTo(out);
  encodingStatsBuilder.addDataEncodings(dataEncodings);
  if (rlEncodings.isEmpty()) {
    encodingStatsBuilder.withV2Pages();
  }
  currentEncodings.addAll(rlEncodings);
  currentEncodings.addAll(dlEncodings);
  currentEncodings.addAll(dataEncodings);
  currentStatistics = totalStats;

  this.columnIndexBuilder = columnIndexBuilder;
  this.offsetIndexBuilder = offsetIndexBuilder;

  endColumn();
}