org.apache.parquet.io.ParquetDecodingException Java Examples
The following examples show how to use
org.apache.parquet.io.ParquetDecodingException.
Each example below notes its source file, originating project, and license.
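ParquetDecodingException is an unchecked exception (a RuntimeException) that Parquet readers throw when a file cannot be decoded, typically wrapping a lower-level IOException together with context such as the column or page being read. As a quick orientation before the examples, here is a minimal sketch of that wrap-and-rethrow pattern; the PageDecoder class, readPage method, and columnPath parameter are hypothetical names for illustration, not part of any of the projects below:

import java.io.DataInputStream;
import java.io.IOException;
import java.io.InputStream;

import org.apache.parquet.io.ParquetDecodingException;

class PageDecoder {
  // Hypothetical helper showing the common pattern from the examples below:
  // decoding problems surface as ParquetDecodingException with column context,
  // and low-level IOExceptions are attached as the cause.
  static byte[] readPage(InputStream in, int size, String columnPath) {
    if (size < 0) {
      // message-only constructor for logical decoding errors
      throw new ParquetDecodingException("invalid page size " + size + " in column " + columnPath);
    }
    byte[] buffer = new byte[size];
    try {
      // readFully raises EOFException (an IOException) if the stream is short
      new DataInputStream(in).readFully(buffer);
      return buffer;
    } catch (IOException e) {
      // message + cause constructor preserves the underlying failure
      throw new ParquetDecodingException("could not read page in column " + columnPath, e);
    }
  }
}

Because the exception is unchecked, callers need not declare it, which is why several readers below throw it from @Override methods with exception-free signatures such as readInteger().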
Example #1
Source File: ParquetInputFormat.java from parquet-mr (Apache License 2.0)
/**
 * @param configuration the configuration to connect to the file system
 * @param footers the footers of the files to read
 * @return the splits for the footers
 * @throws IOException if there is an error while reading
 * @deprecated split planning using file footers will be removed
 */
@Deprecated
public List<ParquetInputSplit> getSplits(Configuration configuration, List<Footer> footers) throws IOException {
  boolean strictTypeChecking = configuration.getBoolean(STRICT_TYPE_CHECKING, true);
  final long maxSplitSize = configuration.getLong("mapred.max.split.size", Long.MAX_VALUE);
  final long minSplitSize = Math.max(getFormatMinSplitSize(), configuration.getLong("mapred.min.split.size", 0L));
  if (maxSplitSize < 0 || minSplitSize < 0) {
    throw new ParquetDecodingException("maxSplitSize or minSplitSize should not be negative: maxSplitSize = "
        + maxSplitSize + "; minSplitSize = " + minSplitSize);
  }
  GlobalMetaData globalMetaData = ParquetFileWriter.getGlobalMetaData(footers, strictTypeChecking);
  ReadContext readContext = getReadSupport(configuration).init(new InitContext(
      configuration, globalMetaData.getKeyValueMetaData(), globalMetaData.getSchema()));
  return new ClientSideMetadataSplitStrategy().getSplits(
      configuration, footers, maxSplitSize, minSplitSize, readContext);
}
Example #2
Source File: TupleReadSupport.java from parquet-mr (Apache License 2.0)
/**
 * @param fileSchema the parquet schema from the file
 * @param keyValueMetaData the extra meta data from the files
 * @return the pig schema according to the file
 */
static Schema getPigSchemaFromMultipleFiles(MessageType fileSchema, Map<String, Set<String>> keyValueMetaData) {
  Set<String> pigSchemas = PigMetaData.getPigSchemas(keyValueMetaData);
  if (pigSchemas == null) {
    return pigSchemaConverter.convert(fileSchema);
  }
  Schema mergedPigSchema = null;
  for (String pigSchemaString : pigSchemas) {
    try {
      mergedPigSchema = union(mergedPigSchema, parsePigSchema(pigSchemaString));
    } catch (FrontendException e) {
      throw new ParquetDecodingException("can not merge " + pigSchemaString + " into " + mergedPigSchema, e);
    }
  }
  return mergedPigSchema;
}
Example #3
Source File: UnmaterializableRecordCounter.java from parquet-mr (Apache License 2.0)
public void incErrors(RecordMaterializationException cause) throws ParquetDecodingException {
  numErrors++;
  LOG.warn(String.format("Error while reading an input record (%s out of %s): ", numErrors, totalNumRecords), cause);
  if (numErrors > 0 && errorThreshold <= 0) { // no errors are tolerated
    throw new ParquetDecodingException("Error while decoding records", cause);
  }
  double errRate = numErrors / (double) totalNumRecords;
  if (errRate > errorThreshold) {
    String message = String.format("Decoding error rate of at least %s/%s crosses configured threshold of %s",
        numErrors, totalNumRecords, errorThreshold);
    LOG.error(message);
    throw new ParquetDecodingException(message, cause);
  }
}
Example #4
Source File: ColumnReaderBase.java from parquet-mr (Apache License 2.0)
private void readPageV1(DataPageV1 page) {
  ValuesReader rlReader = page.getRlEncoding().getValuesReader(path, REPETITION_LEVEL);
  ValuesReader dlReader = page.getDlEncoding().getValuesReader(path, DEFINITION_LEVEL);
  this.repetitionLevelColumn = new ValuesReaderIntIterator(rlReader);
  this.definitionLevelColumn = new ValuesReaderIntIterator(dlReader);
  int valueCount = page.getValueCount();
  try {
    BytesInput bytes = page.getBytes();
    LOG.debug("page size {} bytes and {} values", bytes.size(), valueCount);
    LOG.debug("reading repetition levels at 0");
    ByteBufferInputStream in = bytes.toInputStream();
    rlReader.initFromPage(valueCount, in);
    LOG.debug("reading definition levels at {}", in.position());
    dlReader.initFromPage(valueCount, in);
    LOG.debug("reading data at {}", in.position());
    initDataReader(page.getValueEncoding(), in, valueCount);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read page " + page + " in col " + path, e);
  }
  newPageInitialized(page);
}
Example #5
Source File: ColumnReaderBase.java from parquet-mr (Apache License 2.0)
/**
 * creates a reader for triplets
 * @param path the descriptor for the corresponding column
 * @param pageReader the underlying store to read from
 * @param converter a converter that materializes the values in this column in the current record
 * @param writerVersion writer version string from the Parquet file being read
 */
ColumnReaderBase(ColumnDescriptor path, PageReader pageReader, PrimitiveConverter converter,
    ParsedVersion writerVersion) {
  this.path = Objects.requireNonNull(path, "path cannot be null");
  this.pageReader = Objects.requireNonNull(pageReader, "pageReader cannot be null");
  this.converter = Objects.requireNonNull(converter, "converter cannot be null");
  this.writerVersion = writerVersion;
  this.maxDefinitionLevel = path.getMaxDefinitionLevel();
  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      this.dictionary = dictionaryPage.getEncoding().initDictionary(path, dictionaryPage);
      if (converter.hasDictionarySupport()) {
        converter.setDictionary(dictionary);
      }
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + path, e);
    }
  } else {
    this.dictionary = null;
  }
  this.totalValueCount = pageReader.getTotalValueCount();
  if (totalValueCount <= 0) {
    throw new ParquetDecodingException("totalValueCount '" + totalValueCount + "' <= 0");
  }
}
Example #6
Source File: ParquetInputFormat.java from parquet-mr (Apache License 2.0)
/**
 * @param rowGroupMetadata the metadata of the row group to locate
 * @return true if the mid point of the row group is in a new HDFS block, also moving the currentHDFSBlock
 *         pointer to the index of the block that contains the row group; false if the mid point is in
 *         the same HDFS block
 */
private boolean checkBelongingToANewHDFSBlock(BlockMetaData rowGroupMetadata) {
  boolean isNewHdfsBlock = false;
  long rowGroupMidPoint = rowGroupMetadata.getStartingPos() + (rowGroupMetadata.getCompressedSize() / 2);
  // if the mid point is no longer in the current HDFS block, return true
  while (rowGroupMidPoint > getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex)) {
    isNewHdfsBlock = true;
    currentMidPointHDFSBlockIndex++;
    if (currentMidPointHDFSBlockIndex >= hdfsBlocks.length) {
      throw new ParquetDecodingException("the row group is not in hdfs blocks in the file: midpoint of row groups is "
          + rowGroupMidPoint + ", the end of the hdfs block is "
          + getHDFSBlockEndingPosition(currentMidPointHDFSBlockIndex - 1));
    }
  }
  while (rowGroupMetadata.getStartingPos() > getHDFSBlockEndingPosition(currentStartHdfsBlockIndex)) {
    currentStartHdfsBlockIndex++;
    if (currentStartHdfsBlockIndex >= hdfsBlocks.length) {
      throw new ParquetDecodingException("The row group does not start in this file: row group offset is "
          + rowGroupMetadata.getStartingPos() + " but the end of hdfs blocks of file is "
          + getHDFSBlockEndingPosition(currentStartHdfsBlockIndex));
    }
  }
  return isNewHdfsBlock;
}
Example #7
Source File: PageIterator.java from iceberg (Apache License 2.0)
RuntimeException handleRuntimeException(RuntimeException exception) {
  if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, valueEncoding) &&
      exception instanceof ArrayIndexOutOfBoundsException) {
    // this is probably PARQUET-246, which may happen if reading data with
    // MR because this can't be detected without reading all footers
    throw new ParquetDecodingException("Read failure possibly due to " +
        "PARQUET-246: try setting parquet.split.files to false",
        new ParquetDecodingException(
            String.format("Can't read value in column %s at value %d out of %d in current page. " +
                "repetition level: %d, definition level: %d",
                desc, triplesRead, triplesCount, currentRL, currentDL),
            exception));
  }
  throw new ParquetDecodingException(
      String.format("Can't read value in column %s at value %d out of %d in current page. " +
          "repetition level: %d, definition level: %d",
          desc, triplesRead, triplesCount, currentRL, currentDL),
      exception);
}
Example #8
Source File: BooleanColumnReader.java from flink (Apache License 2.0)
private boolean readBoolean() {
  if (bitOffset == 0) {
    try {
      currentByte = (byte) dataInputStream.read();
    } catch (IOException e) {
      throw new ParquetDecodingException("Failed to read a byte", e);
    }
  }
  // extract the bit at the current offset, then advance, wrapping after 8 bits
  boolean v = (currentByte & (1 << bitOffset)) != 0;
  bitOffset += 1;
  if (bitOffset == 8) {
    bitOffset = 0;
  }
  return v;
}
Example #9
Source File: ThriftRecordConverter.java from parquet-mr (Apache License 2.0)
public ElementConverter(String listName, List<TProtocol> listEvents, GroupType repeatedType,
    ThriftField thriftElement) {
  this.listEvents = listEvents;
  this.elementEvents = new ArrayList<TProtocol>();
  Type elementType = repeatedType.getType(0);
  if (elementType.isRepetition(Type.Repetition.OPTIONAL)) {
    if (ignoreNullElements) {
      LOG.warn("List " + listName + " has optional elements: null elements are ignored.");
    } else {
      throw new ParquetDecodingException("Cannot read list " + listName
          + " with optional elements: set " + IGNORE_NULL_LIST_ELEMENTS + " to ignore nulls.");
    }
  }
  elementConverter = newConverter(elementEvents, elementType, thriftElement);
}
Example #10
Source File: ParquetTypeUtils.java from presto (Apache License 2.0)
@SuppressWarnings("deprecation")
public static ParquetEncoding getParquetEncoding(Encoding encoding) {
  switch (encoding) {
    case PLAIN:
      return ParquetEncoding.PLAIN;
    case RLE:
      return ParquetEncoding.RLE;
    case BIT_PACKED:
      return ParquetEncoding.BIT_PACKED;
    case PLAIN_DICTIONARY:
      return ParquetEncoding.PLAIN_DICTIONARY;
    case DELTA_BINARY_PACKED:
      return ParquetEncoding.DELTA_BINARY_PACKED;
    case DELTA_LENGTH_BYTE_ARRAY:
      return ParquetEncoding.DELTA_LENGTH_BYTE_ARRAY;
    case DELTA_BYTE_ARRAY:
      return ParquetEncoding.DELTA_BYTE_ARRAY;
    case RLE_DICTIONARY:
      return ParquetEncoding.RLE_DICTIONARY;
    default:
      throw new ParquetDecodingException("Unsupported Parquet encoding: " + encoding);
  }
}
Example #11
Source File: RunLengthBitPackingHybridDecoder.java from parquet-mr (Apache License 2.0)
public int readInt() throws IOException {
  if (currentCount == 0) {
    readNext();
  }
  --currentCount;
  int result;
  switch (mode) {
    case RLE:
      // in an RLE run every value is the same
      result = currentValue;
      break;
    case PACKED:
      // currentCount counts down the remaining values, so this walks forward through the buffer
      result = currentBuffer[currentBuffer.length - 1 - currentCount];
      break;
    default:
      throw new ParquetDecodingException("not a valid mode " + mode);
  }
  return result;
}
Example #12
Source File: ParquetRecordReader.java from parquet-mr (Apache License 2.0)
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) {
  // splitting files?
  if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) {
    // this is okay if not using DELTA_BYTE_ARRAY with the bug
    Set<Encoding> encodings = new HashSet<Encoding>();
    for (ColumnChunkMetaData column : block.getColumns()) {
      encodings.addAll(column.getEncodings());
    }
    for (Encoding encoding : encodings) {
      if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) {
        throw new ParquetDecodingException("Cannot read data due to " +
            "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false");
      }
    }
  }
}
Example #13
Source File: PrimitiveColumnReader.java from presto (Apache License 2.0)
public void setPageReader(PageReader pageReader) {
  this.pageReader = requireNonNull(pageReader, "pageReader");
  DictionaryPage dictionaryPage = pageReader.readDictionaryPage();
  if (dictionaryPage != null) {
    try {
      dictionary = dictionaryPage.getEncoding().initDictionary(columnDescriptor, dictionaryPage);
    } catch (IOException e) {
      throw new ParquetDecodingException("could not decode the dictionary for " + columnDescriptor, e);
    }
  } else {
    dictionary = null;
  }
  checkArgument(pageReader.getTotalValueCount() > 0, "page is empty");
  totalValueCount = pageReader.getTotalValueCount();
}
Example #14
Source File: PrimitiveColumnReader.java from presto (Apache License 2.0)
private ValuesReader readPageV1(DataPageV1 page) {
  ValuesReader rlReader = page.getRepetitionLevelEncoding().getValuesReader(columnDescriptor, REPETITION_LEVEL);
  ValuesReader dlReader = page.getDefinitionLevelEncoding().getValuesReader(columnDescriptor, DEFINITION_LEVEL);
  repetitionReader = new LevelValuesReader(rlReader);
  definitionReader = new LevelValuesReader(dlReader);
  try {
    ByteBufferInputStream in = toInputStream(page.getSlice());
    rlReader.initFromPage(page.getValueCount(), in);
    dlReader.initFromPage(page.getValueCount(), in);
    return initDataReader(page.getValueEncoding(), page.getValueCount(), in);
  } catch (IOException e) {
    throw new ParquetDecodingException("Error reading parquet page " + page + " in column " + columnDescriptor, e);
  }
}
Example #15
Source File: PrimitiveColumnReader.java from presto (Apache License 2.0)
private ValuesReader initDataReader(ParquetEncoding dataEncoding, int valueCount, ByteBufferInputStream in) {
  ValuesReader valuesReader;
  if (dataEncoding.usesDictionary()) {
    if (dictionary == null) {
      throw new ParquetDecodingException("Dictionary is missing for Page");
    }
    valuesReader = dataEncoding.getDictionaryBasedValuesReader(columnDescriptor, VALUES, dictionary);
  } else {
    valuesReader = dataEncoding.getValuesReader(columnDescriptor, VALUES);
  }
  try {
    valuesReader.initFromPage(valueCount, in);
    return valuesReader;
  } catch (IOException e) {
    throw new ParquetDecodingException("Error reading parquet page in column " + columnDescriptor, e);
  }
}
Example #16
Source File: ThriftRecordConverter.java from parquet-mr (Apache License 2.0)
@Override
public void addBinary(final Binary value) {
  final Integer id = enumLookup.get(value);
  if (id == null) {
    throw new ParquetDecodingException("Unrecognized enum value: " + value.toStringUsingUTF8()
        + " known values: " + enumLookup + " in " + this.field);
  }
  events.add(new ParquetProtocol("readI32() enum") {
    @Override
    public int readI32() throws TException {
      return id;
    }
  });
}
Example #17
Source File: DeltaBinaryPackingValuesReader.java from parquet-mr (Apache License 2.0)
private void loadNewBlockToBuffer() throws IOException {
  try {
    minDeltaInCurrentBlock = BytesUtils.readZigZagVarLong(in);
  } catch (IOException e) {
    throw new ParquetDecodingException("can not read min delta in current block", e);
  }
  readBitWidthsForMiniBlocks();

  // mini block is atomic for reading, we read a mini block when there are more values left
  int i;
  for (i = 0; i < config.miniBlockNumInABlock && valuesBuffered < totalValueCount; i++) {
    BytePackerForLong packer = Packer.LITTLE_ENDIAN.newBytePackerForLong(bitWidths[i]);
    unpackMiniBlock(packer);
  }

  // calculate values from deltas unpacked for current block
  int valueUnpacked = i * config.miniBlockSizeInValues;
  for (int j = valuesBuffered - valueUnpacked; j < valuesBuffered; j++) {
    int index = j;
    valuesBuffer[index] += minDeltaInCurrentBlock + valuesBuffer[index - 1];
  }
}
Example #18
Source File: ColumnChunkPageReadStore.java from parquet-mr (Apache License 2.0)
@Override
public DictionaryPage readDictionaryPage() {
  if (compressedDictionaryPage == null) {
    return null;
  }
  try {
    DictionaryPage decompressedPage = new DictionaryPage(
        decompressor.decompress(compressedDictionaryPage.getBytes(), compressedDictionaryPage.getUncompressedSize()),
        compressedDictionaryPage.getDictionarySize(),
        compressedDictionaryPage.getEncoding());
    if (compressedDictionaryPage.getCrc().isPresent()) {
      decompressedPage.setCrc(compressedDictionaryPage.getCrc().getAsInt());
    }
    return decompressedPage;
  } catch (IOException e) {
    throw new ParquetDecodingException("Could not decompress dictionary page", e);
  }
}
Example #19
Source File: ProtoMessageConverter.java from parquet-mr (Apache License 2.0)
public ListConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
  LogicalTypeAnnotation logicalTypeAnnotation = parquetType.getLogicalTypeAnnotation();
  if (!(logicalTypeAnnotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation) || parquetType.isPrimitive()) {
    throw new ParquetDecodingException("Expected LIST wrapper. Found: " + logicalTypeAnnotation + " instead.");
  }

  GroupType rootWrapperType = parquetType.asGroupType();
  if (!rootWrapperType.containsField("list") || rootWrapperType.getType("list").isPrimitive()) {
    throw new ParquetDecodingException("Expected repeated 'list' group inside LIST wrapper but got: " + rootWrapperType);
  }

  GroupType listType = rootWrapperType.getType("list").asGroupType();
  if (!listType.containsField("element")) {
    throw new ParquetDecodingException("Expected 'element' inside repeated list group but got: " + listType);
  }

  Type elementType = listType.getType("element");
  converter = newMessageConverter(parentBuilder, fieldDescriptor, elementType);
}
Example #20
Source File: TupleReadSupport.java from parquet-mr (Apache License 2.0)
@Override
public RecordMaterializer<Tuple> prepareForRead(
    Configuration configuration,
    Map<String, String> keyValueMetaData,
    MessageType fileSchema,
    ReadContext readContext) {
  MessageType requestedSchema = readContext.getRequestedSchema();
  Schema requestedPigSchema = getPigSchema(configuration);
  if (requestedPigSchema == null) {
    throw new ParquetDecodingException("Missing Pig schema: ParquetLoader sets the schema in the job conf");
  }
  boolean elephantBirdCompatible = configuration.getBoolean(PARQUET_PIG_ELEPHANT_BIRD_COMPATIBLE, false);
  boolean columnIndexAccess = configuration.getBoolean(PARQUET_COLUMN_INDEX_ACCESS, false);
  if (elephantBirdCompatible) {
    LOG.info("Numbers will default to 0 instead of NULL; Boolean will be converted to Int");
  }
  return new TupleRecordMaterializer(requestedSchema, requestedPigSchema, elephantBirdCompatible, columnIndexAccess);
}
Example #21
Source File: ByteStreamSplitValuesReader.java from parquet-mr (Apache License 2.0)
@Override
public void skip(int n) {
  if (n < 0 || indexInStream + n > valuesCount) {
    String errorMessage = String.format(
        "Cannot skip this many elements. Current index: %d. Skip %d. Total number of elements: %d",
        indexInStream, n, valuesCount);
    throw new ParquetDecodingException(errorMessage);
  }
  indexInStream += n;
}
Example #22
Source File: PlainValuesReader.java from parquet-mr (Apache License 2.0)
@Override
public void skip(int n) {
  try {
    in.skipBytes(n * 8);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not skip " + n + " longs", e);
  }
}
Example #23
Source File: PlainValuesReader.java from parquet-mr (Apache License 2.0)
@Override
public int readInteger() {
  try {
    return in.readInt();
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read int", e);
  }
}
Example #24
Source File: AvroConverters.java from parquet-mr (Apache License 2.0)
public FieldStringableConverter(ParentValueContainer parent, Class<?> stringableClass) {
  super(parent);
  stringableName = stringableClass.getName();
  try {
    this.ctor = stringableClass.getConstructor(String.class);
  } catch (NoSuchMethodException e) {
    throw new ParquetDecodingException("Unable to get String constructor for " + stringableName, e);
  }
}
Example #25
Source File: PlainValuesReader.java from parquet-mr (Apache License 2.0)
@Override
public void skip(int n) {
  try {
    skipBytesFully(n * 8);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not skip " + n + " double values", e);
  }
}
Example #26
Source File: RunLengthBitPackingHybridValuesReader.java from parquet-mr (Apache License 2.0)
@Override
public int readInteger() {
  try {
    return decoder.readInt();
  } catch (IOException e) {
    throw new ParquetDecodingException(e);
  }
}
Example #27
Source File: BitPackingValuesReader.java from parquet-mr (Apache License 2.0)
/**
 * {@inheritDoc}
 * @see org.apache.parquet.column.values.ValuesReader#readInteger()
 */
@Override
public int readInteger() {
  try {
    return bitPackingReader.read();
  } catch (IOException e) {
    throw new ParquetDecodingException(e);
  }
}
Example #28
Source File: ParquetInputFormat.java from parquet-mr (Apache License 2.0)
private static void checkSorted(List<BlockMetaData> rowGroupBlocks) {
  long previousOffset = 0L;
  for (BlockMetaData rowGroup : rowGroupBlocks) {
    long currentOffset = rowGroup.getStartingPos();
    if (currentOffset < previousOffset) {
      throw new ParquetDecodingException("row groups are not sorted: previous row group starts at "
          + previousOffset + ", current row group starts at " + currentOffset);
    }
    // remember this offset so the next row group is compared against it
    previousOffset = currentOffset;
  }
}
Example #29
Source File: PlainValuesReader.java from parquet-mr (Apache License 2.0)
@Override
public double readDouble() {
  try {
    return in.readDouble();
  } catch (IOException e) {
    throw new ParquetDecodingException("could not read double", e);
  }
}
Example #30
Source File: PlainValuesReader.java from parquet-mr (Apache License 2.0)
@Override
public void skip(int n) {
  try {
    skipBytesFully(n * 4);
  } catch (IOException e) {
    throw new ParquetDecodingException("could not skip " + n + " floats", e);
  }
}
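Finally, because ParquetDecodingException is unchecked, application code that wants to degrade gracefully must catch it explicitly around its read loop. A minimal sketch of the catch side; the RecordReader interface, ReadLoop class, and countRecords method here are hypothetical stand-ins, not part of the projects above:

import org.apache.parquet.io.ParquetDecodingException;

class ReadLoop {
  // Hypothetical stand-in for any reader whose read() may throw
  // ParquetDecodingException and returns null at end of input.
  interface RecordReader<T> {
    T read();
  }

  static <T> long countRecords(RecordReader<T> reader) {
    long count = 0;
    try {
      T value;
      while ((value = reader.read()) != null) {
        count++;
      }
    } catch (ParquetDecodingException e) {
      // the message carries the column/page context attached by readers
      // like the ones shown above; rethrow with progress information
      throw new IllegalStateException("decoding failed after " + count + " records", e);
    }
    return count;
  }
}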