org.apache.parquet.CorruptDeltaByteArrays Java Examples
The following examples show how to use
org.apache.parquet.CorruptDeltaByteArrays.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: PageIterator.java From iceberg with Apache License 2.0 | 6 votes |
RuntimeException handleRuntimeException(RuntimeException exception) { if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, valueEncoding) && exception instanceof ArrayIndexOutOfBoundsException) { // this is probably PARQUET-246, which may happen if reading data with // MR because this can't be detected without reading all footers throw new ParquetDecodingException("Read failure possibly due to " + "PARQUET-246: try setting parquet.split.files to false", new ParquetDecodingException( String.format("Can't read value in column %s at value %d out of %d in current page. " + "repetition level: %d, definition level: %d", desc, triplesRead, triplesCount, currentRL, currentDL), exception)); } throw new ParquetDecodingException( String.format("Can't read value in column %s at value %d out of %d in current page. " + "repetition level: %d, definition level: %d", desc, triplesRead, triplesCount, currentRL, currentDL), exception); }
Example #2
Source File: PageIterator.java From iceberg with Apache License 2.0 | 6 votes |
RuntimeException handleRuntimeException(RuntimeException e) { if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, valueEncoding) && e instanceof ArrayIndexOutOfBoundsException) { // this is probably PARQUET-246, which may happen if reading data with // MR because this can't be detected without reading all footers throw new ParquetDecodingException("Read failure possibly due to " + "PARQUET-246: try setting parquet.split.files to false", new ParquetDecodingException( format("Can't read value in column %s at value %d out of %d in current page. " + "repetition level: %d, definition level: %d", desc, triplesRead, triplesCount, currentRL, currentDL), e)); } throw new ParquetDecodingException( format("Can't read value in column %s at value %d out of %d in current page. " + "repetition level: %d, definition level: %d", desc, triplesRead, triplesCount, currentRL, currentDL), e); }
Example #3
Source File: TestCorruptDeltaByteArrays.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Checks {@code CorruptDeltaByteArrays.requiresSequentialReads} across created-by strings,
 * null versions of each overload type, and a range of encodings.
 */
@Test
public void testCorruptDeltaByteArrayVerisons() {
  final String release160 = "parquet-mr version 1.6.0 (build abcd)";
  final String snapshot180 = "parquet-mr version 1.8.0-SNAPSHOT (build abcd)";
  final String release180 = "parquet-mr version 1.8.0 (build abcd)";

  // DELTA_BYTE_ARRAY with these versions (or an unknown version) is expected to need
  // sequential reads. The casts on null select the overload under test.
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(release160, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(
      (String) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(
      (ParsedVersion) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(
      (SemanticVersion) null, Encoding.DELTA_BYTE_ARRAY));
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(
      snapshot180, Encoding.DELTA_BYTE_ARRAY));

  // Other encodings are expected not to need sequential reads, whatever the version.
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      release160, Encoding.DELTA_BINARY_PACKED));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      (String) null, Encoding.DELTA_LENGTH_BYTE_ARRAY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      (ParsedVersion) null, Encoding.PLAIN));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      (SemanticVersion) null, Encoding.RLE));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      snapshot180, Encoding.RLE_DICTIONARY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      snapshot180, Encoding.PLAIN_DICTIONARY));
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      snapshot180, Encoding.BIT_PACKED));

  // DELTA_BYTE_ARRAY written by the 1.8.0 release is expected not to need sequential reads.
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(
      release180, Encoding.DELTA_BYTE_ARRAY));
}
Example #4
Source File: ParquetRecordReader.java From parquet-mr with Apache License 2.0 | 6 votes |
private void checkDeltaByteArrayProblem(FileMetaData meta, Configuration conf, BlockMetaData block) { // splitting files? if (conf.getBoolean(ParquetInputFormat.SPLIT_FILES, true)) { // this is okay if not using DELTA_BYTE_ARRAY with the bug Set<Encoding> encodings = new HashSet<Encoding>(); for (ColumnChunkMetaData column : block.getColumns()) { encodings.addAll(column.getEncodings()); } for (Encoding encoding : encodings) { if (CorruptDeltaByteArrays.requiresSequentialReads(meta.getCreatedBy(), encoding)) { throw new ParquetDecodingException("Cannot read data due to " + "PARQUET-246: to read safely, set " + SPLIT_FILES + " to false"); } } } }
Example #5
Source File: VectorizedPageIterator.java From iceberg with Apache License 2.0 | 5 votes |
@Override protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { ValuesReader previousReader = plainValuesReader; if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException( "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding); } try { dictionaryEncodedValuesReader = new VectorizedDictionaryEncodedParquetValuesReader(desc.getMaxDefinitionLevel(), setArrowValidityVector); dictionaryEncodedValuesReader.initFromPage(valueCount, in); if (ParquetUtil.isIntType(desc.getPrimitiveType()) || !allPagesDictEncoded) { dictionaryDecodeMode = DictionaryDecodeMode.EAGER; } else { dictionaryDecodeMode = DictionaryDecodeMode.LAZY; } } catch (IOException e) { throw new ParquetDecodingException("could not read page in col " + desc, e); } } else { plainValuesReader = new ValuesAsBytesReader(); plainValuesReader.initFromPage(valueCount, in); dictionaryDecodeMode = DictionaryDecodeMode.NONE; } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && previousReader != null && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) plainValuesReader).setPreviousReader(previousReader); } }
Example #6
Source File: PageIterator.java From iceberg with Apache License 2.0 | 5 votes |
@Override protected void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { ValuesReader previousReader = values; this.valueEncoding = dataEncoding; // TODO: May want to change this so that this class is not dictionary-aware. // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException( "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding); } this.values = dataEncoding.getDictionaryBasedValuesReader(desc, ValuesType.VALUES, dictionary); } else { this.values = dataEncoding.getValuesReader(desc, ValuesType.VALUES); } // if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { // bindToDictionary(dictionary); // } else { // bind(path.getType()); // } try { values.initFromPage(valueCount, in); } catch (IOException e) { throw new ParquetDecodingException("could not read page in col " + desc, e); } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) values).setPreviousReader(previousReader); } }
Example #7
Source File: PageIterator.java From iceberg with Apache License 2.0 | 5 votes |
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { ValuesReader previousReader = values; this.valueEncoding = dataEncoding; // TODO: May want to change this so that this class is not dictionary-aware. // For dictionary columns, this class could rely on wrappers to correctly handle dictionaries // This isn't currently possible because RLE must be read by getDictionaryBasedValuesReader if (dataEncoding.usesDictionary()) { if (dict == null) { throw new ParquetDecodingException( "could not read page in col " + desc + " as the dictionary was missing for encoding " + dataEncoding); } this.values = dataEncoding.getDictionaryBasedValuesReader(desc, VALUES, dict); } else { this.values = dataEncoding.getValuesReader(desc, VALUES); } // if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { // bindToDictionary(dictionary); // } else { // bind(path.getType()); // } try { values.initFromPage(valueCount, in); } catch (IOException e) { throw new ParquetDecodingException("could not read page in col " + desc, e); } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && previousReader != null && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) values).setPreviousReader(previousReader); } }
Example #8
Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0 | 5 votes |
/** * Reads the value into the binding. */ public void readValue() { try { if (!valueRead) { binding.read(); valueRead = true; } } catch (RuntimeException e) { if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, currentEncoding) && e instanceof ArrayIndexOutOfBoundsException) { // this is probably PARQUET-246, which may happen if reading data with // MR because this can't be detected without reading all footers throw new ParquetDecodingException("Read failure possibly due to " + "PARQUET-246: try setting parquet.split.files to false", new ParquetDecodingException( format("Can't read value in column %s at value %d out of %d, " + "%d out of %d in currentPage. repetition level: " + "%d, definition level: %d", path, readValues, totalValueCount, readValues - (endOfPageValueCount - pageValueCount), pageValueCount, repetitionLevel, definitionLevel), e)); } throw new ParquetDecodingException( format("Can't read value in column %s at value %d out of %d, " + "%d out of %d in currentPage. repetition level: " + "%d, definition level: %d", path, readValues, totalValueCount, readValues - (endOfPageValueCount - pageValueCount), pageValueCount, repetitionLevel, definitionLevel), e); } }
Example #9
Source File: ColumnReaderBase.java From parquet-mr with Apache License 2.0 | 5 votes |
private void initDataReader(Encoding dataEncoding, ByteBufferInputStream in, int valueCount) { ValuesReader previousReader = this.dataColumn; this.currentEncoding = dataEncoding; this.pageValueCount = valueCount; this.endOfPageValueCount = readValues + pageValueCount; if (dataEncoding.usesDictionary()) { if (dictionary == null) { throw new ParquetDecodingException( "could not read page in col " + path + " as the dictionary was missing for encoding " + dataEncoding); } this.dataColumn = dataEncoding.getDictionaryBasedValuesReader(path, VALUES, dictionary); } else { this.dataColumn = dataEncoding.getValuesReader(path, VALUES); } if (dataEncoding.usesDictionary() && converter.hasDictionarySupport()) { bindToDictionary(dictionary); } else { bind(path.getType()); } try { dataColumn.initFromPage(pageValueCount, in); } catch (IOException e) { throw new ParquetDecodingException("could not read page in col " + path, e); } if (CorruptDeltaByteArrays.requiresSequentialReads(writerVersion, dataEncoding) && previousReader != null && previousReader instanceof RequiresPreviousReader) { // previous reader can only be set if reading sequentially ((RequiresPreviousReader) dataColumn).setPreviousReader(previousReader); } }
Example #10
Source File: TestCorruptDeltaByteArrays.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Checks {@code requiresSequentialReads} for DELTA_BYTE_ARRAY against parsed versions from a
 * different application, a parquet-mr snapshot, and the parquet-mr 1.8.0 release.
 */
@Test
public void testEncodingRequiresSequentailRead() {
  final String build = "abcd";
  final Encoding encoding = Encoding.DELTA_BYTE_ARRAY;

  // A writer other than parquet-mr is expected not to need sequential reads.
  ParsedVersion otherWriter = new ParsedVersion("impala", "1.2.0", build);
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(otherWriter, encoding));

  // parquet-mr 1.8.0-SNAPSHOT is expected to need sequential reads.
  ParsedVersion affected = new ParsedVersion("parquet-mr", "1.8.0-SNAPSHOT", build);
  assertTrue(CorruptDeltaByteArrays.requiresSequentialReads(affected, encoding));

  // The parquet-mr 1.8.0 release is expected not to need them.
  ParsedVersion released = new ParsedVersion("parquet-mr", "1.8.0", build);
  assertFalse(CorruptDeltaByteArrays.requiresSequentialReads(released, encoding));
}