org.apache.parquet.format.ConvertedType Java Examples
The following examples show how to use
org.apache.parquet.format.ConvertedType.
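ConvertedType is the Thrift-generated enum that Parquet stores on each SchemaElement in a file footer to record a column's legacy ("original") type annotation. As a warm-up before the project examples, here is a minimal, self-contained sketch (the class name ConvertedTypeDemo is ours, not from any of the projects below) that builds a SchemaElement by hand and inspects its converted type, using only Thrift accessors that also appear in the examples that follow.

import org.apache.parquet.format.ConvertedType;
import org.apache.parquet.format.SchemaElement;

public class ConvertedTypeDemo {

  // Describe the converted type recorded on a footer SchemaElement.
  static String describe(SchemaElement element) {
    if (!element.isSetConverted_type()) {
      return element.getName() + ": no converted type";
    }
    switch (element.getConverted_type()) {
      case UTF8:
        return element.getName() + ": UTF-8 string";
      case DECIMAL:
        return element.getName() + ": decimal(" + element.getPrecision() + ", " + element.getScale() + ")";
      default:
        return element.getName() + ": " + element.getConverted_type();
    }
  }

  public static void main(String[] args) {
    SchemaElement element = new SchemaElement("price");
    element.setConverted_type(ConvertedType.DECIMAL);
    element.setPrecision(9);
    element.setScale(2);
    System.out.println(describe(element)); // prints "price: decimal(9, 2)"
  }
}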
Example #1
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Test
public void testIncompatibleLogicalAndConvertedTypes() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  MessageType schema = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(OriginalType.DECIMAL).precision(9).scale(2)
      .named("aBinary")
      .named("Message");
  MessageType expected = Types.buildMessage()
      .required(PrimitiveTypeName.BINARY)
      .as(LogicalTypeAnnotation.jsonType())
      .named("aBinary")
      .named("Message");

  List<SchemaElement> parquetSchema = parquetMetadataConverter.toParquetSchema(schema);
  // Set converted type field to a different type to verify that in case of mismatch, it overrides logical type
  parquetSchema.get(1).setConverted_type(ConvertedType.JSON);
  MessageType actual = parquetMetadataConverter.fromParquetSchema(parquetSchema, null);

  assertEquals(expected, actual);
}
Example #2
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntLogicalTypeAnnotation intLogicalType) {
  boolean signed = intLogicalType.isSigned();
  switch (intLogicalType.getBitWidth()) {
    case 8:
      return of(signed ? ConvertedType.INT_8 : ConvertedType.UINT_8);
    case 16:
      return of(signed ? ConvertedType.INT_16 : ConvertedType.UINT_16);
    case 32:
      return of(signed ? ConvertedType.INT_32 : ConvertedType.UINT_32);
    case 64:
      return of(signed ? ConvertedType.INT_64 : ConvertedType.UINT_64);
    default:
      throw new RuntimeException("Unknown original type " + intLogicalType.toOriginalType());
  }
}
Example #3
Source File: ColumnReaderFactory.java From Bats with Apache License 2.0
static VarLengthValuesColumn<?> getReader(ParquetRecordReader parentReader, ColumnDescriptor descriptor,
                                          ColumnChunkMetaData columnChunkMetaData, boolean fixedLength,
                                          ValueVector v, SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof VarDecimalVector) {
            return new VarLengthColumnReaders.VarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarDecimalVector) v, schemaElement);
          }
          // intentional fall-through: a decimal column without a VarDecimal vector is read as binary
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarCharVector) v, schemaElement);
        case DECIMAL:
          if (v instanceof NullableVarDecimalVector) {
            return new VarLengthColumnReaders.NullableVarDecimalColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) v, schemaElement);
          }
          // intentional fall-through: a decimal column without a VarDecimal vector is read as binary
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, descriptor, columnChunkMetaData, fixedLength, (NullableVarBinaryVector) v, schemaElement);
      }
  }
}
Example #4
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.TimeLogicalTypeAnnotation timeLogicalType) {
  switch (timeLogicalType.getUnit()) {
    case MILLIS:
      return of(ConvertedType.TIME_MILLIS);
    case MICROS:
      return of(ConvertedType.TIME_MICROS);
    case NANOS:
      return empty();
    default:
      throw new RuntimeException("Unknown converted type for " + timeLogicalType.toOriginalType());
  }
}
Example #5
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.TimestampLogicalTypeAnnotation timestampLogicalType) {
  switch (timestampLogicalType.getUnit()) {
    case MICROS:
      return of(ConvertedType.TIMESTAMP_MICROS);
    case MILLIS:
      return of(ConvertedType.TIMESTAMP_MILLIS);
    case NANOS:
      return empty();
    default:
      throw new RuntimeException("Unknown converted type for " + timestampLogicalType.toOriginalType());
  }
}
Example #6
Source File: ColumnReaderFactory.java From dremio-oss with Apache License 2.0
static VarLengthValuesColumn<?> getReader(DeprecatedParquetVectorizedReader parentReader, int allocateSize,
                                          ColumnDescriptor descriptor, ColumnChunkMetaData columnChunkMetaData,
                                          boolean fixedLength, ValueVector v, SchemaElement schemaElement)
    throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (descriptor.getMaxDefinitionLevel()) {
    case 0:
      if (convertedType == null) {
        return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.VarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new VarLengthColumnReaders.Decimal28Column(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.VarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
    default:
      if (convertedType == null) {
        return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
      switch (convertedType) {
        case UTF8:
          return new VarLengthColumnReaders.NullableVarCharColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarCharVector) v, schemaElement);
        case DECIMAL:
          return new NullableDecimalColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (DecimalVector) v, schemaElement);
        default:
          return new VarLengthColumnReaders.NullableVarBinaryColumn(parentReader, allocateSize, descriptor, columnChunkMetaData, fixedLength, (VarBinaryVector) v, schemaElement);
      }
  }
}
Example #7
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Test
public void testSchemaConverterDecimal() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();
  List<SchemaElement> schemaElements = parquetMetadataConverter.toParquetSchema(
      Types.buildMessage()
          .required(PrimitiveTypeName.BINARY)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aBinaryDecimal")
          .optional(PrimitiveTypeName.FIXED_LEN_BYTE_ARRAY).length(4)
          .as(OriginalType.DECIMAL).precision(9).scale(2)
          .named("aFixedDecimal")
          .named("Message")
  );
  List<SchemaElement> expected = Lists.newArrayList(
      new SchemaElement("Message").setNum_children(2),
      new SchemaElement("aBinaryDecimal")
          .setRepetition_type(FieldRepetitionType.REQUIRED)
          .setType(Type.BYTE_ARRAY)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2),
      new SchemaElement("aFixedDecimal")
          .setRepetition_type(FieldRepetitionType.OPTIONAL)
          .setType(Type.FIXED_LEN_BYTE_ARRAY)
          .setType_length(4)
          .setConverted_type(ConvertedType.DECIMAL)
          .setLogicalType(LogicalType.DECIMAL(new DecimalType(2, 9)))
          .setPrecision(9).setScale(2)
  );
  Assert.assertEquals(expected, schemaElements);
}
Example #8
Source File: ParquetToDrillTypeConverter.java From Bats with Apache License 2.0
public static TypeProtos.MajorType toMajorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                               TypeProtos.DataMode mode, SchemaElement schemaElement,
                                               OptionManager options) {
  ConvertedType convertedType = schemaElement.getConverted_type();
  MinorType minorType = getMinorType(primitiveTypeName, length, convertedType, options);
  TypeProtos.MajorType.Builder typeBuilder = TypeProtos.MajorType.newBuilder().setMinorType(minorType).setMode(mode);

  if (Types.isDecimalType(minorType)) {
    int precision = schemaElement.getPrecision();
    int scale = schemaElement.getScale();
    typeBuilder.setPrecision(precision).setScale(scale);
  }
  return typeBuilder.build();
}
Example #9
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
private void buildChildren(Types.GroupBuilder builder, Iterator<SchemaElement> schema, int childrenCount,
                           List<ColumnOrder> columnOrders, int columnCount) {
  for (int i = 0; i < childrenCount; i++) {
    SchemaElement schemaElement = schema.next();

    // Create Parquet Type.
    Types.Builder childBuilder;
    if (schemaElement.type != null) {
      Types.PrimitiveBuilder primitiveBuilder = builder.primitive(
          getPrimitive(schemaElement.type),
          fromParquetRepetition(schemaElement.repetition_type));
      if (schemaElement.isSetType_length()) {
        primitiveBuilder.length(schemaElement.type_length);
      }
      if (schemaElement.isSetPrecision()) {
        primitiveBuilder.precision(schemaElement.precision);
      }
      if (schemaElement.isSetScale()) {
        primitiveBuilder.scale(schemaElement.scale);
      }
      if (columnOrders != null) {
        org.apache.parquet.schema.ColumnOrder columnOrder = fromParquetColumnOrder(columnOrders.get(columnCount));
        // As per parquet format 2.4.0 no UNDEFINED order is supported. So, set undefined column order for the types
        // where ordering is not supported.
        if (columnOrder.getColumnOrderName() == ColumnOrderName.TYPE_DEFINED_ORDER
            && (schemaElement.type == Type.INT96 || schemaElement.converted_type == ConvertedType.INTERVAL)) {
          columnOrder = org.apache.parquet.schema.ColumnOrder.undefined();
        }
        primitiveBuilder.columnOrder(columnOrder);
      }
      childBuilder = primitiveBuilder;
    } else {
      childBuilder = builder.group(fromParquetRepetition(schemaElement.repetition_type));
      buildChildren((Types.GroupBuilder) childBuilder, schema, schemaElement.num_children, columnOrders, columnCount);
    }

    if (schemaElement.isSetLogicalType()) {
      childBuilder.as(getLogicalTypeAnnotation(schemaElement.logicalType));
    }
    if (schemaElement.isSetConverted_type()) {
      OriginalType originalType = getLogicalTypeAnnotation(schemaElement.converted_type, schemaElement).toOriginalType();
      OriginalType newOriginalType = (schemaElement.isSetLogicalType() && getLogicalTypeAnnotation(schemaElement.logicalType) != null)
          ? getLogicalTypeAnnotation(schemaElement.logicalType).toOriginalType() : null;
      if (!originalType.equals(newOriginalType)) {
        if (newOriginalType != null) {
          LOG.warn("Converted type and logical type metadata mismatch (convertedType: {}, logical type: {}). Using value in converted type.",
              schemaElement.converted_type, schemaElement.logicalType);
        }
        childBuilder.as(originalType);
      }
    }
    if (schemaElement.isSetField_id()) {
      childBuilder.id(schemaElement.field_id);
    }

    childBuilder.named(schemaElement.name);
    ++columnCount;
  }
}
Example #10
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
LogicalTypeAnnotation getLogicalTypeAnnotation(ConvertedType type, SchemaElement schemaElement) {
  switch (type) {
    case UTF8:
      return LogicalTypeAnnotation.stringType();
    case MAP:
      return LogicalTypeAnnotation.mapType();
    case MAP_KEY_VALUE:
      return LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance();
    case LIST:
      return LogicalTypeAnnotation.listType();
    case ENUM:
      return LogicalTypeAnnotation.enumType();
    case DECIMAL:
      int scale = (schemaElement == null ? 0 : schemaElement.scale);
      int precision = (schemaElement == null ? 0 : schemaElement.precision);
      return LogicalTypeAnnotation.decimalType(scale, precision);
    case DATE:
      return LogicalTypeAnnotation.dateType();
    case TIME_MILLIS:
      return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
    case TIME_MICROS:
      return LogicalTypeAnnotation.timeType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
    case TIMESTAMP_MILLIS:
      return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MILLIS);
    case TIMESTAMP_MICROS:
      return LogicalTypeAnnotation.timestampType(true, LogicalTypeAnnotation.TimeUnit.MICROS);
    case INTERVAL:
      return LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance();
    case INT_8:
      return LogicalTypeAnnotation.intType(8, true);
    case INT_16:
      return LogicalTypeAnnotation.intType(16, true);
    case INT_32:
      return LogicalTypeAnnotation.intType(32, true);
    case INT_64:
      return LogicalTypeAnnotation.intType(64, true);
    case UINT_8:
      return LogicalTypeAnnotation.intType(8, false);
    case UINT_16:
      return LogicalTypeAnnotation.intType(16, false);
    case UINT_32:
      return LogicalTypeAnnotation.intType(32, false);
    case UINT_64:
      return LogicalTypeAnnotation.intType(64, false);
    case JSON:
      return LogicalTypeAnnotation.jsonType();
    case BSON:
      return LogicalTypeAnnotation.bsonType();
    default:
      throw new RuntimeException("Can't convert converted type to logical type, unknown converted type " + type);
  }
}
Example #11
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapKeyValueTypeAnnotation mapKeyValueLogicalType) {
  return of(ConvertedType.MAP_KEY_VALUE);
}
Example #12
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation intervalLogicalType) {
  return of(ConvertedType.INTERVAL);
}
Example #13
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.BsonLogicalTypeAnnotation bsonLogicalType) {
  return of(ConvertedType.BSON);
}
Example #14
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.JsonLogicalTypeAnnotation jsonLogicalType) {
  return of(ConvertedType.JSON);
}
Example #15
Source File: ParquetReaderUtility.java From Bats with Apache License 2.0
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an external tool or an older version of Drill
 * ({@link org.apache.drill.exec.store.parquet.ParquetRecordWriter#WRITER_VERSION_PROPERTY} <
 * {@link org.apache.drill.exec.store.parquet.ParquetReaderUtility#DRILL_WRITER_VERSION_STD_DATE_FORMAT})
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer
 * @param columns list of columns schema path
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                                         List<SchemaPath> columns,
                                                                         boolean autoCorrectCorruptDates) {
  // Users can turn off date correction in cases where we are detecting corruption based on date values
  // that are unlikely to appear in common datasets. In this case report that no correction needs to happen
  // during the file read
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future it won't matter,
  // as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop:
  for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // this reader only supports flat data; this is restricted in the ParquetScanBatchCreator.
      // creating a NameSegment makes sure we are using the standard code for comparing names;
      // currently it is all case-insensitive
      if (Utilities.isStarQuery(columns)
          || getFullColumnPath(column).equalsIgnoreCase(schemaPath.getUnIndexed().toString())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(getFullColumnPath(column)).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        IntStatistics statistics = (IntStatistics) footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        return (statistics.hasNonNullValue() && statistics.compareMaxToValue(ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) > 0)
            ? DateCorruptionStatus.META_SHOWS_CORRUPTION
            : DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
Example #16
Source File: TestParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Test
public void testLogicalToConvertedTypeConversion() {
  ParquetMetadataConverter parquetMetadataConverter = new ParquetMetadataConverter();

  assertEquals(ConvertedType.UTF8, parquetMetadataConverter.convertToConvertedType(stringType()));
  assertEquals(ConvertedType.ENUM, parquetMetadataConverter.convertToConvertedType(enumType()));

  assertEquals(ConvertedType.INT_8, parquetMetadataConverter.convertToConvertedType(intType(8, true)));
  assertEquals(ConvertedType.INT_16, parquetMetadataConverter.convertToConvertedType(intType(16, true)));
  assertEquals(ConvertedType.INT_32, parquetMetadataConverter.convertToConvertedType(intType(32, true)));
  assertEquals(ConvertedType.INT_64, parquetMetadataConverter.convertToConvertedType(intType(64, true)));
  assertEquals(ConvertedType.UINT_8, parquetMetadataConverter.convertToConvertedType(intType(8, false)));
  assertEquals(ConvertedType.UINT_16, parquetMetadataConverter.convertToConvertedType(intType(16, false)));
  assertEquals(ConvertedType.UINT_32, parquetMetadataConverter.convertToConvertedType(intType(32, false)));
  assertEquals(ConvertedType.UINT_64, parquetMetadataConverter.convertToConvertedType(intType(64, false)));

  assertEquals(ConvertedType.DECIMAL, parquetMetadataConverter.convertToConvertedType(decimalType(8, 16)));

  assertEquals(ConvertedType.TIMESTAMP_MILLIS, parquetMetadataConverter.convertToConvertedType(timestampType(true, MILLIS)));
  assertEquals(ConvertedType.TIMESTAMP_MICROS, parquetMetadataConverter.convertToConvertedType(timestampType(true, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(true, NANOS)));
  assertEquals(ConvertedType.TIMESTAMP_MILLIS, parquetMetadataConverter.convertToConvertedType(timestampType(false, MILLIS)));
  assertEquals(ConvertedType.TIMESTAMP_MICROS, parquetMetadataConverter.convertToConvertedType(timestampType(false, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timestampType(false, NANOS)));

  assertEquals(ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(true, MILLIS)));
  assertEquals(ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(true, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timeType(true, NANOS)));
  assertEquals(ConvertedType.TIME_MILLIS, parquetMetadataConverter.convertToConvertedType(timeType(false, MILLIS)));
  assertEquals(ConvertedType.TIME_MICROS, parquetMetadataConverter.convertToConvertedType(timeType(false, MICROS)));
  assertNull(parquetMetadataConverter.convertToConvertedType(timeType(false, NANOS)));

  assertEquals(ConvertedType.DATE, parquetMetadataConverter.convertToConvertedType(dateType()));
  assertEquals(ConvertedType.INTERVAL, parquetMetadataConverter.convertToConvertedType(LogicalTypeAnnotation.IntervalLogicalTypeAnnotation.getInstance()));
  assertEquals(ConvertedType.JSON, parquetMetadataConverter.convertToConvertedType(jsonType()));
  assertEquals(ConvertedType.BSON, parquetMetadataConverter.convertToConvertedType(bsonType()));
  assertNull(parquetMetadataConverter.convertToConvertedType(uuidType()));

  assertEquals(ConvertedType.LIST, parquetMetadataConverter.convertToConvertedType(listType()));
  assertEquals(ConvertedType.MAP, parquetMetadataConverter.convertToConvertedType(mapType()));
  assertEquals(ConvertedType.MAP_KEY_VALUE, parquetMetadataConverter.convertToConvertedType(LogicalTypeAnnotation.MapKeyValueTypeAnnotation.getInstance()));
}
Example #17
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.DateLogicalTypeAnnotation dateLogicalType) {
  return of(ConvertedType.DATE);
}
Example #18
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.DecimalLogicalTypeAnnotation decimalLogicalType) {
  return of(ConvertedType.DECIMAL);
}
Example #19
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.EnumLogicalTypeAnnotation enumLogicalType) {
  return of(ConvertedType.ENUM);
}
Example #20
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.ListLogicalTypeAnnotation listLogicalType) {
  return of(ConvertedType.LIST);
}
Example #21
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.MapLogicalTypeAnnotation mapLogicalType) {
  return of(ConvertedType.MAP);
}
Example #22
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
@Override
public Optional<ConvertedType> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
  return of(ConvertedType.UTF8);
}
Example #23
Source File: ParquetMetadataConverter.java From parquet-mr with Apache License 2.0
ConvertedType convertToConvertedType(LogicalTypeAnnotation logicalTypeAnnotation) {
  return logicalTypeAnnotation.accept(CONVERTED_TYPE_CONVERTER_VISITOR).orElse(null);
}
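This one-liner is the dispatch point for the visit overloads shown in Examples #2, #4, #5, #11-#14, and #17-#22: accept routes the annotation to the matching visit method, and any annotation the visitor does not override falls back to the interface default of Optional.empty(), which orElse(null) turns into an absent converted type (this is why NANOS times/timestamps and UUID map to null in Example #16). Below is a minimal standalone sketch of the same pattern against parquet-mr's public LogicalTypeAnnotationVisitor API (assuming parquet-mr 1.11+); the STRING_ONLY visitor is illustrative, not the converter's actual CONVERTED_TYPE_CONVERTER_VISITOR.

import java.util.Optional;

import org.apache.parquet.format.ConvertedType;
import org.apache.parquet.schema.LogicalTypeAnnotation;

public class ConvertedTypeVisitorDemo {

  // Illustrative visitor that maps only the string annotation; every other
  // annotation inherits the interface's default visit(...), which returns empty().
  private static final LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<ConvertedType> STRING_ONLY =
      new LogicalTypeAnnotation.LogicalTypeAnnotationVisitor<ConvertedType>() {
        @Override
        public Optional<ConvertedType> visit(LogicalTypeAnnotation.StringLogicalTypeAnnotation stringLogicalType) {
          return Optional.of(ConvertedType.UTF8);
        }
      };

  public static void main(String[] args) {
    System.out.println(LogicalTypeAnnotation.stringType().accept(STRING_ONLY)); // Optional[UTF8]
    System.out.println(LogicalTypeAnnotation.jsonType().accept(STRING_ONLY));   // Optional.empty
  }
}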
Example #24
Source File: ParquetToMinorTypeConverter.java From dremio-oss with Apache License 2.0
private static TypeProtos.MinorType getMinorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                                 SchemaElement schemaElement, OptionManager options, Field arrowField,
                                                 final boolean readInt96AsTimeStamp) {
  ConvertedType convertedType = schemaElement.getConverted_type();
  switch (primitiveTypeName) {
    case BINARY:
      if (convertedType == null) {
        return TypeProtos.MinorType.VARBINARY;
      }
      switch (convertedType) {
        case UTF8:
          return TypeProtos.MinorType.VARCHAR;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return getDecimalType(schemaElement);
        default:
          return TypeProtos.MinorType.VARBINARY;
      }
    case INT64:
      if (convertedType == null) {
        return TypeProtos.MinorType.BIGINT;
      }
      switch (convertedType) {
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.DECIMAL;
        // TODO - add this back if it is decided to be added upstream, was removed from our pull request July 2014
        // case TIME_MICROS:
        //   throw new UnsupportedOperationException();
        case TIMESTAMP_MILLIS:
          return TypeProtos.MinorType.TIMESTAMP;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case INT32:
      if (convertedType == null) {
        return TypeProtos.MinorType.INT;
      }
      switch (convertedType) {
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.DECIMAL;
        case DATE:
          return TypeProtos.MinorType.DATE;
        case TIME_MILLIS:
          return TypeProtos.MinorType.TIME;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case BOOLEAN:
      return TypeProtos.MinorType.BIT;
    case FLOAT:
      return TypeProtos.MinorType.FLOAT4;
    case DOUBLE:
      return TypeProtos.MinorType.FLOAT8;
    // TODO - Both of these are not supported by the parquet library yet (7/3/13),
    // but they are declared here for when they are implemented
    case INT96:
      if (readInt96AsTimeStamp) {
        return TypeProtos.MinorType.TIMESTAMP;
      } else {
        return TypeProtos.MinorType.VARBINARY;
      }
    case FIXED_LEN_BYTE_ARRAY:
      if (convertedType == null) {
        checkArgument(length > 0, "A length greater than zero must be provided for a FixedBinary type.");
        return TypeProtos.MinorType.VARBINARY;
      } else if (convertedType == ConvertedType.DECIMAL) {
        ParquetReaderUtility.checkDecimalTypeEnabled(options);
        return getDecimalType(schemaElement);
      } else if (convertedType == ConvertedType.INTERVAL) {
        if (arrowField != null) {
          if (arrowField.getType().getTypeID() == ArrowTypeID.Interval) {
            switch (((Interval) arrowField.getType()).getUnit()) {
              case DAY_TIME:
                return TypeProtos.MinorType.INTERVALDAY;
              case YEAR_MONTH:
                return TypeProtos.MinorType.INTERVALYEAR;
            }
          }
          throw new IllegalArgumentException("incompatible type " + arrowField);
        }
        // TODO: older versions of Drill generated this
        return TypeProtos.MinorType.VARBINARY;
      }
    default:
      throw new UnsupportedOperationException("Type not supported: " + primitiveTypeName);
  }
}
Example #25
Source File: ColumnReaderFactory.java From dremio-oss with Apache License 2.0
public static NullableColumnReader<?> getNullableColumnReader(DeprecatedParquetVectorizedReader parentReader,
                                                              int allocateSize,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              boolean fixedLength,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();

  if (!columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
      if (parentReader.readInt96AsTimeStamp()) {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (TimeStampMilliVector) valueVec, schemaElement);
      } else {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (VarBinaryVector) valueVec, schemaElement);
      }
    } else {
      return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
    }
  } else {
    switch (columnDescriptor.getType()) {
      case INT32:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (IntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal9Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector) valueVec, schemaElement);
          case TIME_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeMilliVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
        }
      case INT64:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (BigIntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryDecimal18Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (DecimalVector) valueVec, schemaElement);
          case TIMESTAMP_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (TimeStampMilliVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
        }
      case INT96:
        // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
        if (parentReader.readInt96AsTimeStamp()) {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (TimeStampMilliVector) valueVec, schemaElement);
        } else {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, true, (VarBinaryVector) valueVec, schemaElement);
        }
      case FLOAT:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float4Vector) valueVec, schemaElement);
      case DOUBLE:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(parentReader, allocateSize, columnDescriptor, columnChunkMetaData, fixedLength, (Float8Vector) valueVec, schemaElement);
      default:
        throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name());
    }
  }
}
Example #26
Source File: ParquetReaderUtility.java From dremio-oss with Apache License 2.0
/**
 * Detect corrupt date values by looking at the min/max values in the metadata.
 *
 * This should only be used when a file does not have enough metadata to determine if
 * the data was written with an older version of Drill, or an external tool. Drill
 * versions 1.3 and beyond should have enough metadata to confirm that the data was written
 * by Drill.
 *
 * This method only checks the first Row Group, because Drill has only ever written
 * a single Row Group per file.
 *
 * @param footer parquet footer
 * @param columns list of column schema paths
 * @param autoCorrectCorruptDates user setting to allow enabling/disabling of auto-correction
 *                                of corrupt dates. There are some rare cases (storing dates thousands
 *                                of years into the future, with tools other than Drill writing files)
 *                                that would result in the date values being "corrected" into bad values.
 */
public static DateCorruptionStatus checkForCorruptDateValuesInStatistics(ParquetMetadata footer,
                                                                         List<SchemaPath> columns,
                                                                         boolean autoCorrectCorruptDates) {
  // Users can turn off date correction in cases where we are detecting corruption based on date values
  // that are unlikely to appear in common datasets. In this case report that no correction needs to happen
  // during the file read
  if (!autoCorrectCorruptDates) {
    return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
  }
  // Drill-produced files have only ever had a single row group; if this changes in the future it won't matter,
  // as we will know from the Drill version written in the files that the dates are correct
  int rowGroupIndex = 0;
  Map<String, SchemaElement> schemaElements = ParquetReaderUtility.getColNameToSchemaElementMapping(footer);
  findDateColWithStatsLoop:
  for (SchemaPath schemaPath : columns) {
    List<ColumnDescriptor> parquetColumns = footer.getFileMetaData().getSchema().getColumns();
    for (int i = 0; i < parquetColumns.size(); ++i) {
      ColumnDescriptor column = parquetColumns.get(i);
      // this reader only supports flat data; this is restricted in the ParquetScanBatchCreator.
      // creating a NameSegment makes sure we are using the standard code for comparing names;
      // currently it is all case-insensitive
      if (ColumnUtils.isStarQuery(columns)
          || new PathSegment.NameSegment(column.getPath()[0]).equals(schemaPath.getRootSegment())) {
        int colIndex = -1;
        ConvertedType convertedType = schemaElements.get(column.getPath()[0]).getConverted_type();
        if (convertedType != null && convertedType.equals(ConvertedType.DATE)) {
          List<ColumnChunkMetaData> colChunkList = footer.getBlocks().get(rowGroupIndex).getColumns();
          for (int j = 0; j < colChunkList.size(); j++) {
            if (colChunkList.get(j).getPath().equals(ColumnPath.get(column.getPath()))) {
              colIndex = j;
              break;
            }
          }
        }
        if (colIndex == -1) {
          // column does not appear in this file, skip it
          continue;
        }
        Statistics statistics = footer.getBlocks().get(rowGroupIndex).getColumns().get(colIndex).getStatistics();
        Integer max = (Integer) statistics.genericGetMax();
        if (statistics.hasNonNullValue()) {
          if (max > ParquetReaderUtility.DATE_CORRUPTION_THRESHOLD) {
            return DateCorruptionStatus.META_SHOWS_CORRUPTION;
          }
        } else {
          // no statistics, go check the first page
          return DateCorruptionStatus.META_UNCLEAR_TEST_VALUES;
        }
      }
    }
  }
  return DateCorruptionStatus.META_SHOWS_NO_CORRUPTION;
}
Example #27
Source File: MetadataReader.java From presto with Apache License 2.0
private static OriginalType getOriginalType(ConvertedType type) {
  switch (type) {
    case UTF8:
      return OriginalType.UTF8;
    case MAP:
      return OriginalType.MAP;
    case MAP_KEY_VALUE:
      return OriginalType.MAP_KEY_VALUE;
    case LIST:
      return OriginalType.LIST;
    case ENUM:
      return OriginalType.ENUM;
    case DECIMAL:
      return OriginalType.DECIMAL;
    case DATE:
      return OriginalType.DATE;
    case TIME_MILLIS:
      return OriginalType.TIME_MILLIS;
    case TIMESTAMP_MILLIS:
      return OriginalType.TIMESTAMP_MILLIS;
    case INTERVAL:
      return OriginalType.INTERVAL;
    case INT_8:
      return OriginalType.INT_8;
    case INT_16:
      return OriginalType.INT_16;
    case INT_32:
      return OriginalType.INT_32;
    case INT_64:
      return OriginalType.INT_64;
    case UINT_8:
      return OriginalType.UINT_8;
    case UINT_16:
      return OriginalType.UINT_16;
    case UINT_32:
      return OriginalType.UINT_32;
    case UINT_64:
      return OriginalType.UINT_64;
    case JSON:
      return OriginalType.JSON;
    case BSON:
      return OriginalType.BSON;
    case TIMESTAMP_MICROS:
      return OriginalType.TIMESTAMP_MICROS;
    case TIME_MICROS:
      return OriginalType.TIME_MICROS;
    default:
      throw new IllegalArgumentException("Unknown converted type " + type);
  }
}
Example #28
Source File: MessageTypeConverter.java From presto with Apache License 2.0
private static ConvertedType getConvertedType(OriginalType type) {
  switch (type) {
    case UTF8:
      return ConvertedType.UTF8;
    case MAP:
      return ConvertedType.MAP;
    case MAP_KEY_VALUE:
      return ConvertedType.MAP_KEY_VALUE;
    case LIST:
      return ConvertedType.LIST;
    case ENUM:
      return ConvertedType.ENUM;
    case DECIMAL:
      return ConvertedType.DECIMAL;
    case DATE:
      return ConvertedType.DATE;
    case TIME_MILLIS:
      return ConvertedType.TIME_MILLIS;
    case TIMESTAMP_MILLIS:
      return ConvertedType.TIMESTAMP_MILLIS;
    case INTERVAL:
      return ConvertedType.INTERVAL;
    case INT_8:
      return ConvertedType.INT_8;
    case INT_16:
      return ConvertedType.INT_16;
    case INT_32:
      return ConvertedType.INT_32;
    case INT_64:
      return ConvertedType.INT_64;
    case UINT_8:
      return ConvertedType.UINT_8;
    case UINT_16:
      return ConvertedType.UINT_16;
    case UINT_32:
      return ConvertedType.UINT_32;
    case UINT_64:
      return ConvertedType.UINT_64;
    case JSON:
      return ConvertedType.JSON;
    case BSON:
      return ConvertedType.BSON;
    default:
      throw new RuntimeException("Unknown original type " + type);
  }
}
Example #29
Source File: ParquetToDrillTypeConverter.java From Bats with Apache License 2.0
private static TypeProtos.MinorType getMinorType(PrimitiveType.PrimitiveTypeName primitiveTypeName, int length,
                                                 ConvertedType convertedType, OptionManager options) {
  switch (primitiveTypeName) {
    case BINARY:
      if (convertedType == null) {
        return TypeProtos.MinorType.VARBINARY;
      }
      switch (convertedType) {
        case UTF8:
        case ENUM:
          return TypeProtos.MinorType.VARCHAR;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        default:
          return TypeProtos.MinorType.VARBINARY;
      }
    case INT64:
      if (convertedType == null) {
        return TypeProtos.MinorType.BIGINT;
      }
      switch (convertedType) {
        // DRILL-6670: handle TIMESTAMP_MICROS as INT64 with no logical type
        case INT_64:
        case TIMESTAMP_MICROS:
          return TypeProtos.MinorType.BIGINT;
        case UINT_64:
          return TypeProtos.MinorType.UINT8;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        case TIMESTAMP_MILLIS:
          return TypeProtos.MinorType.TIMESTAMP;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case INT32:
      if (convertedType == null) {
        return TypeProtos.MinorType.INT;
      }
      switch (convertedType) {
        case UINT_8:
        case UINT_16:
        case UINT_32:
          return TypeProtos.MinorType.UINT4;
        case INT_8:
        case INT_16:
        case INT_32:
          return TypeProtos.MinorType.INT;
        case DECIMAL:
          ParquetReaderUtility.checkDecimalTypeEnabled(options);
          return TypeProtos.MinorType.VARDECIMAL;
        case DATE:
          return TypeProtos.MinorType.DATE;
        case TIME_MILLIS:
          return TypeProtos.MinorType.TIME;
        default:
          throw new UnsupportedOperationException(String.format("unsupported type: %s %s", primitiveTypeName, convertedType));
      }
    case BOOLEAN:
      return TypeProtos.MinorType.BIT;
    case FLOAT:
      return TypeProtos.MinorType.FLOAT4;
    case DOUBLE:
      return TypeProtos.MinorType.FLOAT8;
    // TODO - Both of these are not supported by the parquet library yet (7/3/13),
    // but they are declared here for when they are implemented
    case INT96:
      if (options.getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
        return TypeProtos.MinorType.TIMESTAMP;
      } else {
        return TypeProtos.MinorType.VARBINARY;
      }
    case FIXED_LEN_BYTE_ARRAY:
      if (convertedType == null) {
        checkArgument(length > 0, "A length greater than zero must be provided for a FixedBinary type.");
        return TypeProtos.MinorType.VARBINARY;
      } else if (convertedType == ConvertedType.DECIMAL) {
        ParquetReaderUtility.checkDecimalTypeEnabled(options);
        return TypeProtos.MinorType.VARDECIMAL;
      } else if (convertedType == ConvertedType.INTERVAL) {
        return TypeProtos.MinorType.INTERVAL;
      }
    default:
      throw new UnsupportedOperationException("Type not supported: " + primitiveTypeName);
  }
}
Example #30
Source File: ColumnReaderFactory.java From Bats with Apache License 2.0
public static NullableColumnReader<?> getNullableColumnReader(ParquetRecordReader parentReader,
                                                              ColumnDescriptor columnDescriptor,
                                                              ColumnChunkMetaData columnChunkMetaData,
                                                              boolean fixedLength,
                                                              ValueVector valueVec,
                                                              SchemaElement schemaElement) throws ExecutionSetupException {
  ConvertedType convertedType = schemaElement.getConverted_type();

  if (!columnChunkMetaData.getEncodings().contains(Encoding.PLAIN_DICTIONARY)) {
    if (columnDescriptor.getType() == PrimitiveType.PrimitiveTypeName.INT96) {
      // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
      if (parentReader.getFragmentContext().getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableTimeStampVector) valueVec, schemaElement);
      } else {
        return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableVarBinaryVector) valueVec, schemaElement);
      }
    } else if (convertedType == ConvertedType.DECIMAL) {
      // NullableVarDecimalVector allows storing of values with different width,
      // so every time when the value is added, offset vector should be updated.
      // Therefore NullableVarDecimalReader is used here instead of NullableFixedByteAlignedReader.
      return new NullableFixedByteAlignedReaders.NullableVarDecimalReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
    } else {
      return new NullableFixedByteAlignedReaders.NullableFixedByteAlignedReader<>(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, valueVec, schemaElement);
    }
  } else {
    switch (columnDescriptor.getType()) {
      case INT32:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableIntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
          case TIME_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT32");
        }
      case INT64:
        if (convertedType == null) {
          return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector) valueVec, schemaElement);
        }
        switch (convertedType) {
          case DECIMAL:
            return new NullableFixedByteAlignedReaders.NullableDictionaryVarDecimalReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableVarDecimalVector) valueVec, schemaElement);
          case TIMESTAMP_MILLIS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableTimeStampVector) valueVec, schemaElement);
          // DRILL-6670: handle TIMESTAMP_MICROS as INT64 with no logical type
          case TIMESTAMP_MICROS:
            return new NullableFixedByteAlignedReaders.NullableDictionaryBigIntReader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableBigIntVector) valueVec, schemaElement);
          default:
            throw new ExecutionSetupException("Unsupported nullable converted type " + convertedType + " for primitive type INT64");
        }
      case INT96:
        // TODO: check convertedType once parquet supports the TIMESTAMP_NANOS type annotation.
        if (parentReader.getFragmentContext().getOptions().getOption(ExecConstants.PARQUET_READER_INT96_AS_TIMESTAMP).bool_val) {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryAsTimeStampReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableTimeStampVector) valueVec, schemaElement);
        } else {
          return new NullableFixedByteAlignedReaders.NullableFixedBinaryReader(parentReader, columnDescriptor, columnChunkMetaData, true, (NullableVarBinaryVector) valueVec, schemaElement);
        }
      case FLOAT:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat4Reader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat4Vector) valueVec, schemaElement);
      case DOUBLE:
        return new NullableFixedByteAlignedReaders.NullableDictionaryFloat8Reader(parentReader, columnDescriptor, columnChunkMetaData, fixedLength, (NullableFloat8Vector) valueVec, schemaElement);
      default:
        throw new ExecutionSetupException("Unsupported nullable column type " + columnDescriptor.getType().name());
    }
  }
}