org.apache.parquet.schema.GroupType Java Examples
The following examples show how to use
org.apache.parquet.schema.GroupType.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: SimpleGroupConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Creates a converter for the group described by {@code schema}.
 *
 * <p>One child converter is created per field of {@code schema}: a
 * {@code SimplePrimitiveConverter} for primitive fields and a nested
 * {@code SimpleGroupConverter} for group fields. Every child receives this
 * converter and the position of its field.
 *
 * @param parent the enclosing group converter, stored as-is
 * @param index  the index stored on this converter (presumably this group's
 *               position inside {@code parent} -- confirm against callers)
 * @param schema the group type whose fields drive child-converter creation
 */
SimpleGroupConverter(SimpleGroupConverter parent, int index, GroupType schema) {
  this.parent = parent;
  this.index = index;

  final int fieldCount = schema.getFieldCount();
  converters = new Converter[fieldCount];
  for (int field = 0; field < fieldCount; field++) {
    final Type fieldType = schema.getType(field);
    converters[field] = fieldType.isPrimitive()
        ? new SimplePrimitiveConverter(this, field)
        : new SimpleGroupConverter(this, field, fieldType.asGroupType());
  }
}
Example #2
Source File: ParquetRecordWriter.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Derives the Parquet {@link Type} for a materialized field.
 *
 * <ul>
 *   <li>{@code MAP}: a {@link GroupType} whose children are the recursively
 *       converted child fields; it is REPEATED when the field's data mode is
 *       REPEATED and OPTIONAL otherwise.</li>
 *   <li>{@code LIST}: not supported.</li>
 *   <li>{@code NULL}: converted as if the field were an optional INT.</li>
 *   <li>anything else: delegated to {@code getPrimitiveType}.</li>
 * </ul>
 *
 * @param field the field to convert
 * @return the Parquet type for {@code field}
 * @throws UnsupportedOperationException if the field's minor type is LIST
 */
private Type getType(MaterializedField field) {
  MinorType minor = field.getType().getMinorType();
  DataMode mode = field.getType().getMode();

  switch (minor) {
    case MAP: {
      List<Type> childTypes = Lists.newArrayList();
      for (MaterializedField child : field.getChildren()) {
        childTypes.add(getType(child));
      }
      Repetition repetition =
          (mode == DataMode.REPEATED) ? Repetition.REPEATED : Repetition.OPTIONAL;
      return new GroupType(repetition, field.getName(), childTypes);
    }
    case LIST:
      throw new UnsupportedOperationException("Unsupported type " + minor);
    case NULL: {
      // A NULL-typed field is written as an optional INT column.
      MaterializedField asOptionalInt = field.withType(
          TypeProtos.MajorType.newBuilder()
              .setMinorType(MinorType.INT)
              .setMode(DataMode.OPTIONAL)
              .build());
      return getPrimitiveType(asOptionalInt);
    }
    default:
      return getPrimitiveType(field);
  }
}
Example #3
Source File: AvroWriteSupport.java From parquet-mr with Apache License 2.0 | 6 votes |
private void writeRecordFields(GroupType schema, Schema avroSchema, Object record) { List<Type> fields = schema.getFields(); List<Schema.Field> avroFields = avroSchema.getFields(); int index = 0; // parquet ignores Avro nulls, so index may differ for (int avroIndex = 0; avroIndex < avroFields.size(); avroIndex++) { Schema.Field avroField = avroFields.get(avroIndex); if (avroField.schema().getType().equals(Schema.Type.NULL)) { continue; } Type fieldType = fields.get(index); Object value = model.getField(record, avroField.name(), avroIndex); if (value != null) { recordConsumer.startField(fieldType.getName(), index); writeValue(fieldType, avroField.schema(), value); recordConsumer.endField(fieldType.getName(), index); } else if (fieldType.isRepetition(Type.Repetition.REQUIRED)) { throw new RuntimeException("Null-value for required field: " + avroField.name()); } index++; } }
Example #4
Source File: TestParquetPredicateUtils.java From presto with Apache License 2.0 | 6 votes |
/**
 * A not-null domain on the struct column {@code my_struct} is expected to
 * produce an unconstrained ("all") Parquet tuple domain.
 */
@Test
public void testParquetTupleDomainStruct() {
  RowType rowType = rowType(
      RowType.field("a", INTEGER),
      RowType.field("b", INTEGER));

  HiveColumnHandle columnHandle = createBaseColumn(
      "my_struct",
      0,
      HiveType.valueOf("struct<a:int,b:int>"),
      rowType,
      REGULAR,
      Optional.empty());
  TupleDomain<HiveColumnHandle> domain =
      withColumnDomains(ImmutableMap.of(columnHandle, Domain.notNull(rowType)));

  // File schema: message hive_schema { optional group my_struct { optional int32 a; optional int32 b; } }
  PrimitiveType fieldA = new PrimitiveType(OPTIONAL, INT32, "a");
  PrimitiveType fieldB = new PrimitiveType(OPTIONAL, INT32, "b");
  GroupType structType = new GroupType(OPTIONAL, "my_struct", fieldA, fieldB);
  MessageType fileSchema = new MessageType("hive_schema", structType);

  Map<List<String>, RichColumnDescriptor> descriptorsByPath = getDescriptors(fileSchema, fileSchema);
  TupleDomain<ColumnDescriptor> tupleDomain =
      getParquetTupleDomain(descriptorsByPath, domain, fileSchema, true);

  assertTrue(tupleDomain.isAll());
}
Example #5
Source File: AvroWriteSupportInt96Avro18.java From datacollector with Apache License 2.0 | 6 votes |
@Override protected void writeObjectArray(GroupType type, Schema schema, Object[] array) { if (array.length > 0) { recordConsumer.startField(OLD_LIST_REPEATED_NAME, 0); try { for (Object element : array) { writeValue(type.getType(0), schema.getElementType(), element); } } catch (NullPointerException e) { // find the null element and throw a better error message for (int i = 0; i < array.length; i += 1) { if (array[i] == null) { throw new NullPointerException( "Array contains a null element at " + i + "\n" + "Set parquet.avro.write-old-list-structure=false to turn " + "on support for arrays with null elements."); } } // no element was null, throw the original exception throw e; } recordConsumer.endField(OLD_LIST_REPEATED_NAME, 0); } }
Example #6
Source File: ParquetGroupConverter.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * Stores the supplied collaborators, starts with an empty converter list, and
 * reads the field-size limit from {@code options}.
 *
 * @param columnResolver    stored as-is
 * @param mutator           stored as-is
 * @param schema            the Parquet group type handled by this converter
 * @param columns           stored as-is
 * @param options           option source; also queried for
 *                          {@code ExecConstants.LIMIT_FIELD_SIZE_BYTES}
 * @param arrowSchema       stored as-is
 * @param childNameResolver stored as-is
 * @param schemaHelper      stored as-is
 * @throws ArithmeticException if the configured field-size limit does not fit in an {@code int}
 */
ParquetGroupConverter(
    ParquetColumnResolver columnResolver,
    OutputMutator mutator,
    GroupType schema,
    Collection<SchemaPath> columns,
    OptionManager options,
    List<Field> arrowSchema,
    Function<String, String> childNameResolver,
    SchemaDerivationHelper schemaHelper) {
  // Collaborators, in parameter order.
  this.columnResolver = columnResolver;
  this.mutator = mutator;
  this.schema = schema;
  this.columns = columns;
  this.options = options;
  this.arrowSchema = arrowSchema;
  this.childNameResolver = childNameResolver;
  this.schemaHelper = schemaHelper;

  // Derived state.
  this.converters = Lists.newArrayList();
  this.maxFieldSizeLimit =
      Math.toIntExact(options.getOption(ExecConstants.LIMIT_FIELD_SIZE_BYTES));
}
Example #7
Source File: ThriftRecordConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Creates a converter for the elements of a list.
 *
 * <p>The element type is the first field of {@code repeatedType}. When that
 * field is OPTIONAL, the list is only accepted if {@code ignoreNullElements}
 * is set (a warning is logged); otherwise decoding is refused.
 *
 * @param listName      name of the list, used in log and error messages
 * @param listEvents    stored as-is
 * @param repeatedType  the repeated group whose first field is the element
 * @param thriftElement the Thrift field describing the element
 * @throws ParquetDecodingException if the element is optional and null elements
 *                                  are not being ignored
 */
public ElementConverter(String listName, List<TProtocol> listEvents, GroupType repeatedType, ThriftField thriftElement) {
  this.listEvents = listEvents;
  this.elementEvents = new ArrayList<TProtocol>();

  Type elementType = repeatedType.getType(0);
  final boolean optionalElements = elementType.isRepetition(Type.Repetition.OPTIONAL);
  if (optionalElements && !ignoreNullElements) {
    throw new ParquetDecodingException("Cannot read list " + listName
        + " with optional elements: set " + IGNORE_NULL_LIST_ELEMENTS
        + " to ignore nulls.");
  }
  if (optionalElements) {
    LOG.warn("List " + listName + " has optional elements: null elements are ignored.");
  }

  elementConverter = newConverter(elementEvents, elementType, thriftElement);
}
Example #8
Source File: SingleLevelArrayMapKeyValuesSchemaConverter.java From presto with Apache License 2.0 | 6 votes |
public static GroupType mapType(Repetition repetition, String alias, String mapAlias, Type keyType, Type valueType) { //support projection only on key of a map if (valueType == null) { return listWrapper( repetition, alias, MAP_KEY_VALUE, new GroupType( Repetition.REPEATED, mapAlias, keyType)); } if (!valueType.getName().equals("value")) { throw new RuntimeException(valueType.getName() + " should be value"); } return listWrapper( repetition, alias, MAP_KEY_VALUE, new GroupType( Repetition.REPEATED, mapAlias, keyType, valueType)); }
Example #9
Source File: ProtoMessageConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Creates a converter for a field stored as a LIST-annotated group.
 *
 * <p>The expected shape is validated step by step: {@code parquetType} must be a
 * non-primitive type carrying a list logical-type annotation, it must contain a
 * non-primitive field named {@code list}, and that group must contain a field
 * named {@code element}. The element type is then used to build the delegate
 * converter.
 *
 * @param parentBuilder   builder passed on to {@code newMessageConverter}
 * @param fieldDescriptor descriptor passed on to {@code newMessageConverter}
 * @param parquetType     the Parquet type of the list field
 * @throws ParquetDecodingException if {@code parquetType} does not have the expected shape
 */
public ListConverter(Message.Builder parentBuilder, Descriptors.FieldDescriptor fieldDescriptor, Type parquetType) {
  LogicalTypeAnnotation logicalTypeAnnotation = parquetType.getLogicalTypeAnnotation();
  if (!(logicalTypeAnnotation instanceof LogicalTypeAnnotation.ListLogicalTypeAnnotation)
      || parquetType.isPrimitive()) {
    throw new ParquetDecodingException(
        "Expected LIST wrapper. Found: " + logicalTypeAnnotation + " instead.");
  }

  GroupType rootWrapperType = parquetType.asGroupType();
  if (!rootWrapperType.containsField("list") || rootWrapperType.getType("list").isPrimitive()) {
    // Message typo fixed: "wrapperr" -> "wrapper".
    throw new ParquetDecodingException(
        "Expected repeated 'list' group inside LIST wrapper but got: " + rootWrapperType);
  }

  GroupType listType = rootWrapperType.getType("list").asGroupType();
  if (!listType.containsField("element")) {
    throw new ParquetDecodingException(
        "Expected 'element' inside repeated list group but got: " + listType);
  }

  Type elementType = listType.getType("element");
  converter = newMessageConverter(parentBuilder, fieldDescriptor, elementType);
}
Example #10
Source File: JsonElementConversionFactory.java From incubator-gobblin with Apache License 2.0 | 6 votes |
/**
 * Builds the Parquet type for this record from its JSON schema.
 *
 * <p>For every entry of the schema's data-type values a converter is obtained,
 * registered in {@code converters} under the entry's column name, and its
 * schema is collected as a field. The fields are wrapped in a
 * {@link MessageType} for a ROOT record or a {@link GroupType} for a CHILD record.
 *
 * @return the message or group type describing this record
 * @throws RuntimeException if the record type is neither ROOT nor CHILD
 */
private Type buildSchema() {
  List<Type> fieldTypes = new ArrayList<>();
  for (JsonElement rawField : this.jsonSchema.getDataTypeValues()) {
    JsonSchema fieldSchema = new JsonSchema((JsonObject) rawField);
    String fieldName = fieldSchema.getColumnName();
    JsonElementConverter fieldConverter =
        JsonElementConversionFactory.getConverter(fieldSchema, false);
    Type fieldType = fieldConverter.schema();
    this.converters.put(fieldName, fieldConverter);
    fieldTypes.add(fieldType);
  }

  String recordName = this.jsonSchema.getColumnName();
  switch (recordType) {
    case ROOT:
      return new MessageType(recordName, fieldTypes);
    case CHILD:
      return new GroupType(optionalOrRequired(this.jsonSchema), recordName, fieldTypes);
    default:
      throw new RuntimeException("Unsupported Record type");
  }
}
Example #11
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * Changes the list inner '$data$' vector name to 'element' in the schema.
 *
 * <p>Returns a copy of {@code childType} named {@code "element"}. Repetition,
 * original type and field id are carried over; primitive types also keep their
 * primitive type name, type length and decimal metadata, and group types keep
 * their fields.
 *
 * @param childType the type to rename
 * @return a type equivalent to {@code childType} but named {@code "element"}
 */
private Type renameChildTypeToElement(Type childType) {
  if (childType.isPrimitive()) {
    PrimitiveType childPrimitiveType = childType.asPrimitiveType();
    return new PrimitiveType(childType.getRepetition(),
      childPrimitiveType.getPrimitiveTypeName(),
      childPrimitiveType.getTypeLength(),
      "element",
      childPrimitiveType.getOriginalType(),
      childPrimitiveType.getDecimalMetadata(),
      childPrimitiveType.getId());
  }

  GroupType childGroupType = childType.asGroupType();
  GroupType renamed = new GroupType(childType.getRepetition(),
    "element",
    childType.getOriginalType(),
    childGroupType.getFields());

  Type.ID id = childGroupType.getId();
  if (id == null) {
    return renamed;
  }
  // Read the numeric id through its accessor instead of hashCode(): hashCode()
  // is not contractually the field id, even if it happens to match today.
  return renamed.withId(id.intValue());
}
Example #12
Source File: DataWritableGroupConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Creates a group converter for the selected fields of a containing group.
 *
 * <p>{@code currentArr} is sized to the field count of the containing group,
 * while one converter is created per selected field. Each converter is built
 * with the field's index inside the containing group.
 *
 * @param selectedGroupType   the group type holding the requested fields
 * @param parent              the parent converter, stored as-is
 * @param index               the index stored on this converter
 * @param containingGroupType the full group type the selection is taken from
 * @throws IllegalStateException if a selected field is not among the fields of
 *                               {@code containingGroupType}
 */
public DataWritableGroupConverter(final GroupType selectedGroupType,
    final HiveGroupConverter parent, final int index, final GroupType containingGroupType) {
  this.parent = parent;
  this.index = index;

  final int totalFieldCount = containingGroupType.getFieldCount();
  final int selectedFieldCount = selectedGroupType.getFieldCount();
  currentArr = new Object[totalFieldCount];
  converters = new Converter[selectedFieldCount];

  final List<Type> requested = selectedGroupType.getFields();
  for (int slot = 0; slot < selectedFieldCount; slot++) {
    final Type requestedField = requested.get(slot);
    if (!containingGroupType.getFields().contains(requestedField)) {
      throw new IllegalStateException("Group type [" + containingGroupType
          + "] does not contain requested field: " + requestedField);
    }
    final int positionInContainer =
        containingGroupType.getFieldIndex(requestedField.getName());
    converters[slot] = getConverterFromDescription(requestedField, positionInContainer, this);
  }
}
Example #13
Source File: TestDataWritableWriter.java From presto with Apache License 2.0 | 6 votes |
private void writeSingleLevelArray(Object value, ListObjectInspector inspector, GroupType type) { // Get the internal array structure Type elementType = type.getType(0); recordConsumer.startGroup(); List<?> arrayValues = inspector.getList(value); if (!arrayValues.isEmpty()) { recordConsumer.startField(elementType.getName(), 0); ObjectInspector elementInspector = inspector.getListElementObjectInspector(); for (Object element : arrayValues) { if (element == null) { throw new IllegalArgumentException("Array elements are requires in given schema definition"); } writeValue(element, elementInspector, elementType); } recordConsumer.endField(elementType.getName(), 0); } recordConsumer.endGroup(); }
Example #14
Source File: MessageTypeToType.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Converts a Parquet list group to a list type.
 *
 * <p>The element is the first field of the group's first (repeated) field. Its
 * id is resolved through {@code getId} and registered as an alias under the
 * element's name. The resulting list is optional-element when the Parquet
 * element is OPTIONAL and required-element otherwise.
 *
 * @param array       the Parquet list group
 * @param elementType the already-converted element type
 * @return the list type
 * @throws IllegalArgumentException if the Parquet element has REPEATED repetition
 */
@Override
public Type list(GroupType array, Type elementType) {
  org.apache.parquet.schema.Type element = array.getType(0).asGroupType().getType(0);

  Preconditions.checkArgument(
      !element.isRepetition(Repetition.REPEATED),
      "Elements cannot have repetition REPEATED: %s", element);

  int elementFieldId = getId(element);
  addAlias(element.getName(), elementFieldId);

  return element.isRepetition(Repetition.OPTIONAL)
      ? Types.ListType.ofOptional(elementFieldId, elementType)
      : Types.ListType.ofRequired(elementFieldId, elementType);
}
Example #15
Source File: AvroIndexedRecordConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Creates a converter for an Avro array stored in a Parquet group.
 *
 * <p>The first field of {@code type} is the repeated type. It is matched against
 * the non-null element schema to decide whether it is the element itself or a
 * synthetic wrapper group around the element.
 *
 * @param parent     container receiving the finished array, stored as-is
 * @param type       the Parquet group type of the array
 * @param avroSchema the Avro array schema
 * @param model      the Avro data model passed to the element converters
 */
public AvroArrayConverter(ParentValueContainer parent, GroupType type,
    Schema avroSchema, GenericData model) {
  this.parent = parent;
  this.avroSchema = avroSchema;

  Schema elementSchema = AvroSchemaConverter.getNonNull(avroSchema.getElementType());
  Type repeatedType = type.getType(0);

  // always determine whether the repeated type is the element type by
  // matching it against the element schema.
  if (!AvroRecordConverter.isElementType(repeatedType, elementSchema)) {
    // the element is wrapped in a synthetic group and may be optional
    converter = new ElementConverter(repeatedType.asGroupType(), elementSchema, model);
  } else {
    // the element type is the repeated type (and required)
    ParentValueContainer elementSink = new ParentValueContainer() {
      @Override
      @SuppressWarnings("unchecked")
      public void add(Object value) {
        array.add(value);
      }
    };
    converter = newConverter(elementSchema, repeatedType, model, elementSink);
  }
}
Example #16
Source File: AvroRecordConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Creates a converter for an Avro collection stored in a Parquet group.
 *
 * <p>The first field of {@code type} is the repeated type. It is matched against
 * the non-null element schema to decide whether it is the element itself or a
 * synthetic wrapper group around the element.
 *
 * @param parent         container receiving the finished collection, stored as-is
 * @param type           the Parquet group type of the collection
 * @param avroSchema     the Avro array schema
 * @param model          the Avro data model passed to the element converters
 * @param containerClass the collection class, stored as-is
 */
public AvroCollectionConverter(ParentValueContainer parent, GroupType type,
    Schema avroSchema, GenericData model, Class<?> containerClass) {
  this.parent = parent;
  this.avroSchema = avroSchema;
  this.containerClass = containerClass;

  Schema elementSchema = AvroSchemaConverter.getNonNull(avroSchema.getElementType());
  Type repeatedType = type.getType(0);

  // always determine whether the repeated type is the element type by
  // matching it against the element schema.
  if (!isElementType(repeatedType, elementSchema)) {
    // the element is wrapped in a synthetic group and may be optional
    converter = new ElementConverter(repeatedType.asGroupType(), elementSchema, model);
  } else {
    // the element type is the repeated type (and required)
    ParentValueContainer elementSink = new ParentValueContainer() {
      @Override
      @SuppressWarnings("unchecked")
      public void add(Object value) {
        container.add(value);
      }
    };
    converter = newConverter(elementSchema, repeatedType, model, elementSink);
  }
}
Example #17
Source File: TupleWriter.java From hadoop-etl-udfs with MIT License | 6 votes |
private void writeTuple(Tuple tuple, GroupType type) { for (int index = 0; index < type.getFieldCount(); index++) { Type fieldType = type.getType(index); String fieldName = fieldType.getName(); // empty fields have to be omitted if (tuple.isNull(index)) continue; recordConsumer.startField(fieldName, index); if (fieldType.isPrimitive()) { tuple.writePrimitiveValue(recordConsumer, index, (PrimitiveType)fieldType); } else { recordConsumer.startGroup(); writeTuple(tuple.getTuple(index), fieldType.asGroupType()); recordConsumer.endGroup(); } recordConsumer.endField(fieldName, index); } }
Example #18
Source File: ThriftRecordConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private boolean hasMissingRequiredFieldInGroupType(GroupType requested, GroupType fullSchema) { for (Type field : fullSchema.getFields()) { if (requested.containsField(field.getName())) { Type requestedType = requested.getType(field.getName()); // if a field is in requested schema and the type of it is a group type, then do recursive check if (!field.isPrimitive()) { if (hasMissingRequiredFieldInGroupType(requestedType.asGroupType(), field.asGroupType())) { return true; } else { continue;// check next field } } } else { if (field.getRepetition() == Type.Repetition.REQUIRED) { return true; // if a field is missing in requested schema and it's required } else { continue; // the missing field is not required, then continue checking next field } } } return false; }
Example #19
Source File: Metadata.java From Bats with Apache License 2.0 | 6 votes |
/**
 * Resolves the column type information for the column addressed by {@code path}.
 *
 * <p>Starting from {@code type}, group types are descended by looking up
 * {@code path[depth]}, {@code path[depth + 1]}, ... until a primitive type is
 * reached. Precision and scale default to 0 when the primitive type has no
 * decimal metadata. Repetition and definition levels are the schema's maximum
 * levels for {@code path}.
 *
 * @param schema the message type used to compute the max levels
 * @param type   the type to start from
 * @param path   the full column path
 * @param depth  index into {@code path} of the next name to resolve
 * @return the type information of the primitive column
 */
private ColTypeInfo getColTypeInfo(MessageType schema, Type type, String[] path, int depth) {
  Type current = type;
  int level = depth;
  while (!current.isPrimitive()) {
    current = ((GroupType) current).getType(path[level]);
    level++;
  }

  PrimitiveType primitiveType = (PrimitiveType) current;
  int precision = 0;
  int scale = 0;
  if (primitiveType.getDecimalMetadata() != null) {
    precision = primitiveType.getDecimalMetadata().getPrecision();
    scale = primitiveType.getDecimalMetadata().getScale();
  }

  int repetitionLevel = schema.getMaxRepetitionLevel(path);
  int definitionLevel = schema.getMaxDefinitionLevel(path);

  return new ColTypeInfo(current.getOriginalType(), precision, scale, repetitionLevel, definitionLevel);
}
Example #20
Source File: VectorizedSparkParquetReaders.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Struct fields have no vectorized reader: returns {@code null} when no struct
 * is expected and fails otherwise.
 *
 * @param expected     the expected struct type, or {@code null}
 * @param groupType    the Parquet group type (unused)
 * @param fieldReaders readers of the struct's fields (unused)
 * @return always {@code null}
 * @throws UnsupportedOperationException if {@code expected} is not {@code null}
 */
@Override
public VectorizedReader<?> struct(
    Types.StructType expected, GroupType groupType,
    List<VectorizedReader<?>> fieldReaders) {
  if (expected == null) {
    return null;
  }
  throw new UnsupportedOperationException("Vectorized reads are not supported yet for struct fields");
}
Example #21
Source File: AvroIndexedRecordConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a converter for a map stored in a Parquet group.
 *
 * <p>The first field of {@code mapType}, viewed as a group, is handed to a
 * {@code MapKeyValueConverter} together with the map schema and model.
 *
 * @param parent    container receiving the finished map, stored as-is
 * @param mapType   the Parquet group type of the map
 * @param mapSchema the Avro map schema
 * @param model     the Avro data model
 */
public MapConverter(ParentValueContainer parent, GroupType mapType,
    Schema mapSchema, GenericData model) {
  this.parent = parent;
  this.keyValueConverter =
      new MapKeyValueConverter(mapType.getType(0).asGroupType(), mapSchema, model);
}
Example #22
Source File: ThriftRecordConverter.java From parquet-mr with Apache License 2.0 | 5 votes |
/**
 * Creates a converter for a Thrift map stored in a Parquet group.
 *
 * <p>The group must have exactly one field, which is passed as the nested
 * key/value type. Key and value Thrift types are taken from the map type of
 * {@code field}.
 *
 * @param parentEvents  stored as-is
 * @param parquetSchema the Parquet group type of the map
 * @param field         the Thrift field; its type is cast to {@code MapType}
 * @throws IllegalArgumentException if {@code parquetSchema} does not have exactly one field
 */
MapConverter(List<TProtocol> parentEvents, GroupType parquetSchema, ThriftField field) {
  this.parentEvents = parentEvents;

  final int fieldCount = parquetSchema.getFieldCount();
  if (fieldCount != 1) {
    throw new IllegalArgumentException("maps have only one field. " + parquetSchema + " size = " + parquetSchema.getFieldCount());
  }
  Type nestedType = parquetSchema.getType(0);

  final MapType thriftMap = (MapType) field.getType();
  final ThriftField keyField = thriftMap.getKey();
  final ThriftField valueField = thriftMap.getValue();
  keyType = keyField.getType().getType().getThriftType();
  valueType = valueField.getType().getType().getThriftType();

  child = new GroupCounter(new MapKeyValueConverter(mapEvents, nestedType, keyField, valueField));
}
Example #23
Source File: ParquetAvroValueReaders.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Builds a list reader around {@code elementReader}.
 *
 * <p>The repeated group is the first field of {@code array}. The definition and
 * repetition levels passed to the reader are one less than the maximum levels
 * at the current path; the element's definition level is likewise one less than
 * the maximum level at the element's path.
 *
 * @param expectedList  the expected list type (unused here)
 * @param array         the Parquet list group
 * @param elementReader reader for the list elements
 * @return a {@code ListReader} over the elements
 */
@Override
public ParquetValueReader<?> list(Types.ListType expectedList, GroupType array,
    ParquetValueReader<?> elementReader) {
  GroupType repeatedGroup = array.getFields().get(0).asGroupType();
  String[] pathToRepeated = currentPath();

  int repeatedDefLevel = type.getMaxDefinitionLevel(pathToRepeated) - 1;
  int repeatedRepLevel = type.getMaxRepetitionLevel(pathToRepeated) - 1;

  Type element = repeatedGroup.getType(0);
  int elementDefLevel = type.getMaxDefinitionLevel(path(element.getName())) - 1;

  return new ListReader<>(
      repeatedDefLevel,
      repeatedRepLevel,
      option(element, elementDefLevel, elementReader));
}
Example #24
Source File: ParquetAvroWriter.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Builds a collection writer around {@code elementWriter}.
 *
 * <p>The repeated group is the first field of {@code array}. The writer receives
 * the maximum definition and repetition levels at the current path, and the
 * element writer is wrapped with the maximum definition level at the element's path.
 *
 * @param array         the Parquet list group
 * @param elementWriter writer for the list elements
 * @return a collection writer over the elements
 */
@Override
public ParquetValueWriter<?> list(GroupType array, ParquetValueWriter<?> elementWriter) {
  GroupType repeatedGroup = array.getFields().get(0).asGroupType();
  String[] pathToRepeated = currentPath();

  int repeatedDefLevel = type.getMaxDefinitionLevel(pathToRepeated);
  int repeatedRepLevel = type.getMaxRepetitionLevel(pathToRepeated);

  org.apache.parquet.schema.Type element = repeatedGroup.getType(0);
  int elementDefLevel = type.getMaxDefinitionLevel(path(element.getName()));

  return ParquetValueWriters.collections(
      repeatedDefLevel,
      repeatedRepLevel,
      ParquetValueWriters.option(element, elementDefLevel, elementWriter));
}
Example #25
Source File: AvroWriteSupportInt96Avro17.java From datacollector with Apache License 2.0 | 5 votes |
private void writeUnion(GroupType parquetSchema, Schema avroSchema, Object value) { recordConsumer.startGroup(); // ResolveUnion will tell us which of the union member types to // deserialise. int avroIndex = model.resolveUnion(avroSchema, value); // For parquet's schema we skip nulls GroupType parquetGroup = parquetSchema.asGroupType(); int parquetIndex = avroIndex; for (int i = 0; i < avroIndex; i++) { if (avroSchema.getTypes().get(i).getType().equals(Schema.Type.NULL)) { parquetIndex--; } } // Sparsely populated method of encoding unions, each member has its own // set of columns. String memberName = "member" + parquetIndex; recordConsumer.startField(memberName, parquetIndex); writeValue(parquetGroup.getType(parquetIndex), avroSchema.getTypes().get(avroIndex), value); recordConsumer.endField(memberName, parquetIndex); recordConsumer.endGroup(); }
Example #26
Source File: SparkParquetReaders.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Builds an array reader around {@code elementReader}.
 *
 * <p>The repeated group is the first field of {@code array}. The definition and
 * repetition levels passed to the reader are one less than the maximum levels
 * at the current path; the element's definition level is likewise one less than
 * the maximum level at the element's path.
 *
 * @param expectedList  the expected list type (unused here)
 * @param array         the Parquet list group
 * @param elementReader reader for the list elements
 * @return an {@code ArrayReader} over the elements
 */
@Override
public ParquetValueReader<?> list(Types.ListType expectedList, GroupType array,
    ParquetValueReader<?> elementReader) {
  GroupType repeatedGroup = array.getFields().get(0).asGroupType();
  String[] pathToRepeated = currentPath();

  int repeatedDefLevel = type.getMaxDefinitionLevel(pathToRepeated) - 1;
  int repeatedRepLevel = type.getMaxRepetitionLevel(pathToRepeated) - 1;

  Type element = repeatedGroup.getType(0);
  int elementDefLevel = type.getMaxDefinitionLevel(path(element.getName())) - 1;

  return new ArrayReader<>(
      repeatedDefLevel,
      repeatedRepLevel,
      ParquetValueReaders.option(element, elementDefLevel, elementReader));
}
Example #27
Source File: LogicalListL1Converter.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Creates the outer (level 1) converter of a logical list.
 *
 * <p>A list writer is obtained for the batch-schema column name of
 * {@code fieldName}. The schema is then validated with {@code isSupportedSchema},
 * and the first field of {@code schema}, viewed as a group, is handed to a
 * {@code LogicalListL2Converter}.
 *
 * @param columnResolver resolves the batch-schema column name; passed on to the inner converter
 * @param fieldName      name of the list field
 * @param mutator        passed on to the inner converter
 * @param writerProvider provides the list writer
 * @param schema         the Parquet group type of the list
 * @param columns        passed on to the inner converter
 * @param options        passed on to the inner converter
 * @param arrowSchema    passed on to the inner converter
 * @param schemaHelper   passed on to the inner converter
 */
LogicalListL1Converter(
    ParquetColumnResolver columnResolver,
    String fieldName,
    OutputMutator mutator,
    final WriterProvider writerProvider,
    GroupType schema,
    Collection<SchemaPath> columns,
    OptionManager options,
    List<Field> arrowSchema,
    SchemaDerivationHelper schemaHelper) {
  final String batchColumnName = columnResolver.getBatchSchemaColumnName(fieldName);
  listWriter = writerProvider.list(ParquetGroupConverter.getNameForChild(batchColumnName));

  if (!isSupportedSchema(schema)) {
    throw UserException.dataReadError()
        .message("Unsupported LOGICAL LIST parquet schema")
        .addContext("schema", schema)
        .build(logger);
  }

  final GroupType innerGroup = schema.getFields().get(0).asGroupType();
  converter = new LogicalListL2Converter(
      columnResolver,
      fieldName,
      new ListWriterProvider(listWriter),
      mutator,
      innerGroup,
      columns,
      options,
      arrowSchema,
      schemaHelper);
}
Example #28
Source File: SparkParquetWriters.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Builds an array-data writer around {@code elementWriter}.
 *
 * <p>The repeated group is the first field of {@code array}. The writer receives
 * the maximum definition and repetition levels at the current path, the element
 * writer wrapped by {@code newOption}, and the Spark element type.
 *
 * @param sArray        the Spark array type
 * @param array         the Parquet list group
 * @param elementWriter writer for the list elements
 * @return an {@code ArrayDataWriter} over the elements
 */
@Override
public ParquetValueWriter<?> list(ArrayType sArray, GroupType array,
    ParquetValueWriter<?> elementWriter) {
  GroupType repeatedGroup = array.getFields().get(0).asGroupType();
  String[] pathToRepeated = currentPath();

  int repeatedDefLevel = type.getMaxDefinitionLevel(pathToRepeated);
  int repeatedRepLevel = type.getMaxRepetitionLevel(pathToRepeated);

  return new ArrayDataWriter<>(
      repeatedDefLevel,
      repeatedRepLevel,
      newOption(repeatedGroup.getType(0), elementWriter),
      sArray.elementType());
}
Example #29
Source File: GenericParquetWriter.java From iceberg with Apache License 2.0 | 5 votes |
/**
 * Builds a collection writer around {@code elementWriter}.
 *
 * <p>The repeated group is the first field of {@code array}. The writer receives
 * the maximum definition and repetition levels at the current path, and the
 * element writer is wrapped with the maximum definition level at the element's path.
 *
 * @param array         the Parquet list group
 * @param elementWriter writer for the list elements
 * @return a collection writer over the elements
 */
@Override
public ParquetValueWriter<?> list(GroupType array, ParquetValueWriter<?> elementWriter) {
  GroupType repeatedGroup = array.getFields().get(0).asGroupType();
  String[] pathToRepeated = currentPath();

  int repeatedDefLevel = type.getMaxDefinitionLevel(pathToRepeated);
  int repeatedRepLevel = type.getMaxRepetitionLevel(pathToRepeated);

  org.apache.parquet.schema.Type element = repeatedGroup.getType(0);
  int elementDefLevel = type.getMaxDefinitionLevel(path(element.getName()));

  return ParquetValueWriters.collections(
      repeatedDefLevel,
      repeatedRepLevel,
      ParquetValueWriters.option(element, elementDefLevel, elementWriter));
}
Example #30
Source File: RowConverter.java From flink with Apache License 2.0 | 5 votes |
/**
 * Creates the converters for the key (field 0) and value (field 1) of a map entry.
 *
 * <p>Each converter reports its converted object through a callback that stores
 * it in {@code key} or {@code value} respectively.
 *
 * @param groupType the key/value group type; field 0 is the key, field 1 the value
 * @param typeInfo  supplies the key and value type information
 */
MapKeyValueConverter(GroupType groupType, MapTypeInfo typeInfo) {
  this.keyConverter = createConverter(
      groupType.getType(0),
      0,
      typeInfo.getKeyTypeInfo(),
      (position, converted) -> key = converted);

  this.valueConverter = createConverter(
      groupType.getType(1),
      1,
      typeInfo.getValueTypeInfo(),
      (position, converted) -> value = converted);
}