org.apache.parquet.schema.Type Java Examples
The following examples show how to use org.apache.parquet.schema.Type.
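Before the project examples, here is a minimal, self-contained sketch (hypothetical code, not taken from any project below) showing the role Type plays: it is the common supertype of PrimitiveType and GroupType, so traversing a schema is a matter of testing isPrimitive() and recursing through asGroupType().

import java.util.List;
import org.apache.parquet.schema.GroupType;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Type;

public class TypeTour {
  public static void main(String[] args) {
    // Parse a small schema; every node of the resulting tree is a Type.
    MessageType schema = MessageTypeParser.parseMessageType(
        "message Document { "
            + "required int64 DocId; "
            + "optional group Links { repeated int64 Forward; } "
            + "}");
    printFields(schema, "");
  }

  private static void printFields(GroupType group, String indent) {
    for (Type field : group.getFields()) {
      System.out.println(indent + field.getName()
          + " (" + field.getRepetition() + ")");
      if (!field.isPrimitive()) {
        printFields(field.asGroupType(), indent + "  "); // recurse into nested groups
      }
    }
  }
}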
Example #1
Source File: AvroRecordConverter.java From parquet-mr with Apache License 2.0
public AvroCollectionConverter(ParentValueContainer parent, GroupType type,
                               Schema avroSchema, GenericData model,
                               Class<?> containerClass) {
  this.parent = parent;
  this.avroSchema = avroSchema;
  this.containerClass = containerClass;
  Schema elementSchema = AvroSchemaConverter.getNonNull(avroSchema.getElementType());
  Type repeatedType = type.getType(0);
  // always determine whether the repeated type is the element type by
  // matching it against the element schema.
  if (isElementType(repeatedType, elementSchema)) {
    // the element type is the repeated type (and required)
    converter = newConverter(elementSchema, repeatedType, model, new ParentValueContainer() {
      @Override
      @SuppressWarnings("unchecked")
      public void add(Object value) {
        container.add(value);
      }
    });
  } else {
    // the element is wrapped in a synthetic group and may be optional
    converter = new ElementConverter(repeatedType.asGroupType(), elementSchema, model);
  }
}
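The isElementType check above exists because Parquet has two ways to lay out LIST elements: the repeated field may itself be the element type, or it may be a synthetic single-field wrapper group. The following hypothetical schemas (a hedged illustration, not from parquet-mr) show both shapes:

import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;
import org.apache.parquet.schema.Type;

public class ListShapes {
  public static void main(String[] args) {
    // Older writers: the repeated group IS the element type (a struct with
    // "latitude"/"longitude" fields), with no synthetic wrapper.
    MessageType direct = MessageTypeParser.parseMessageType(
        "message m1 { required group locations (LIST) { "
            + "repeated group element { required double latitude; required double longitude; } "
            + "} }");

    // Standard three-level layout: the repeated "list" group is a synthetic
    // wrapper around a single (possibly optional) element field.
    MessageType wrapped = MessageTypeParser.parseMessageType(
        "message m2 { required group locations (LIST) { "
            + "repeated group list { optional binary element (UTF8); } "
            + "} }");

    Type directRepeated = direct.getType(0).asGroupType().getType(0);
    Type wrappedRepeated = wrapped.getType(0).asGroupType().getType(0);
    // A converter must decide which shape it is looking at; AvroRecordConverter
    // does so by matching the repeated type against the Avro element schema.
    System.out.println(directRepeated);
    System.out.println(wrappedRepeated);
  }
}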
Example #2
Source File: TestPruneColumnsCommand.java From parquet-mr with Apache License 2.0
@Test
public void testPruneMultiColumns() throws Exception {
  // Create the Parquet file
  String inputFile = createParquetFile("input");
  String outputFile = createTempFile("output");

  // Remove columns
  String[] cargs = {inputFile, outputFile, "Name", "Gender"};
  executeCommandLine(cargs);

  // Verify the schema is not changed for the columns that were not pruned
  ParquetMetadata pmd = ParquetFileReader.readFooter(conf, new Path(outputFile), ParquetMetadataConverter.NO_FILTER);
  MessageType schema = pmd.getFileMetaData().getSchema();
  List<Type> fields = schema.getFields();
  assertEquals(fields.size(), 2);
  assertEquals(fields.get(0).getName(), "DocId");
  assertEquals(fields.get(1).getName(), "Links");
  List<Type> subFields = fields.get(1).asGroupType().getFields();
  assertEquals(subFields.size(), 2);
  assertEquals(subFields.get(0).getName(), "Backward");
  assertEquals(subFields.get(1).getName(), "Forward");

  // Verify the data is not changed for the columns that were not pruned
  List<String> prunePaths = Arrays.asList("Name", "Gender");
  validateColumns(inputFile, prunePaths);
}
Example #3
Source File: ThriftSchemaConverter.java From parquet-mr with Apache License 2.0
/**
 * Returns whether the given type is the element type of a list or is a
 * synthetic group with one field that is the element type. This is
 * determined by checking whether the type can be a synthetic group and by
 * checking whether a potential synthetic group matches the expected
 * ThriftField.
 * <p>
 * This method never guesses because the expected ThriftField is known.
 *
 * @param repeatedType a type that may be the element type
 * @param thriftElement the expected Schema for list elements
 * @return {@code true} if the repeatedType is the element schema
 */
static boolean isListElementType(Type repeatedType, ThriftField thriftElement) {
  if (repeatedType.isPrimitive() ||
      (repeatedType.asGroupType().getFieldCount() != 1) ||
      (repeatedType.asGroupType().getType(0).isRepetition(REPEATED))) {
    // The repeated type must be the element type because it is an invalid
    // synthetic wrapper. Must be a group with one optional or required field
    return true;
  } else if (thriftElement != null && thriftElement.getType() instanceof StructType) {
    Set<String> fieldNames = new HashSet<String>();
    for (ThriftField field : ((StructType) thriftElement.getType()).getChildren()) {
      fieldNames.add(field.getName());
    }
    // If the repeated type is a subset of the structure of the ThriftField,
    // then it must be the element type.
    return fieldNames.contains(repeatedType.asGroupType().getFieldName(0));
  }
  return false;
}
Example #4
Source File: ThriftRecordConverter.java From parquet-mr with Apache License 2.0
public ElementConverter(String listName, List<TProtocol> listEvents,
                        GroupType repeatedType, ThriftField thriftElement) {
  this.listEvents = listEvents;
  this.elementEvents = new ArrayList<TProtocol>();
  Type elementType = repeatedType.getType(0);
  if (elementType.isRepetition(Type.Repetition.OPTIONAL)) {
    if (ignoreNullElements) {
      LOG.warn("List " + listName + " has optional elements: null elements are ignored.");
    } else {
      throw new ParquetDecodingException("Cannot read list " + listName +
          " with optional elements: set " + IGNORE_NULL_LIST_ELEMENTS +
          " to ignore nulls.");
    }
  }
  elementConverter = newConverter(elementEvents, elementType, thriftElement);
}
Example #5
Source File: SimpleGroupConverter.java From parquet-mr with Apache License 2.0
SimpleGroupConverter(SimpleGroupConverter parent, int index, GroupType schema) {
  this.parent = parent;
  this.index = index;
  converters = new Converter[schema.getFieldCount()];
  for (int i = 0; i < converters.length; i++) {
    final Type type = schema.getType(i);
    if (type.isPrimitive()) {
      converters[i] = new SimplePrimitiveConverter(this, i);
    } else {
      converters[i] = new SimpleGroupConverter(this, i, type.asGroupType());
    }
  }
}
Example #6
Source File: GroupWriter.java From parquet-mr with Apache License 2.0
private void writeGroup(Group group, GroupType type) {
  int fieldCount = type.getFieldCount();
  for (int field = 0; field < fieldCount; ++field) {
    int valueCount = group.getFieldRepetitionCount(field);
    if (valueCount > 0) {
      Type fieldType = type.getType(field);
      String fieldName = fieldType.getName();
      recordConsumer.startField(fieldName, field);
      for (int index = 0; index < valueCount; ++index) {
        if (fieldType.isPrimitive()) {
          group.writeValue(field, index, recordConsumer);
        } else {
          recordConsumer.startGroup();
          writeGroup(group.getGroup(field, index), fieldType.asGroupType());
          recordConsumer.endGroup();
        }
      }
      recordConsumer.endField(fieldName, field);
    }
  }
}
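This method is the write-side driver used by the example object model. As a hedged usage sketch (the output path and field names are invented for illustration), Groups built with SimpleGroupFactory pass through this recursion when handed to an ExampleParquetWriter:

import org.apache.hadoop.fs.Path;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetWriter;
import org.apache.parquet.hadoop.example.ExampleParquetWriter;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.MessageTypeParser;

public class WriteGroups {
  public static void main(String[] args) throws Exception {
    MessageType schema = MessageTypeParser.parseMessageType(
        "message user { required int64 id; optional group name { optional binary first (UTF8); } }");
    SimpleGroupFactory factory = new SimpleGroupFactory(schema);

    Group user = factory.newGroup().append("id", 42L);
    user.addGroup("name").append("first", "Ada"); // nested group, emitted via startGroup()/endGroup()

    try (ParquetWriter<Group> writer = ExampleParquetWriter
        .builder(new Path("/tmp/users.parquet")) // hypothetical output path
        .withType(schema)
        .build()) {
      writer.write(user); // driven internally by a writeGroup recursion like the one above
    }
  }
}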
Example #7
Source File: ProtoWriteSupport.java From parquet-mr with Apache License 2.0
private FieldWriter createWriter(FieldDescriptor fieldDescriptor, Type type) {
  switch (fieldDescriptor.getJavaType()) {
    case STRING: return new StringWriter();
    case MESSAGE: return createMessageWriter(fieldDescriptor, type);
    case INT: return new IntWriter();
    case LONG: return new LongWriter();
    case FLOAT: return new FloatWriter();
    case DOUBLE: return new DoubleWriter();
    case ENUM: return new EnumWriter();
    case BOOLEAN: return new BooleanWriter();
    case BYTE_STRING: return new BinaryWriter();
  }
  return unknownType(fieldDescriptor); // should not be executed, always throws exception.
}
Example #8
Source File: ParquetSchemaUtil.java From iceberg with Apache License 2.0
/**
 * Prunes columns from a Parquet file schema that was written without field ids.
 * <p>
 * Files that were written without field ids are read assuming that schema evolution preserved
 * column order. Deleting columns was not allowed.
 * <p>
 * The order of columns in the resulting Parquet schema matches the Parquet file.
 *
 * @param fileSchema schema from a Parquet file that does not have field ids.
 * @param expectedSchema expected schema
 * @return a parquet schema pruned using the expected schema
 */
public static MessageType pruneColumnsFallback(MessageType fileSchema, Schema expectedSchema) {
  Set<Integer> selectedIds = Sets.newHashSet();

  for (Types.NestedField field : expectedSchema.columns()) {
    selectedIds.add(field.fieldId());
  }

  MessageTypeBuilder builder = org.apache.parquet.schema.Types.buildMessage();

  int ordinal = 1;
  for (Type type : fileSchema.getFields()) {
    if (selectedIds.contains(ordinal)) {
      builder.addField(type.withId(ordinal));
    }
    ordinal += 1;
  }

  return builder.named(fileSchema.getName());
}
Example #9
Source File: DataWritableGroupConverter.java From parquet-mr with Apache License 2.0
public DataWritableGroupConverter(final GroupType selectedGroupType, final HiveGroupConverter parent,
    final int index, final GroupType containingGroupType) {
  this.parent = parent;
  this.index = index;
  final int totalFieldCount = containingGroupType.getFieldCount();
  final int selectedFieldCount = selectedGroupType.getFieldCount();

  currentArr = new Object[totalFieldCount];
  converters = new Converter[selectedFieldCount];

  List<Type> selectedFields = selectedGroupType.getFields();
  for (int i = 0; i < selectedFieldCount; i++) {
    Type subtype = selectedFields.get(i);
    if (containingGroupType.getFields().contains(subtype)) {
      converters[i] = getConverterFromDescription(subtype,
          containingGroupType.getFieldIndex(subtype.getName()), this);
    } else {
      throw new IllegalStateException("Group type [" + containingGroupType +
          "] does not contain requested field: " + subtype);
    }
  }
}
Example #10
Source File: HiveClientTest.java From garmadon with Apache License 2.0
@Test
public void createTableWithoutIssue() throws SQLException {
  PrimitiveType appId = new PrimitiveType(Type.Repetition.OPTIONAL,
      PrimitiveType.PrimitiveTypeName.BINARY, "app_id");
  MessageType schema = new MessageType("fs", appId);
  String table = "fs";
  String location = "file:" + hdfsTemp + "/garmadon_database/fs";

  HiveClient hiveClient = new HiveClient(driverName, "jdbc:hive2://localhost:" + port,
      "garmadon", hdfsTemp + "/garmadon_database");
  hiveClient.createTableIfNotExist(table, schema, location);

  HashMap<String, String> result = getResultHashTableDesc(hiveClient, table);
  assertEquals(location, result.get("Location"));
  assertEquals("EXTERNAL_TABLE", result.get("Table Type").trim());
  assertEquals("string", result.get("day"));
  assertEquals("string", result.get("app_id"));
}
Example #11
Source File: ParquetGroup.java From incubator-gobblin with Apache License 2.0
public String toString(String indent) {
  StringBuilder result = new StringBuilder();
  int i = 0;
  for (Type field : this.schema.getFields()) {
    String name = field.getName();
    List<Object> values = this.data[i];
    for (Object value : values) {
      result.append(indent).append(name);
      if (value == null) {
        result.append(": NULL\n");
      } else if (value instanceof Group) {
        result.append("\n").append(((ParquetGroup) value).toString(indent + " "));
      } else {
        result.append(": ").append(value.toString()).append("\n");
      }
    }
    i++;
  }
  return result.toString();
}
Example #12
Source File: LogicalListL2Converter.java From dremio-oss with Apache License 2.0
@Override
protected void addChildConverter(String fieldName, OutputMutator mutator,
    List<Field> arrowSchema, Iterator<SchemaPath> colIterator, Type type,
    Function<String, String> childNameResolver) {
  final String nameForChild = "inner";
  // Column name to ID mapping creates child entry as 'columnName'.list.element
  // So, we will append 'list.element' so that name to ID matching works correctly
  final String fullChildName = fieldName.concat(".").concat("list.element");
  if (type.isPrimitive()) {
    converters.add(getConverterForType(fullChildName, type.asPrimitiveType()));
  } else {
    final GroupType groupType = type.asGroupType();
    Collection<SchemaPath> c = Lists.newArrayList(colIterator);
    if (arrowSchema != null) {
      converters.add(groupConverterFromArrowSchema(fullChildName, "$data$", groupType, c));
    } else {
      converters.add(defaultGroupConverter(fullChildName, mutator, groupType, c, null));
    }
  }
}
Example #13
Source File: AvroWriteSupport.java From parquet-mr with Apache License 2.0
private void writeRecordFields(GroupType schema, Schema avroSchema, Object record) {
  List<Type> fields = schema.getFields();
  List<Schema.Field> avroFields = avroSchema.getFields();
  int index = 0; // parquet ignores Avro nulls, so index may differ
  for (int avroIndex = 0; avroIndex < avroFields.size(); avroIndex++) {
    Schema.Field avroField = avroFields.get(avroIndex);
    if (avroField.schema().getType().equals(Schema.Type.NULL)) {
      continue;
    }
    Type fieldType = fields.get(index);
    Object value = model.getField(record, avroField.name(), avroIndex);
    if (value != null) {
      recordConsumer.startField(fieldType.getName(), index);
      writeValue(fieldType, avroField.schema(), value);
      recordConsumer.endField(fieldType.getName(), index);
    } else if (fieldType.isRepetition(Type.Repetition.REQUIRED)) {
      throw new RuntimeException("Null-value for required field: " + avroField.name());
    }
    index++;
  }
}
Example #14
Source File: AvroSchemaConverterLogicalTypesPre19.java From datacollector with Apache License 2.0
private Schema convertFields(String name, List<Type> parquetFields) {
  List<Schema.Field> fields = new ArrayList<Schema.Field>();
  for (Type parquetType : parquetFields) {
    Schema fieldSchema = convertField(parquetType);
    if (parquetType.isRepetition(REPEATED)) {
      throw new UnsupportedOperationException("REPEATED not supported outside LIST or MAP. Type: " + parquetType);
    } else if (parquetType.isRepetition(Type.Repetition.OPTIONAL)) {
      fields.add(new Schema.Field(parquetType.getName(), optional(fieldSchema), null, NullNode.getInstance()));
    } else { // REQUIRED
      fields.add(new Schema.Field(parquetType.getName(), fieldSchema, null, null));
    }
  }
  Schema schema = Schema.createRecord(name, null, null, false);
  schema.setFields(fields);
  return schema;
}
Example #15
Source File: SimpleGroup.java From parquet-mr with Apache License 2.0
private StringBuilder appendToString(StringBuilder builder, String indent) {
  int i = 0;
  for (Type field : schema.getFields()) {
    String name = field.getName();
    List<Object> values = data[i];
    ++i;
    if (values != null && !values.isEmpty()) {
      for (Object value : values) {
        builder.append(indent).append(name);
        if (value == null) {
          builder.append(": NULL\n");
        } else if (value instanceof Group) {
          builder.append('\n');
          ((SimpleGroup) value).appendToString(builder, indent + " ");
        } else {
          builder.append(": ").append(value.toString()).append('\n');
        }
      }
    }
  }
  return builder;
}
Example #16
Source File: ParquetValueReaders.java From iceberg with Apache License 2.0
public static <T> ParquetValueReader<T> option(Type type, int definitionLevel,
                                               ParquetValueReader<T> reader) {
  if (type.isRepetition(Type.Repetition.OPTIONAL)) {
    return new OptionReader<>(definitionLevel, reader);
  }
  return reader;
}
Example #17
Source File: AvroSchemaConverterLogicalTypesPre19.java From datacollector with Apache License 2.0
public MessageType convert(Schema avroSchema) {
  LOG.info("Using customized AvroSchemaConverter utility to convert: " + avroSchema.toString());
  if (!avroSchema.getType().equals(Schema.Type.RECORD)) {
    throw new IllegalArgumentException("Avro schema must be a record.");
  }
  return new MessageType(avroSchema.getFullName(), convertFields(avroSchema.getFields()));
}
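For comparison with this customized converter, the stock parquet-avro AvroSchemaConverter is driven the same way; a minimal hedged sketch (the Avro schema contents are invented for illustration):

import org.apache.avro.Schema;
import org.apache.parquet.avro.AvroSchemaConverter;
import org.apache.parquet.schema.MessageType;

public class ConvertAvro {
  public static void main(String[] args) {
    // Invented record schema with one required and one nullable field.
    Schema avro = new Schema.Parser().parse(
        "{\"type\":\"record\",\"name\":\"User\",\"fields\":["
            + "{\"name\":\"id\",\"type\":\"long\"},"
            + "{\"name\":\"email\",\"type\":[\"null\",\"string\"],\"default\":null}]}");
    MessageType parquet = new AvroSchemaConverter().convert(avro);
    System.out.println(parquet); // the null union becomes an optional Parquet field
  }
}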
Example #18
Source File: TajoRecordConverter.java From tajo with Apache License 2.0
/**
 * Creates a new TajoRecordConverter.
 *
 * @param parquetSchema The Parquet schema of the projection.
 * @param tajoReadSchema The Tajo schema of the table.
 * @param projectionMap An array mapping the projection column to the column
 *                      index in the table.
 */
public TajoRecordConverter(GroupType parquetSchema, Schema tajoReadSchema, int[] projectionMap) {
  this.parquetSchema = parquetSchema;
  this.tajoReadSchema = tajoReadSchema;
  this.projectionMap = projectionMap;
  this.tupleSize = tajoReadSchema.size();

  // The projectionMap.length does not match parquetSchema.getFieldCount()
  // when the projection contains NULL_TYPE columns. We will skip over the
  // NULL_TYPE columns when we construct the converters and populate the
  // NULL_TYPE columns with NullDatums in start().
  int index = 0;
  this.converters = new Converter[parquetSchema.getFieldCount()];
  for (int i = 0; i < projectionMap.length; ++i) {
    final int projectionIndex = projectionMap[i];
    Column column = tajoReadSchema.getColumn(projectionIndex);
    if (column.getDataType().getType() == TajoDataTypes.Type.NULL_TYPE) {
      continue;
    }
    Type type = parquetSchema.getType(index);
    final int writeIndex = i;
    converters[index] = newConverter(column, type, new ParentValueContainer() {
      @Override
      void add(Object value) {
        TajoRecordConverter.this.set(writeIndex, value);
      }
    });
    ++index;
  }
}
Example #19
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Returns the parquet record filter for the given filter string
 *
 * @param filterString the filter string
 * @param originalFieldsMap a map of field names to types
 * @param schema the parquet schema
 * @return the parquet record filter for the given filter string
 */
private FilterCompat.Filter getRecordFilter(String filterString,
    Map<String, Type> originalFieldsMap, MessageType schema) {
  if (StringUtils.isBlank(filterString)) {
    return FilterCompat.NOOP;
  }

  ParquetRecordFilterBuilder filterBuilder = new ParquetRecordFilterBuilder(
      context.getTupleDescription(), originalFieldsMap);
  TreeVisitor pruner = new ParquetOperatorPrunerAndTransformer(
      context.getTupleDescription(), originalFieldsMap, SUPPORTED_OPERATORS);

  try {
    // Parse the filter string into an expression tree Node
    Node root = new FilterParser().parse(filterString);
    // Prune the parsed tree down to the supported operators, then traverse
    // the pruned tree with the ParquetRecordFilterBuilder to produce a
    // record filter for parquet
    TRAVERSER.traverse(root, pruner, filterBuilder);
    return filterBuilder.getRecordFilter();
  } catch (Exception e) {
    LOG.error(String.format("%s-%d: %s--%s Unable to generate Parquet Record Filter for filter",
        context.getTransactionId(), context.getSegmentId(), context.getDataSource(),
        context.getFilterString()), e);
    return FilterCompat.NOOP;
  }
}
Example #20
Source File: ParquetTypeVisitor.java From iceberg with Apache License 2.0
private static <T> T visitField(Type field, ParquetTypeVisitor<T> visitor) {
  visitor.fieldNames.push(field.getName());
  try {
    return visit(field, visitor);
  } finally {
    visitor.fieldNames.pop();
  }
}
Example #21
Source File: JsonRecordFormatter.java From parquet-mr with Apache License 2.0
@Override
protected Object formatResults(List<SimpleRecord> values) {
  if (super.typeInfo.getRepetition() == Type.Repetition.REPEATED) {
    List<Object> results = new ArrayList<Object>();
    for (SimpleRecord object : values) {
      results.add(add(object));
    }
    return results;
  } else {
    return add(values.get(SINGLE_VALUE));
  }
}
Example #22
Source File: DataWritableWriter.java From parquet-mr with Apache License 2.0
private void writeArray(final ArrayWritable array, final GroupType type) {
  if (array == null) {
    return;
  }
  final Writable[] subValues = array.get();
  final int fieldCount = type.getFieldCount();
  for (int field = 0; field < fieldCount; ++field) {
    final Type subType = type.getType(field);
    recordConsumer.startField(subType.getName(), field);
    for (int i = 0; i < subValues.length; ++i) {
      final Writable subValue = subValues[i];
      if (subValue != null) {
        if (subType.isPrimitive()) {
          if (subValue instanceof ArrayWritable) {
            writePrimitive(((ArrayWritable) subValue).get()[field]); // 0 ?
          } else {
            writePrimitive(subValue);
          }
        } else {
          if (!(subValue instanceof ArrayWritable)) {
            throw new RuntimeException("This should be an ArrayWritable: " + subValue);
          } else {
            recordConsumer.startGroup();
            writeData((ArrayWritable) subValue, subType.asGroupType());
            recordConsumer.endGroup();
          }
        }
      }
    }
    recordConsumer.endField(subType.getName(), field);
  }
}
Example #23
Source File: PigSchemaConverter.java From parquet-mr with Apache License 2.0
private Type[] convertTypes(Schema pigSchema) {
  List<FieldSchema> fields = pigSchema.getFields();
  Type[] types = new Type[fields.size()];
  for (int i = 0; i < types.length; i++) {
    types[i] = convert(fields.get(i), i);
  }
  return types;
}
Example #24
Source File: HiveClientTest.java From garmadon with Apache License 2.0
@Test(expected = Exception.class)
public void shouldThrowExceptionForUnknownParquetType() throws Exception {
  HiveClient hiveClient = new HiveClient(driverName, "jdbc:hive2://localhost:" + port,
      "garmadon", hdfsTemp + "/garmadon_database");

  PrimitiveType unsupported = new PrimitiveType(Type.Repetition.OPTIONAL,
      PrimitiveType.PrimitiveTypeName.INT96, "unsupported");
  hiveClient.inferHiveType(unsupported);
}
Example #25
Source File: TestParquetVectorizedReads.java From iceberg with Apache License 2.0
@Test
@Override
public void testNestedStruct() {
  AssertHelpers.assertThrows(
      "Vectorized reads are not supported yet for struct fields",
      UnsupportedOperationException.class,
      "Vectorized reads are not supported yet for struct fields",
      () -> VectorizedSparkParquetReaders.buildReader(
          TypeUtil.assignIncreasingFreshIds(new Schema(required(1, "struct", SUPPORTED_PRIMITIVES))),
          new MessageType("struct", new GroupType(Type.Repetition.OPTIONAL, "struct").withId(1)),
          false));
}
Example #26
Source File: RowConverter.java From flink with Apache License 2.0
ArrayConverter(Type elementType, Class elementClass, TypeInformation elementTypeInfo,
    ParentDataHolder parentDataHolder, int pos) {
  this.elementClass = elementClass;
  this.parentDataHolder = parentDataHolder;
  this.pos = pos;
  if (elementClass.equals(Row.class)) {
    this.elementConverter = createConverter(elementType, 0, elementTypeInfo, this);
  } else {
    this.elementConverter = new RowConverter.RowPrimitiveConverter(elementType, this, 0);
  }
}
Example #27
Source File: ThriftRecordConverter.java From parquet-mr with Apache License 2.0
private StructConverter(List<TProtocol> events, GroupType parquetSchema, ThriftField field) {
  this.events = events;
  this.name = field.getName();
  this.tStruct = new TStruct(name);
  this.thriftType = (StructType) field.getType();
  this.schemaSize = parquetSchema.getFieldCount();
  this.converters = new Converter[this.schemaSize];
  List<ThriftField> thriftChildren = thriftType.getChildren();
  for (int i = 0; i < schemaSize; i++) {
    Type schemaType = parquetSchema.getType(i);
    String fieldName = schemaType.getName();
    ThriftField matchingThrift = null;
    for (ThriftField childField : thriftChildren) {
      String thriftChildName = childField.getName();
      if (thriftChildName != null && thriftChildName.equalsIgnoreCase(fieldName)) {
        matchingThrift = childField;
        break;
      }
    }
    if (matchingThrift == null) {
      // this means the file did not contain that field;
      // it will never be populated in this instance,
      // but other files might populate it
      continue;
    }
    if (schemaType.isPrimitive()) {
      converters[i] = new PrimitiveFieldHandler(
          newConverter(events, schemaType, matchingThrift).asPrimitiveConverter(),
          matchingThrift, events);
    } else {
      converters[i] = new GroupFieldhandler(
          newConverter(events, schemaType, matchingThrift).asGroupConverter(),
          matchingThrift, events);
    }
  }
}
Example #28
Source File: ParquetFileAccessor.java From pxf with Apache License 2.0
/**
 * Opens the resource for read.
 *
 * @throws IOException if opening the resource failed
 */
@Override
public boolean openForRead() throws IOException {
  file = new Path(context.getDataSource());
  FileSplit fileSplit = HdfsUtilities.parseFileSplit(context);

  // Read the original schema from the parquet file
  MessageType originalSchema = getSchema(file, fileSplit);
  // Get a map of the column name to Types for the given schema
  Map<String, Type> originalFieldsMap = getOriginalFieldsMap(originalSchema);
  // Get the read schema. This is either the full set or a subset (in
  // case of column projection) of the greenplum schema.
  MessageType readSchema = buildReadSchema(originalFieldsMap, originalSchema);
  // Get the record filter in case of predicate push-down
  FilterCompat.Filter recordFilter = getRecordFilter(context.getFilterString(), originalFieldsMap, readSchema);

  // add column projection
  configuration.set(PARQUET_READ_SCHEMA, readSchema.toString());

  fileReader = ParquetReader.builder(new GroupReadSupport(), file)
      .withConf(configuration)
      // Create reader for a given split, read a range in file
      .withFileRange(fileSplit.getStart(), fileSplit.getStart() + fileSplit.getLength())
      .withFilter(recordFilter)
      .build();
  context.setMetadata(readSchema);
  return true;
}
Example #29
Source File: GenericParquetReaders.java From iceberg with Apache License 2.0
@Override
public ParquetValueReader<?> list(Types.ListType expectedList, GroupType array,
                                  ParquetValueReader<?> elementReader) {
  GroupType repeated = array.getFields().get(0).asGroupType();
  String[] repeatedPath = currentPath();

  int repeatedD = type.getMaxDefinitionLevel(repeatedPath) - 1;
  int repeatedR = type.getMaxRepetitionLevel(repeatedPath) - 1;

  Type elementType = repeated.getType(0);
  int elementD = type.getMaxDefinitionLevel(path(elementType.getName())) - 1;

  return new ListReader<>(repeatedD, repeatedR, option(elementType, elementD, elementReader));
}
Example #30
Source File: ParquetSchemaConverter.java From flink with Apache License 2.0
private static TypeInformation<?> convertParquetPrimitiveListToFlinkArray(Type type) {
  // For backward compatibility, a list whose element group doesn't exist is also allowed
  TypeInformation<?> flinkType = convertParquetTypeToTypeInfo(type);
  if (flinkType.isBasicType()) {
    return BasicArrayTypeInfo.getInfoFor(Array.newInstance(flinkType.getTypeClass(), 0).getClass());
  } else {
    // flinkType here can be either SqlTimeTypeInfo or BasicTypeInfo.BIG_DEC_TYPE_INFO,
    // so it should be converted to ObjectArrayTypeInfo
    return ObjectArrayTypeInfo.getInfoFor(flinkType);
  }
}