Java Code Examples for org.apache.orc.TypeDescription#getChildren()
The following examples show how to use org.apache.orc.TypeDescription#getChildren().
The original project and source file are noted above each example.
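Before the project examples, here is a minimal, self-contained sketch of the getChildren() contract (this sketch is not taken from any of the projects below; the class name GetChildrenDemo is invented for illustration). For STRUCT types the returned list is parallel to getFieldNames(); LIST types have a single child (the element type); MAP types have two children (key, then value); primitive types have no children, so getChildren() returns null and the category should be checked first.

import java.util.List;
import org.apache.orc.TypeDescription;

// Minimal sketch of the getChildren() contract; not from the projects below.
public class GetChildrenDemo {
    public static void main(String[] args) {
        TypeDescription schema = TypeDescription.fromString(
                "struct<name:string,scores:array<int>,props:map<string,string>>");

        // STRUCT: getChildren() is parallel to getFieldNames().
        List<String> names = schema.getFieldNames();
        List<TypeDescription> children = schema.getChildren();
        for (int i = 0; i < children.size(); i++) {
            System.out.println(names.get(i) + " -> " + children.get(i));
        }

        // LIST: exactly one child, the element type.
        TypeDescription scores = children.get(1);
        System.out.println("element: " + scores.getChildren().get(0));

        // MAP: exactly two children, the key type then the value type.
        TypeDescription props = children.get(2);
        System.out.println("key: " + props.getChildren().get(0)
                + ", value: " + props.getChildren().get(1));

        // Primitives have no children: getChildren() returns null.
        System.out.println(children.get(0).getChildren() == null); // true
    }
}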
Example 1
Source File: OrcSchemaConverter.java From pentaho-hadoop-shims with Apache License 2.0 | 6 votes |
public List<IOrcInputField> buildInputFields( TypeDescription typeDescription ) {
  List<IOrcInputField> inputFields = new ArrayList<IOrcInputField>();
  Iterator fieldNameIterator = typeDescription.getFieldNames().iterator();
  for ( TypeDescription subDescription : typeDescription.getChildren() ) {
    //Assume getFieldNames is 1:1 with getChildren
    String fieldName = (String) fieldNameIterator.next();
    int formatType = determineFormatType( subDescription );
    if ( formatType != -1 ) { //Skip orc types we do not support
      int metaType = determineMetaType( subDescription );
      if ( metaType == -1 ) {
        throw new IllegalStateException(
            "Orc Field Name: " + fieldName + " - Could not find pdi field type for "
                + subDescription.getCategory().getName() );
      }
      OrcInputField inputField = new OrcInputField();
      inputField.setFormatFieldName( fieldName );
      inputField.setFormatType( formatType );
      inputField.setPentahoType( metaType );
      inputField.setPentahoFieldName( fieldName );
      inputFields.add( inputField );
    }
  }
  return inputFields;
}
Example 2
Source File: ORCSchemaUtil.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Convert an ORC schema to an Iceberg schema. This method handles the conversion from the original
 * Iceberg column mapping IDs if present in the ORC column attributes; otherwise, ORC column IDs
 * will be assigned following ORC's pre-order ID assignment.
 *
 * @return the Iceberg schema
 */
public static Schema convert(TypeDescription orcSchema) {
  List<TypeDescription> children = orcSchema.getChildren();
  List<String> childrenNames = orcSchema.getFieldNames();
  Preconditions.checkState(children.size() == childrenNames.size(),
      "Error in ORC file, children fields and names do not match.");

  List<Types.NestedField> icebergFields = Lists.newArrayListWithExpectedSize(children.size());
  AtomicInteger lastColumnId = new AtomicInteger(getMaxIcebergId(orcSchema));
  for (int i = 0; i < children.size(); i++) {
    icebergFields.add(convertOrcToIceberg(children.get(i), childrenNames.get(i),
        lastColumnId::incrementAndGet));
  }

  return new Schema(icebergFields);
}
Example 3
Source File: VectorColumnFiller.java From secor with Apache License 2.0 | 5 votes |
public UnionColumnConverter(TypeDescription schema) {
    List<TypeDescription> children = schema.getChildren();
    int index = 0;
    for (TypeDescription childType : children) {
        JsonType jsonType = getJsonType(childType.getCategory());
        JsonConverter converter = createConverter(childType);
        // FIXME: Handle cases where childConverters is pre-occupied with the same mask
        childConverters.put(jsonType, new ConverterInfo(index++, converter));
    }
}
Example 4
Source File: VectorColumnFiller.java From secor with Apache License 2.0 | 5 votes |
public StructColumnConverter(TypeDescription schema) {
    List<TypeDescription> kids = schema.getChildren();
    childrenConverters = new JsonConverter[kids.size()];
    for (int c = 0; c < childrenConverters.length; ++c) {
        childrenConverters[c] = createConverter(kids.get(c));
    }
    fieldNames = schema.getFieldNames();
}
Example 5
Source File: SparkOrcWriter.java From iceberg with Apache License 2.0 | 5 votes |
private static Converter[] buildConverters(TypeDescription schema) {
  if (schema.getCategory() != TypeDescription.Category.STRUCT) {
    throw new IllegalArgumentException("Top level must be a struct " + schema);
  }
  List<TypeDescription> children = schema.getChildren();
  Converter[] result = new Converter[children.size()];
  for (int c = 0; c < children.size(); ++c) {
    result[c] = buildConverter(children.get(c));
  }
  return result;
}
Example 6
Source File: OrcBatchReader.java From flink with Apache License 2.0 | 5 votes |
/**
 * Fills an ORC batch into an array of Row.
 *
 * @param rows The batch of rows to be filled.
 * @param schema The schema of the ORC data.
 * @param batch The ORC data.
 * @param selectedFields The list of selected ORC fields.
 * @return The number of rows that were filled.
 */
static int fillRows(Row[] rows, TypeDescription schema, VectorizedRowBatch batch, int[] selectedFields) {
    int rowsToRead = Math.min((int) batch.count(), rows.length);

    List<TypeDescription> fieldTypes = schema.getChildren();
    // read each selected field
    for (int fieldIdx = 0; fieldIdx < selectedFields.length; fieldIdx++) {
        int orcIdx = selectedFields[fieldIdx];
        readField(rows, fieldIdx, fieldTypes.get(orcIdx), batch.cols[orcIdx], rowsToRead);
    }
    return rowsToRead;
}
Example 7
Source File: JsonFieldFiller.java From secor with Apache License 2.0 | 5 votes |
private static void setMap(JSONWriter writer, MapColumnVector vector,
        TypeDescription schema, int row) throws JSONException {
    writer.object();
    List<TypeDescription> schemaChildren = schema.getChildren();
    BytesColumnVector keyVector = (BytesColumnVector) vector.keys;
    long length = vector.lengths[row];
    long offset = vector.offsets[row];
    for (int i = 0; i < length; i++) {
        writer.key(keyVector.toString((int) offset + i));
        setValue(writer, vector.values, schemaChildren.get(1), (int) offset + i);
    }
    writer.endObject();
}
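Note that for a MAP schema, getChildren() returns exactly two children: the key type at index 0 and the value type at index 1. That is why the loop above writes values with schemaChildren.get(1) while the keys come straight from the key vector.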
Example 8
Source File: OrcSchemaWithTypeVisitor.java From iceberg with Apache License 2.0 | 5 votes |
private static <T> T visitRecord(
    Types.StructType struct, TypeDescription record, OrcSchemaWithTypeVisitor<T> visitor) {
  List<TypeDescription> fields = record.getChildren();
  List<String> names = record.getFieldNames();
  List<T> results = Lists.newArrayListWithExpectedSize(fields.size());
  for (TypeDescription field : fields) {
    int fieldId = ORCSchemaUtil.fieldId(field);
    Types.NestedField iField = struct != null ? struct.field(fieldId) : null;
    results.add(visit(iField != null ? iField.type() : null, field, visitor));
  }
  return visitor.record(struct, record, names, results);
}
Example 9
Source File: ORCSchemaUtil.java From iceberg with Apache License 2.0 | 5 votes |
private static Map<Integer, OrcField> icebergToOrcMapping(String name, TypeDescription orcType) {
  Map<Integer, OrcField> icebergToOrc = Maps.newHashMap();
  switch (orcType.getCategory()) {
    case STRUCT:
      List<String> childrenNames = orcType.getFieldNames();
      List<TypeDescription> children = orcType.getChildren();
      for (int i = 0; i < children.size(); i++) {
        icebergToOrc.putAll(icebergToOrcMapping(childrenNames.get(i), children.get(i)));
      }
      break;
    case LIST:
      icebergToOrc.putAll(icebergToOrcMapping("element", orcType.getChildren().get(0)));
      break;
    case MAP:
      icebergToOrc.putAll(icebergToOrcMapping("key", orcType.getChildren().get(0)));
      icebergToOrc.putAll(icebergToOrcMapping("value", orcType.getChildren().get(1)));
      break;
  }

  if (orcType.getId() > 0) {
    // Only add to non-root types.
    icebergID(orcType)
        .ifPresent(integer -> icebergToOrc.put(integer, new OrcField(name, orcType)));
  }

  return icebergToOrc;
}
Example 10
Source File: JsonFieldFiller.java From secor with Apache License 2.0 | 5 votes |
private static void setStruct(JSONWriter writer, StructColumnVector batch,
        TypeDescription schema, int row) throws JSONException {
    writer.object();
    List<String> fieldNames = schema.getFieldNames();
    List<TypeDescription> fieldTypes = schema.getChildren();
    for (int i = 0; i < fieldTypes.size(); ++i) {
        writer.key(fieldNames.get(i));
        setValue(writer, batch.fields[i], fieldTypes.get(i), row);
    }
    writer.endObject();
}
Example 11
Source File: VectorColumnFiller.java From secor with Apache License 2.0 | 5 votes |
public MapColumnConverter(TypeDescription schema) {
    assertKeyType(schema);

    List<TypeDescription> childTypes = schema.getChildren();
    childConverters = new JsonConverter[childTypes.size()];
    for (int c = 0; c < childConverters.length; ++c) {
        childConverters[c] = createConverter(childTypes.get(c));
    }
}
Example 12
Source File: OrcBatchReader.java From flink with Apache License 2.0 | 5 votes |
private static void readNonNullStructColumn(Object[] vals, int fieldIdx,
        StructColumnVector structVector, TypeDescription schema, int childCount) {

    List<TypeDescription> childrenTypes = schema.getChildren();
    int numFields = childrenTypes.size();

    // create a batch of Rows to read the structs
    Row[] structs = new Row[childCount];
    // TODO: possible improvement: reuse existing Row objects
    for (int i = 0; i < childCount; i++) {
        structs[i] = new Row(numFields);
    }

    // read struct fields
    // we don't have to handle isRepeating because ORC assumes that it is propagated into the children.
    for (int i = 0; i < numFields; i++) {
        readField(structs, i, childrenTypes.get(i), structVector.fields[i], childCount);
    }

    if (fieldIdx == -1) { // set struct as an object
        System.arraycopy(structs, 0, vals, 0, childCount);
    } else { // set struct as a field of Row
        Row[] rows = (Row[]) vals;
        for (int i = 0; i < childCount; i++) {
            rows[i].setField(fieldIdx, structs[i]);
        }
    }
}
Example 13
Source File: JsonFieldFiller.java From secor with Apache License 2.0 | 5 votes |
/**
 * Writes a single row of union type as a JSON object.
 *
 * @throws JSONException
 */
private static void setUnion(JSONWriter writer, UnionColumnVector vector,
        TypeDescription schema, int row) throws JSONException {
    int tag = vector.tags[row];
    List<TypeDescription> schemaChildren = schema.getChildren();
    ColumnVector columnVector = vector.fields[tag];
    setValue(writer, columnVector, schemaChildren.get(tag), row);
}
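For a UNION schema, getChildren() lists the union's member types in tag order, so the tag read from vector.tags[row] selects both the child vector (vector.fields[tag]) and the matching child schema (schemaChildren.get(tag)).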
Example 14
Source File: ORCSchemaUtil.java From iceberg with Apache License 2.0 | 4 votes |
private static Types.NestedField convertOrcToIceberg(TypeDescription orcType, String name,
                                                     TypeUtil.NextID nextID) {

  final int icebergID = icebergID(orcType).orElseGet(nextID::get);
  final boolean isRequired = isRequired(orcType);

  switch (orcType.getCategory()) {
    case BOOLEAN:
      return getIcebergType(icebergID, name, Types.BooleanType.get(), isRequired);
    case BYTE:
    case SHORT:
    case INT:
      return getIcebergType(icebergID, name, Types.IntegerType.get(), isRequired);
    case LONG:
      String longAttributeValue = orcType.getAttributeValue(ICEBERG_LONG_TYPE_ATTRIBUTE);
      LongType longType = longAttributeValue == null ? LongType.LONG :
          LongType.valueOf(longAttributeValue);
      switch (longType) {
        case TIME:
          return getIcebergType(icebergID, name, Types.TimeType.get(), isRequired);
        case LONG:
          return getIcebergType(icebergID, name, Types.LongType.get(), isRequired);
        default:
          throw new IllegalStateException("Invalid Long type found in ORC type attribute");
      }
    case FLOAT:
      return getIcebergType(icebergID, name, Types.FloatType.get(), isRequired);
    case DOUBLE:
      return getIcebergType(icebergID, name, Types.DoubleType.get(), isRequired);
    case STRING:
    case CHAR:
    case VARCHAR:
      return getIcebergType(icebergID, name, Types.StringType.get(), isRequired);
    case BINARY:
      String binaryAttributeValue = orcType.getAttributeValue(ICEBERG_BINARY_TYPE_ATTRIBUTE);
      BinaryType binaryType = binaryAttributeValue == null ? BinaryType.BINARY :
          BinaryType.valueOf(binaryAttributeValue);
      switch (binaryType) {
        case UUID:
          return getIcebergType(icebergID, name, Types.UUIDType.get(), isRequired);
        case FIXED:
          int fixedLength = Integer.parseInt(orcType.getAttributeValue(ICEBERG_FIELD_LENGTH));
          return getIcebergType(icebergID, name, Types.FixedType.ofLength(fixedLength), isRequired);
        case BINARY:
          return getIcebergType(icebergID, name, Types.BinaryType.get(), isRequired);
        default:
          throw new IllegalStateException("Invalid Binary type found in ORC type attribute");
      }
    case DATE:
      return getIcebergType(icebergID, name, Types.DateType.get(), isRequired);
    case TIMESTAMP:
      return getIcebergType(icebergID, name, Types.TimestampType.withoutZone(), isRequired);
    case TIMESTAMP_INSTANT:
      return getIcebergType(icebergID, name, Types.TimestampType.withZone(), isRequired);
    case DECIMAL:
      return getIcebergType(icebergID, name,
          Types.DecimalType.of(orcType.getPrecision(), orcType.getScale()), isRequired);
    case STRUCT: {
      List<String> fieldNames = orcType.getFieldNames();
      List<TypeDescription> fieldTypes = orcType.getChildren();
      List<Types.NestedField> fields = new ArrayList<>(fieldNames.size());
      for (int c = 0; c < fieldNames.size(); ++c) {
        String childName = fieldNames.get(c);
        TypeDescription type = fieldTypes.get(c);
        Types.NestedField field = convertOrcToIceberg(type, childName, nextID);
        fields.add(field);
      }
      return getIcebergType(icebergID, name, Types.StructType.of(fields), isRequired);
    }
    case LIST: {
      TypeDescription elementType = orcType.getChildren().get(0);
      Types.NestedField element = convertOrcToIceberg(elementType, "element", nextID);
      Types.ListType listTypeWithElem = isRequired(elementType) ?
          Types.ListType.ofRequired(element.fieldId(), element.type()) :
          Types.ListType.ofOptional(element.fieldId(), element.type());
      return isRequired ?
          Types.NestedField.required(icebergID, name, listTypeWithElem) :
          Types.NestedField.optional(icebergID, name, listTypeWithElem);
    }
    case MAP: {
      TypeDescription keyType = orcType.getChildren().get(0);
      Types.NestedField key = convertOrcToIceberg(keyType, "key", nextID);
      TypeDescription valueType = orcType.getChildren().get(1);
      Types.NestedField value = convertOrcToIceberg(valueType, "value", nextID);
      Types.MapType mapTypeWithKV = isRequired(valueType) ?
          Types.MapType.ofRequired(key.fieldId(), value.fieldId(), key.type(), value.type()) :
          Types.MapType.ofOptional(key.fieldId(), value.fieldId(), key.type(), value.type());
      return getIcebergType(icebergID, name, mapTypeWithKV, isRequired);
    }
    default:
      // We don't have an answer for union types.
      throw new IllegalArgumentException("Can't handle " + orcType);
  }
}
Example 15
Source File: OrcBatchReader.java From flink with Apache License 2.0 | 4 votes |
/**
 * Converts an ORC schema to a Flink TypeInformation.
 *
 * @param schema The ORC schema.
 * @return The TypeInformation that corresponds to the ORC schema.
 */
static TypeInformation schemaToTypeInfo(TypeDescription schema) {
    switch (schema.getCategory()) {
        case BOOLEAN:
            return BasicTypeInfo.BOOLEAN_TYPE_INFO;
        case BYTE:
            return BasicTypeInfo.BYTE_TYPE_INFO;
        case SHORT:
            return BasicTypeInfo.SHORT_TYPE_INFO;
        case INT:
            return BasicTypeInfo.INT_TYPE_INFO;
        case LONG:
            return BasicTypeInfo.LONG_TYPE_INFO;
        case FLOAT:
            return BasicTypeInfo.FLOAT_TYPE_INFO;
        case DOUBLE:
            return BasicTypeInfo.DOUBLE_TYPE_INFO;
        case DECIMAL:
            return BasicTypeInfo.BIG_DEC_TYPE_INFO;
        case STRING:
        case CHAR:
        case VARCHAR:
            return BasicTypeInfo.STRING_TYPE_INFO;
        case DATE:
            return SqlTimeTypeInfo.DATE;
        case TIMESTAMP:
            return SqlTimeTypeInfo.TIMESTAMP;
        case BINARY:
            return PrimitiveArrayTypeInfo.BYTE_PRIMITIVE_ARRAY_TYPE_INFO;
        case STRUCT:
            List<TypeDescription> fieldSchemas = schema.getChildren();
            TypeInformation[] fieldTypes = new TypeInformation[fieldSchemas.size()];
            for (int i = 0; i < fieldSchemas.size(); i++) {
                fieldTypes[i] = schemaToTypeInfo(fieldSchemas.get(i));
            }
            String[] fieldNames = schema.getFieldNames().toArray(new String[]{});
            return new RowTypeInfo(fieldTypes, fieldNames);
        case LIST:
            TypeDescription elementSchema = schema.getChildren().get(0);
            TypeInformation<?> elementType = schemaToTypeInfo(elementSchema);
            // arrays of primitive types are handled as object arrays to support null values
            return ObjectArrayTypeInfo.getInfoFor(elementType);
        case MAP:
            TypeDescription keySchema = schema.getChildren().get(0);
            TypeDescription valSchema = schema.getChildren().get(1);
            TypeInformation<?> keyType = schemaToTypeInfo(keySchema);
            TypeInformation<?> valType = schemaToTypeInfo(valSchema);
            return new MapTypeInfo<>(keyType, valType);
        case UNION:
            throw new UnsupportedOperationException("UNION type is not supported yet.");
        default:
            throw new IllegalArgumentException("Unknown type " + schema);
    }
}
Example 16
Source File: OrcBatchReader.java From flink with Apache License 2.0 | 4 votes |
@SuppressWarnings("unchecked")
private static Function<Object, Object> getCopyFunction(TypeDescription schema) {
    // check the type of the vector to decide how to read it.
    switch (schema.getCategory()) {
        case BOOLEAN:
        case BYTE:
        case SHORT:
        case INT:
        case LONG:
        case FLOAT:
        case DOUBLE:
        case CHAR:
        case VARCHAR:
        case STRING:
        case DECIMAL:
            return OrcBatchReader::returnImmutable;
        case DATE:
            return OrcBatchReader::copyDate;
        case TIMESTAMP:
            return OrcBatchReader::copyTimestamp;
        case BINARY:
            return OrcBatchReader::copyBinary;
        case STRUCT:
            List<TypeDescription> fieldTypes = schema.getChildren();
            Function<Object, Object>[] copyFields = new Function[fieldTypes.size()];
            for (int i = 0; i < fieldTypes.size(); i++) {
                copyFields[i] = getCopyFunction(fieldTypes.get(i));
            }
            return new CopyStruct(copyFields);
        case LIST:
            TypeDescription entryType = schema.getChildren().get(0);
            Function<Object, Object> copyEntry = getCopyFunction(entryType);
            Class entryClass = getClassForType(entryType);
            return new CopyList(copyEntry, entryClass);
        case MAP:
            TypeDescription keyType = schema.getChildren().get(0);
            TypeDescription valueType = schema.getChildren().get(1);
            Function<Object, Object> copyKey = getCopyFunction(keyType);
            Function<Object, Object> copyValue = getCopyFunction(valueType);
            return new CopyMap(copyKey, copyValue);
        case UNION:
            throw new UnsupportedOperationException("UNION type not supported yet");
        default:
            throw new IllegalArgumentException("Unknown type " + schema);
    }
}
Example 17
Source File: OrcTestUtils.java From incubator-gobblin with Apache License 2.0 | 4 votes |
/**
 * Fill in value in OrcStruct with given schema, assuming {@param w} contains the same schema as {@param schema}.
 * {@param schema} is still necessary even though {@param w} does contain schema information itself, because the
 * actual value type is only available in {@link TypeDescription} but not {@link org.apache.orc.mapred.OrcValue}.
 *
 * For simplicity here are some assumptions:
 * - We only give 3 primitive values and use them to construct compound values. To make it work for different
 *   types that can be widened or shrunk to each other, please use values within a small range.
 * - For List, Map or Union, make sure there's at least one entry within the record-container.
 *   You may want to try createValueRecursively(TypeDescription) instead of
 *   {@link OrcStruct#createValue(TypeDescription)}.
 */
public static void fillOrcStructWithFixedValue(WritableComparable w, TypeDescription schema, int unionTag,
    int intValue, String stringValue, boolean booleanValue) {
  switch (schema.getCategory()) {
    case BOOLEAN:
      ((BooleanWritable) w).set(booleanValue);
      break;
    case BYTE:
      ((ByteWritable) w).set((byte) intValue);
      break;
    case SHORT:
      ((ShortWritable) w).set((short) intValue);
      break;
    case INT:
      ((IntWritable) w).set(intValue);
      break;
    case LONG:
      ((LongWritable) w).set(intValue);
      break;
    case FLOAT:
      ((FloatWritable) w).set(intValue * 1.0f);
      break;
    case DOUBLE:
      ((DoubleWritable) w).set(intValue * 1.0);
      break;
    case STRING:
    case CHAR:
    case VARCHAR:
      ((Text) w).set(stringValue);
      break;
    case BINARY:
      throw new UnsupportedOperationException("Binary type is not supported in random orc data filler");
    case DECIMAL:
      throw new UnsupportedOperationException("Decimal type is not supported in random orc data filler");
    case DATE:
    case TIMESTAMP:
    case TIMESTAMP_INSTANT:
      throw new UnsupportedOperationException(
          "Timestamp and its derived types are not supported in random orc data filler");
    case LIST:
      OrcList castedList = (OrcList) w;
      // Here it is not trivial to create typed-object in element-type. So this method expects the value
      // container to contain at least one element, or the traversing within the list will be skipped.
      for (Object i : castedList) {
        fillOrcStructWithFixedValue((WritableComparable) i, schema.getChildren().get(0), unionTag,
            intValue, stringValue, booleanValue);
      }
      break;
    case MAP:
      OrcMap castedMap = (OrcMap) w;
      for (Object entry : castedMap.entrySet()) {
        Map.Entry<WritableComparable, WritableComparable> castedEntry =
            (Map.Entry<WritableComparable, WritableComparable>) entry;
        fillOrcStructWithFixedValue(castedEntry.getKey(), schema.getChildren().get(0), unionTag,
            intValue, stringValue, booleanValue);
        fillOrcStructWithFixedValue(castedEntry.getValue(), schema.getChildren().get(1), unionTag,
            intValue, stringValue, booleanValue);
      }
      break;
    case STRUCT:
      OrcStruct castedStruct = (OrcStruct) w;
      int fieldIdx = 0;
      for (TypeDescription child : schema.getChildren()) {
        fillOrcStructWithFixedValue(castedStruct.getFieldValue(fieldIdx), child, unionTag,
            intValue, stringValue, booleanValue);
        fieldIdx += 1;
      }
      break;
    case UNION:
      OrcUnion castedUnion = (OrcUnion) w;
      TypeDescription targetMemberSchema = schema.getChildren().get(unionTag);
      castedUnion.set(unionTag, OrcUtils.createValueRecursively(targetMemberSchema));
      fillOrcStructWithFixedValue((WritableComparable) castedUnion.getObject(), targetMemberSchema,
          unionTag, intValue, stringValue, booleanValue);
      break;
    default:
      throw new IllegalArgumentException("Unknown type " + schema.toString());
  }
}