Java Code Examples for org.apache.arrow.vector.types.Types#MinorType
The following examples show how to use
org.apache.arrow.vector.types.Types#MinorType.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: StructArrowValueProjector.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Creates a projector over a struct reader and prepares one Projection per child field,
 * keyed by the child's name.
 *
 * @param structReader The reader for the struct being projected; must not be null.
 */
public StructArrowValueProjector(FieldReader structReader) {
    this.structReader = requireNonNull(structReader, "structReader is null");

    ImmutableMap.Builder<String, Projection> byChildName = ImmutableMap.builder();
    for (Field childField : structReader.getField().getChildren()) {
        String name = childField.getName();
        // The projection is selected from the child's own Arrow minor type.
        Projection childProjection =
                createValueProjection(Types.getMinorTypeForArrowType(childField.getType()));
        byChildName.put(name, childProjection);
    }
    this.projectionsMap = byChildName.build();
}
Example 2
Source File: DDBTypeUtils.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Converts a BigDecimal to the boxed Java type that corresponds to the requested Arrow minor type.
 *
 * @param value The decimal value to convert.
 * @param fieldType The Arrow minor type the caller expects.
 * @return An Integer for INT/TINYINT/SMALLINT, a Long for BIGINT, a Float for FLOAT4, a Double for
 *         FLOAT8; for every other type the BigDecimal itself is returned unchanged.
 */
private static Object coerceDecimalToExpectedType(BigDecimal value, Types.MinorType fieldType) {
    final Object coerced;
    switch (fieldType) {
        case TINYINT:
        case SMALLINT:
        case INT:
            coerced = value.intValue();
            break;
        case BIGINT:
            coerced = value.longValue();
            break;
        case FLOAT4:
            coerced = value.floatValue();
            break;
        case FLOAT8:
            coerced = value.doubleValue();
            break;
        default:
            // No narrowing defined for this type; hand the decimal back as-is.
            coerced = value;
            break;
    }
    return coerced;
}
Example 3
Source File: ArrowValueProjectorImpl.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Builds the Projection for a complex (LIST or STRUCT) minor type. The returned projection
 * creates a new sub-projector for the reader it is given and delegates to its doProject().
 *
 * @param minorType The complex type to project.
 * @return A Projection for LIST or STRUCT values.
 * @throws IllegalArgumentException if minorType is neither LIST nor STRUCT.
 */
private Projection createComplexValueProjection(Types.MinorType minorType) {
    switch (minorType) {
        case STRUCT:
            return (fieldReader) -> new StructArrowValueProjector(fieldReader).doProject();
        case LIST:
            return (fieldReader) -> new ListArrowValueProjector(fieldReader).doProject();
        default:
            throw new IllegalArgumentException("Unsupported type " + minorType);
    }
}
Example 4
Source File: SchemaUtils.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/** * Used to merge LIST Field into a single Field. If called with two identical LISTs the output is essentially * the same as either of the inputs. * * @param fieldName The name of the merged Field. * @param curParentField The current field to use as the base for the merge. * @param newParentField The new field to merge into the base. * @return The merged field. */ private static Field mergeListField(String fieldName, Field curParentField, Field newParentField) { //Apache Arrow lists have a special child that holds the concrete type of the list. Types.MinorType newInnerType = Types.getMinorTypeForArrowType(curParentField.getChildren().get(0).getType()); Types.MinorType curInnerType = Types.getMinorTypeForArrowType(newParentField.getChildren().get(0).getType()); if (newInnerType == Types.MinorType.LIST && curInnerType == Types.MinorType.LIST) { return FieldBuilder.newBuilder(fieldName, Types.MinorType.LIST.getType()) .addField(mergeStructField("", curParentField.getChildren().get(0), newParentField.getChildren().get(0))).build(); } else if (curInnerType != newInnerType) { //TODO: currently we resolve fields with mixed types by defaulting to VARCHAR. This is _not_ ideal logger.warn("mergeListField: Encountered a mixed-type list field[{}] {} vs {}, defaulting to String.", fieldName, curInnerType, newInnerType); return FieldBuilder.newBuilder(fieldName, Types.MinorType.LIST.getType()).addStringField("").build(); } return curParentField; }
Example 5
Source File: HbaseRecordHandler.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/** * Addes the specified Apache Arrow field to the Scan to satisfy the requested projection. * * @param scan The scan object that will be used to read data from HBase. * @param field The field to be added to the scan. */ private void addToProjection(Scan scan, Field field) { //ignore the special 'row' column since we get that by default. if (HbaseSchemaUtils.ROW_COLUMN_NAME.equalsIgnoreCase(field.getName())) { return; } Types.MinorType columnType = Types.getMinorTypeForArrowType(field.getType()); switch (columnType) { case STRUCT: for (Field child : field.getChildren()) { scan.addColumn(field.getName().getBytes(UTF_8), child.getName().getBytes(UTF_8)); } return; default: String[] nameParts = HbaseSchemaUtils.extractColumnParts(field.getName()); if (nameParts.length != 2) { throw new RuntimeException("Column name " + field.getName() + " does not meet family:column hbase convention."); } scan.addColumn(nameParts[0].getBytes(UTF_8), nameParts[1].getBytes(UTF_8)); } }
Example 6
Source File: ElasticsearchFieldResolver.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/** * Allows for coercion of a list of values where the returned types do not match the schema. * Multiple fields in Elasticsearch can be returned as a string, numeric (Integer, Long, Double), or null. * @param field is the field that we are coercing the value into. * @param fieldValue is the list of value to coerce * @return the coerced list of value. * @throws RuntimeException if the fieldType is not a LIST or the fieldValue is instanceof Map (STRUCT). */ protected Object coerceListField(Field field, Object fieldValue) throws RuntimeException { Types.MinorType fieldType = Types.getMinorTypeForArrowType(field.getType()); switch (fieldType) { case LIST: Field childField = field.getChildren().get(0); if (fieldValue instanceof List) { // Both fieldType and fieldValue are lists => Return as a new list of values, applying coercion // where necessary in order to match the type of the field being mapped into. List<Object> coercedValues = new ArrayList<>(); ((List) fieldValue).forEach(value -> coercedValues.add(coerceField(childField, value))); return coercedValues; } else if (!(fieldValue instanceof Map)) { // This is an abnormal case where the fieldType was defined as a list in the schema, // however, the fieldValue returns as a single value => Return as a list of a single value // applying coercion where necessary in order to match the type of the field being mapped into. return Collections.singletonList(coerceField(childField, fieldValue)); } break; default: break; } throw new RuntimeException("Invalid field value encountered in Document for field: " + field.toString() + ",value: " + fieldValue.toString()); }
Example 7
Source File: DDBTypeUtils.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Converts a String or BigDecimal value into the date/time representation matching the
 * requested Arrow minor type.
 *
 * @param value The raw value; only String and BigDecimal inputs are converted.
 * @param fieldType The Arrow minor type the caller expects.
 * @param customerConfiguredFormat The format passed through to the String conversions.
 * @param defaultTimeZone The time zone passed through to every conversion.
 * @return The converted value, or the original value when no conversion applies or when the
 *         conversion throws an IllegalArgumentException.
 */
private static Object coerceDateTimeToExpectedType(Object value, Types.MinorType fieldType,
        String customerConfiguredFormat, ZoneId defaultTimeZone) {
    try {
        if (value instanceof String) {
            String text = (String) value;
            switch (fieldType) {
                case DATEDAY:
                    return DateTimeFormatterUtil.stringToLocalDate(text, customerConfiguredFormat, defaultTimeZone);
                case DATEMILLI:
                    return DateTimeFormatterUtil.stringToDateTime(text, customerConfiguredFormat, defaultTimeZone);
                case TIMESTAMPMILLITZ:
                    return DateTimeFormatterUtil.stringToZonedDateTime(text, customerConfiguredFormat, defaultTimeZone);
                default:
                    return value;
            }
        }

        if (value instanceof BigDecimal) {
            BigDecimal number = (BigDecimal) value;
            switch (fieldType) {
                case DATEDAY:
                    return DateTimeFormatterUtil.bigDecimalToLocalDate(number, defaultTimeZone);
                case DATEMILLI:
                    return DateTimeFormatterUtil.bigDecimalToLocalDateTime(number, defaultTimeZone);
                default:
                    return value;
            }
        }

        // Neither a String nor a BigDecimal: nothing to convert.
        return value;
    }
    catch (IllegalArgumentException ex) {
        // Best effort: report the failure and fall back to the unconverted value.
        ex.printStackTrace();
        return value;
    }
}
Example 8
Source File: HbaseSchemaUtils.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Converts a raw HBase value into the Java object matching the requested Apache Arrow type.
 *
 * @param isNative If true, numeric and boolean values are decoded from their binary form; if false,
 *                 the bytes are first turned into a String and then parsed.
 * @param type The Apache Arrow type the value should be coerced to.
 * @param value The HBase value to coerce; may be null.
 * @return The coerced value, or null when value is null.
 * @throws IllegalArgumentException if the Arrow type is not one of the supported types.
 */
public static Object coerceType(boolean isNative, ArrowType type, byte[] value) {
    if (value == null) {
        return null;
    }

    Types.MinorType minorType = Types.getMinorTypeForArrowType(type);
    switch (minorType) {
        case VARBINARY:
            return value;
        case VARCHAR:
            return Bytes.toString(value);
        case BIT:
            if (isNative) {
                // In native form, any non-zero first byte means true.
                return (value[0] != 0);
            }
            return Boolean.parseBoolean(Bytes.toString(value));
        case INT:
            if (isNative) {
                return ByteBuffer.wrap(value).getInt();
            }
            return Integer.parseInt(Bytes.toString(value));
        case BIGINT:
            if (isNative) {
                return ByteBuffer.wrap(value).getLong();
            }
            return Long.parseLong(Bytes.toString(value));
        case FLOAT4:
            if (isNative) {
                return ByteBuffer.wrap(value).getFloat();
            }
            return Float.parseFloat(Bytes.toString(value));
        case FLOAT8:
            if (isNative) {
                return ByteBuffer.wrap(value).getDouble();
            }
            return Double.parseDouble(Bytes.toString(value));
        default:
            throw new IllegalArgumentException(type + " with minorType[" + minorType + "] is not supported.");
    }
}
Example 9
Source File: DDBRecordMetadata.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Determines whether the schema contains any field whose type is treated as coercible.
 *
 * @param schema Schema to inspect; may be null.
 * @return true as soon as a field is a datetime type or is any type other than DECIMAL;
 *         false for a null schema, a null field list, or when no field qualifies.
 */
private boolean isContainsCoercibleType(Schema schema) {
    if (schema == null || schema.getFields() == null) {
        return false;
    }

    for (Field candidate : schema.getFields()) {
        Types.MinorType candidateType = Types.getMinorTypeForArrowType(candidate.getType());
        // NOTE(review): the datetime types are themselves not DECIMAL, so this condition appears to
        // reduce to "type is not DECIMAL" - confirm that is the intended rule.
        boolean dateTime = isDateTimeFieldType(candidateType);
        if (dateTime || !candidateType.equals(Types.MinorType.DECIMAL)) {
            return true;
        }
    }
    return false;
}
Example 10
Source File: FieldResolver.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Resolves the value for a field from its parent value.
 *
 * @param field The field being resolved.
 * @param value The parent value: a Map to look the field name up in, or a List when the field is a LIST.
 * @return The map entry for the field's name when value is a Map; otherwise an iterator over value
 *         when the field's type is LIST.
 * @throws RuntimeException if value is not a Map and the field's type is not LIST.
 */
public Object getFieldValue(Field field, Object value) {
    Types.MinorType minorType = Types.getMinorTypeForArrowType(field.getType());

    // A Map parent takes precedence regardless of the field's declared type.
    if (value instanceof Map) {
        Map<String, Object> parent = (Map<String, Object>) value;
        return parent.get(field.getName());
    }

    if (minorType == Types.MinorType.LIST) {
        List<?> elements = (List<?>) value;
        return elements.iterator();
    }

    throw new RuntimeException("Expected LIST type but found " + minorType);
}
Example 11
Source File: ElasticsearchTypeUtils.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Selects the extractor factory method that matches the field's Arrow minor type.
 *
 * @param field The field whose type decides which extractor is created.
 * @return The extractor for the field, or null when the field's type has no extractor here.
 */
protected Extractor makeExtractor(Field field) {
    final Extractor extractor;
    switch (Types.getMinorTypeForArrowType(field.getType())) {
        case TINYINT:
            extractor = makeTinyIntExtractor(field);
            break;
        case SMALLINT:
            extractor = makeSmallIntExtractor(field);
            break;
        case INT:
            extractor = makeIntExtractor(field);
            break;
        case BIGINT:
            extractor = makeBigIntExtractor(field);
            break;
        case FLOAT4:
            extractor = makeFloat4Extractor(field);
            break;
        case FLOAT8:
            extractor = makeFloat8Extractor(field);
            break;
        case BIT:
            extractor = makeBitExtractor(field);
            break;
        case DATEMILLI:
            extractor = makeDateMilliExtractor(field);
            break;
        case VARCHAR:
            extractor = makeVarCharExtractor(field);
            break;
        default:
            // Types without a dedicated extractor are signalled with null.
            extractor = null;
            break;
    }
    return extractor;
}
Example 12
Source File: DDBRecordMetadata.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Checks whether the given minor type is one of the datetime field types.
 *
 * @param fieldType The minor type to check; must not be null.
 * @return true for DATEMILLI, DATEDAY and TIMESTAMPMILLITZ, otherwise false.
 */
public static boolean isDateTimeFieldType(Types.MinorType fieldType) {
    switch (fieldType) {
        case DATEMILLI:
        case DATEDAY:
        case TIMESTAMPMILLITZ:
            return true;
        default:
            return false;
    }
}
Example 13
Source File: ListArrowValueProjector.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Creates a projector over a list reader and prepares the Projection for the list's element type.
 *
 * @param listReader The reader for the list being projected; must not be null.
 * @throws RuntimeException if the list field does not have exactly one child.
 */
public ListArrowValueProjector(FieldReader listReader) {
    this.listReader = requireNonNull(listReader, "listReader is null");

    List<Field> elementFields = listReader.getField().getChildren();
    if (elementFields.size() != 1) {
        throw new RuntimeException("Unexpected number of children for ListProjector field "
                + listReader.getField().getName());
    }

    // The single child carries the element type of the list.
    Field elementField = elementFields.get(0);
    projection = createValueProjection(Types.getMinorTypeForArrowType(elementField.getType()));
}
Example 14
Source File: SchemaUtils.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/** * Used to merge STRUCT Field into a single Field. If called with two identical STRUCTs the output is essentially * the same as either of the inputs. * * @param fieldName The name of the merged Field. * @param curParentField The current field to use as the base for the merge. * @param newParentField The new field to merge into the base. * @return The merged field. */ private static Field mergeStructField(String fieldName, Field curParentField, Field newParentField) { FieldBuilder union = FieldBuilder.newBuilder(fieldName, Types.MinorType.STRUCT.getType()); for (Field nextCur : curParentField.getChildren()) { union.addField(nextCur); } for (Field nextNew : newParentField.getChildren()) { Field curField = union.getChild(nextNew.getName()); if (curField == null) { union.addField(nextNew); continue; } Types.MinorType newType = Types.getMinorTypeForArrowType(nextNew.getType()); Types.MinorType curType = Types.getMinorTypeForArrowType(curField.getType()); if (curType != newType) { //TODO: currently we resolve fields with mixed types by defaulting to VARCHAR. This is _not_ ideal //for various reasons but also because it will cause predicate odities if used in a filter. logger.warn("mergeStructField: Encountered a mixed-type field[{}] {} vs {}, defaulting to String.", nextNew.getName(), newType, curType); union.addStringField(nextNew.getName()); } else if (curType == Types.MinorType.LIST) { union.addField(mergeListField(nextNew.getName(), curField, nextNew)); } else if (curType == Types.MinorType.STRUCT) { union.addField(mergeStructField(nextNew.getName(), curField, nextNew)); } } return union.build(); }
Example 15
Source File: ArrowValueProjectorImpl.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Concrete implementations of ArrowValueProjectorImpl should invoke this method to get the Projection
 * instance. LIST and STRUCT are routed to the complex projection factory, everything else to the
 * simple one.
 *
 * @param minorType The Arrow minor type to create a projection for.
 * @return Projection used by child class to do actual projection work.
 */
protected Projection createValueProjection(Types.MinorType minorType) {
    final boolean complex;
    switch (minorType) {
        case STRUCT:
        case LIST:
            complex = true;
            break;
        default:
            complex = false;
            break;
    }
    return complex
            ? createComplexValueProjection(minorType)
            : createSimpleValueProjection(minorType);
}
Example 16
Source File: SchemaUtils.java From aws-athena-query-federation with Apache License 2.0 | 4 votes |
/** * This method will produce an Apache Arrow Schema for the given TableName and DocumentDB connection * by scanning up to the requested number of rows and using basic schema inference to determine * data types. * * @param client The DocumentDB connection to use for the scan operation. * @param table The DocumentDB TableName for which to produce an Apache Arrow Schema. * @param numObjToSample The number of records to scan as part of producing the Schema. * @return An Apache Arrow Schema representing the schema of the HBase table. * @note The resulting schema is a union of the schema of every row that is scanned. Presently the code does not * attempt to resolve conflicts if unique field has different types across documents. It is recommend that you * use AWS Glue to define a schema for tables which may have such conflicts. In the future we may enhance this method * to use a reasonable default (like String) and coerce heterogeneous fields to avoid query failure but forcing * explicit handling by defining Schema in AWS Glue is likely a better approach. */ public static Schema inferSchema(MongoClient client, TableName table, int numObjToSample) { MongoDatabase db = client.getDatabase(table.getSchemaName()); int docCount = 0; int fieldCount = 0; try (MongoCursor<Document> docs = db.getCollection(table.getTableName()).find().batchSize(numObjToSample) .maxScan(numObjToSample).limit(numObjToSample).iterator()) { if (!docs.hasNext()) { return SchemaBuilder.newBuilder().build(); } SchemaBuilder schemaBuilder = SchemaBuilder.newBuilder(); while (docs.hasNext()) { docCount++; Document doc = docs.next(); for (String key : doc.keySet()) { fieldCount++; Field newField = getArrowField(key, doc.get(key)); Types.MinorType newType = Types.getMinorTypeForArrowType(newField.getType()); Field curField = schemaBuilder.getField(key); Types.MinorType curType = (curField != null) ? 
Types.getMinorTypeForArrowType(curField.getType()) : null; if (curField == null) { schemaBuilder.addField(newField); } else if (newType != curType) { //TODO: currently we resolve fields with mixed types by defaulting to VARCHAR. This is _not_ ideal logger.warn("inferSchema: Encountered a mixed-type field[{}] {} vs {}, defaulting to String.", key, curType, newType); schemaBuilder.addStringField(key); } else if (curType == Types.MinorType.LIST) { schemaBuilder.addField(mergeListField(key, curField, newField)); } else if (curType == Types.MinorType.STRUCT) { schemaBuilder.addField(mergeStructField(key, curField, newField)); } } } Schema schema = schemaBuilder.build(); if (schema.getFields().isEmpty()) { throw new RuntimeException("No columns found after scanning " + fieldCount + " values across " + docCount + " documents. Please ensure the collection is not empty and contains at least 1 supported column type."); } return schema; } finally { logger.info("inferSchema: Evaluated {} field values across {} documents.", fieldCount, docCount); } }
Example 17
Source File: ElasticsearchSchemaUtils.java From aws-athena-query-federation with Apache License 2.0 | 4 votes |
/** * Convert the data type from Elasticsearch to Arrow and injects it in a FieldType. * @param mapping is the map containing the Elasticsearch datatype. * @return a new FieldType corresponding to the Elasticsearch type. */ private static FieldType toFieldType(Map<String, Object> mapping) { logger.debug("toFieldType - enter: " + mapping); String elasticType = (String) mapping.get("type"); Types.MinorType minorType; Map<String, String> metadata = new HashMap<>(); switch (elasticType) { case "text": case "keyword": case "binary": minorType = Types.MinorType.VARCHAR; break; case "long": minorType = Types.MinorType.BIGINT; break; case "integer": minorType = Types.MinorType.INT; break; case "short": minorType = Types.MinorType.SMALLINT; break; case "byte": minorType = Types.MinorType.TINYINT; break; case "double": minorType = Types.MinorType.FLOAT8; break; case "scaled_float": // Store the scaling factor in the field's metadata map. minorType = Types.MinorType.BIGINT; metadata.put("scaling_factor", mapping.get("scaling_factor").toString()); break; case "float": case "half_float": minorType = Types.MinorType.FLOAT4; break; case "date": case "date_nanos": minorType = Types.MinorType.DATEMILLI; break; case "boolean": minorType = Types.MinorType.BIT; break; default: minorType = Types.MinorType.NULL; break; } logger.debug("Arrow Type: {}, metadata: {}", minorType.toString(), metadata); return new FieldType(true, minorType.getType(), null, metadata); }
Example 18
Source File: DocumentGenerator.java From aws-athena-query-federation with Apache License 2.0 | 4 votes |
/** * This should be replaced with something that actually reads useful data. */ public static Document makeRandomRow(List<Field> fields, int seed) { Document result = new Document(); for (Field next : fields) { boolean negative = seed % 2 == 1; Types.MinorType minorType = Types.getMinorTypeForArrowType(next.getType()); switch (minorType) { case INT: int iVal = seed * (negative ? -1 : 1); result.put(next.getName(), iVal); break; case TINYINT: case SMALLINT: int stVal = (seed % 4) * (negative ? -1 : 1); result.put(next.getName(), stVal); break; case UINT1: case UINT2: case UINT4: case UINT8: int uiVal = seed % 4; result.put(next.getName(), uiVal); break; case FLOAT4: float fVal = seed * 1.1f * (negative ? -1 : 1); result.put(next.getName(), fVal); break; case FLOAT8: case DECIMAL: double d8Val = seed * 1.1D * (negative ? -1 : 1); result.put(next.getName(), d8Val); break; case BIT: boolean bVal = seed % 2 == 0; result.put(next.getName(), bVal); break; case BIGINT: long lVal = seed * 1L * (negative ? -1 : 1); result.put(next.getName(), lVal); break; case VARCHAR: String vVal = "VarChar" + seed; result.put(next.getName(), vVal); break; case VARBINARY: byte[] binaryVal = ("VarChar" + seed).getBytes(); result.put(next.getName(), binaryVal); break; case STRUCT: result.put(next.getName(), makeRandomRow(next.getChildren(), seed)); break; case LIST: //TODO: pretty dirty way of generating lists should refactor this to support better generation Types.MinorType listType = Types.getMinorTypeForArrowType(next.getChildren().get(0).getType()); switch (listType) { case VARCHAR: List<String> listVarChar = new ArrayList<>(); listVarChar.add("VarChar" + seed); listVarChar.add("VarChar" + seed + 1); result.put(next.getName(), listVarChar); break; case INT: List<Integer> listIVal = new ArrayList<>(); listIVal.add(seed * (negative ? -1 : 1)); listIVal.add(seed * (negative ? 
-1 : 1) + 1); result.put(next.getName(), listIVal); break; default: throw new RuntimeException(minorType + " is not supported in list"); } break; default: throw new RuntimeException(minorType + " is not supported"); } } return result; }
Example 19
Source File: GeneratedRowWriter.java From aws-athena-query-federation with Apache License 2.0 | 4 votes |
/**
 * Creates the FieldWriter for a vector. A FieldWriterFactory registered for the field's name takes
 * precedence; otherwise a writer is chosen from the field's Arrow minor type and the extractor
 * registered for the field's name.
 *
 * @param vector The vector the writer will populate.
 * @return The FieldWriter for the vector.
 * @throws IllegalStateException if no factory and no extractor is registered for the field.
 * @throws RuntimeException if the field's type has no built-in writer.
 */
private FieldWriter makeFieldWriter(FieldVector vector) {
    Field field = vector.getField();
    String fieldName = field.getName();
    Types.MinorType fieldType = Types.getMinorTypeForArrowType(field.getType());

    Extractor extractor = extractors.get(fieldName);
    ConstraintProjector constraint = constraints.get(fieldName);

    // A custom factory wins over the built-in writers and may be used without an extractor.
    FieldWriterFactory customFactory = fieldWriterFactories.get(fieldName);
    if (customFactory != null) {
        return customFactory.create(vector, extractor, constraint);
    }

    if (extractor == null) {
        throw new IllegalStateException("Missing extractor for field[" + fieldName + "]");
    }

    switch (fieldType) {
        case TINYINT:
            return new TinyIntFieldWriter((TinyIntExtractor) extractor, (TinyIntVector) vector, constraint);
        case SMALLINT:
            return new SmallIntFieldWriter((SmallIntExtractor) extractor, (SmallIntVector) vector, constraint);
        case INT:
            return new IntFieldWriter((IntExtractor) extractor, (IntVector) vector, constraint);
        case BIGINT:
            return new BigIntFieldWriter((BigIntExtractor) extractor, (BigIntVector) vector, constraint);
        case FLOAT4:
            return new Float4FieldWriter((Float4Extractor) extractor, (Float4Vector) vector, constraint);
        case FLOAT8:
            return new Float8FieldWriter((Float8Extractor) extractor, (Float8Vector) vector, constraint);
        case DECIMAL:
            return new DecimalFieldWriter((DecimalExtractor) extractor, (DecimalVector) vector, constraint);
        case BIT:
            return new BitFieldWriter((BitExtractor) extractor, (BitVector) vector, constraint);
        case DATEDAY:
            return new DateDayFieldWriter((DateDayExtractor) extractor, (DateDayVector) vector, constraint);
        case DATEMILLI:
            return new DateMilliFieldWriter((DateMilliExtractor) extractor, (DateMilliVector) vector, constraint);
        case VARCHAR:
            return new VarCharFieldWriter((VarCharExtractor) extractor, (VarCharVector) vector, constraint);
        case VARBINARY:
            return new VarBinaryFieldWriter((VarBinaryExtractor) extractor, (VarBinaryVector) vector, constraint);
        default:
            throw new RuntimeException(fieldType + " is not supported");
    }
}
Example 20
Source File: Block.java From aws-athena-query-federation with Apache License 2.0 | 4 votes |
/** * Provides some basic equality checking for a Block ignoring ordering. This method has some draw backs in that is * isn't a deep equality and will not work for some large complex blocks. At present this method is useful for testing * purposes but may be refactored in a future release. */ public boolean equalsAsSet(Object o) { if (this == o) { return true; } if (o == null || getClass() != o.getClass()) { return false; } Block that = (Block) o; if (this.schema.getFields().size() != that.schema.getFields().size()) { return false; } if (this.vectorSchema.getRowCount() != that.vectorSchema.getRowCount()) { return false; } try { for (Field next : this.schema.getFields()) { FieldReader thisReader = vectorSchema.getVector(next.getName()).getReader(); FieldReader thatReader = that.vectorSchema.getVector(next.getName()).getReader(); for (int i = 0; i < this.vectorSchema.getRowCount(); i++) { thisReader.setPosition(i); Types.MinorType type = thisReader.getMinorType(); Object val = thisReader.readObject(); boolean matched = false; for (int j = 0; j < that.vectorSchema.getRowCount(); j++) { thatReader.setPosition(j); if (ArrowTypeComparator.compare(thatReader, val, thatReader.readObject()) == 0) { matched = true; } } if (!matched) { return false; } } } } catch (RuntimeException ex) { //There are many differences which can cause an exception, easier to handle them this way return false; } return true; }