Java Code Examples for org.apache.parquet.schema.Type#getRepetition()
The following examples show how to use
org.apache.parquet.schema.Type#getRepetition().
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example 1
Source File: ParquetRecordWriter.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * Changes the list inner '$data$' vector name to 'element' in the schema.
 *
 * <p>A new type named {@code "element"} is built from the given child type. The
 * repetition, original type and id are carried over; a primitive child also keeps
 * its primitive type name, type length and decimal metadata, and a group child
 * keeps its fields.
 *
 * @param childType the child type of the list to rename
 * @return a copy of {@code childType} whose name is {@code "element"}
 */
private Type renameChildTypeToElement(Type childType) {
  if (childType.isPrimitive()) {
    PrimitiveType childPrimitiveType = childType.asPrimitiveType();
    return new PrimitiveType(
        childType.getRepetition(),
        childPrimitiveType.getPrimitiveTypeName(),
        childPrimitiveType.getTypeLength(),
        "element",
        childPrimitiveType.getOriginalType(),
        childPrimitiveType.getDecimalMetadata(),
        childPrimitiveType.getId());
  }

  GroupType childGroupType = childType.asGroupType();
  GroupType groupType = new GroupType(
      childType.getRepetition(),
      "element",
      childType.getOriginalType(),
      childGroupType.getFields());

  Type.ID id = childGroupType.getId();
  if (id == null) {
    return groupType;
  }
  // Read the numeric id through its accessor; hashCode() is not an id accessor
  // and must not be relied on to return the field id.
  return groupType.withId(id.intValue());
}
Example 2
Source File: ValidatingRecordConsumer.java From parquet-mr with Apache License 2.0 | 6 votes |
/**
 * Verifies that a value of primitive type {@code p} is acceptable for the field
 * currently being written.
 *
 * <p>The value counter on top of {@code fieldValueCount} is incremented first, so
 * the repetition check sees the count including this value.
 *
 * @param p the primitive type of the value being added
 * @throws InvalidRecordException if a non-repeated field receives more than one
 *     value, the repetition is unknown, or the field is not a primitive of type
 *     {@code p}
 */
private void validate(PrimitiveTypeName p) {
  final Type fieldType = types.peek().asGroupType().getType(fields.peek());

  final int valueCount = fieldValueCount.pop() + 1;
  fieldValueCount.push(valueCount);

  LOG.debug("validate {} for {}", p, fieldType.getName());

  switch (fieldType.getRepetition()) {
    case OPTIONAL:
    case REQUIRED:
      // at most one value is allowed for a non-repeated field
      if (valueCount > 1) {
        throw new InvalidRecordException("repeated value when the type is not repeated in " + fieldType);
      }
      break;
    case REPEATED:
      // any number of values is fine
      break;
    default:
      throw new InvalidRecordException("unknown repetition " + fieldType.getRepetition() + " in " + fieldType);
  }

  final boolean typeMatches =
      fieldType.isPrimitive() && fieldType.asPrimitiveType().getPrimitiveTypeName() == p;
  if (!typeMatches) {
    throw new InvalidRecordException("expected type " + p + " but got " + fieldType);
  }
}
Example 3
Source File: ThriftRecordConverter.java From parquet-mr with Apache License 2.0 | 6 votes |
private boolean hasMissingRequiredFieldInGroupType(GroupType requested, GroupType fullSchema) { for (Type field : fullSchema.getFields()) { if (requested.containsField(field.getName())) { Type requestedType = requested.getType(field.getName()); // if a field is in requested schema and the type of it is a group type, then do recursive check if (!field.isPrimitive()) { if (hasMissingRequiredFieldInGroupType(requestedType.asGroupType(), field.asGroupType())) { return true; } else { continue;// check next field } } } else { if (field.getRepetition() == Type.Repetition.REQUIRED) { return true; // if a field is missing in requested schema and it's required } else { continue; // the missing field is not required, then continue checking next field } } } return false; }
Example 4
Source File: DrillParquetReader.java From Bats with Apache License 2.0 | 5 votes |
/**
 * Builds the projected type for a column path, starting at the given depth.
 *
 * <p>The type at {@code pathSegments[0..depth]} is looked up in the schema. When
 * that is the last segment it is returned as is; otherwise it must be a group, and
 * a new group is returned that contains only the child on the path. The new group
 * keeps the original group's repetition, name and original (logical) type
 * annotation, so the annotation is not lost in the projection.
 *
 * @param pathSegments the segments of the column path
 * @param depth index of the segment to resolve at this level
 * @param schema the schema in which the path is resolved
 * @return the type at the path, wrapped in single-child groups for each ancestor
 *     from {@code depth} downwards
 * @throws IllegalStateException if a non-terminal segment resolves to a primitive
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  int nextDepth = depth + 1;
  Type type = schema.getType(Arrays.copyOfRange(pathSegments, 0, nextDepth));
  if (nextDepth == pathSegments.length) {
    return type;
  }
  Preconditions.checkState(!type.isPrimitive());
  return new GroupType(
      type.getRepetition(),
      type.getName(),
      type.getOriginalType(),
      getType(pathSegments, nextDepth, schema));
}
Example 5
Source File: ParquetResolver.java From pxf with Apache License 2.0 | 5 votes |
private OneField resolvePrimitive(Group group, int columnIndex, Type type, int level) { OneField field = new OneField(); // get type converter based on the primitive type ParquetTypeConverter converter = ParquetTypeConverter.from(type.asPrimitiveType()); // determine how many values for the primitive are present in the column int repetitionCount = group.getFieldRepetitionCount(columnIndex); // at the top level (top field), non-repeated primitives will convert to typed OneField if (level == 0 && type.getRepetition() != REPEATED) { field.type = converter.getDataType(type).getOID(); field.val = repetitionCount == 0 ? null : converter.getValue(group, columnIndex, 0, type); } else if (type.getRepetition() == REPEATED) { // repeated primitive at any level will convert into JSON ArrayNode jsonArray = mapper.createArrayNode(); for (int repeatIndex = 0; repeatIndex < repetitionCount; repeatIndex++) { converter.addValueToJsonArray(group, columnIndex, repeatIndex, type, jsonArray); } // but will become a string only at top level if (level == 0) { field.type = DataType.TEXT.getOID(); try { field.val = mapper.writeValueAsString(jsonArray); } catch (Exception e) { throw new RuntimeException("Failed to serialize repeated parquet type " + type.asPrimitiveType().getName(), e); } } else { // just return the array node within OneField container field.val = jsonArray; } } else { // level > 0 and type != REPEATED -- primitive type as a member of complex group -- NOT YET SUPPORTED throw new UnsupportedOperationException("Parquet complex type support is not yet available."); } return field; }
Example 6
Source File: ParquetRowiseReader.java From dremio-oss with Apache License 2.0 | 5 votes |
/**
 * Resolves the projected type for a column path from the given depth onwards.
 *
 * <p>When the segment at {@code depth} is the last one, the schema's type for the
 * path is returned directly. Otherwise the type must be a group, and a new group
 * with the same repetition, name and original type is returned, containing only
 * the child that lies on the path.
 *
 * @param pathSegments the segments of the column path
 * @param depth index of the segment to resolve at this level
 * @param schema the schema in which the path is resolved
 * @return the type at the path, nested in single-child groups for its ancestors
 * @throws IllegalStateException if a non-terminal segment resolves to a primitive
 */
private static Type getType(String[] pathSegments, int depth, MessageType schema) {
  final int prefixLength = depth + 1;
  final String[] prefix = Arrays.copyOfRange(pathSegments, 0, prefixLength);
  final Type current = schema.getType(prefix);

  if (prefixLength != pathSegments.length) {
    // more segments follow, so this one has to be a group
    Preconditions.checkState(!current.isPrimitive());
    return new GroupType(
        current.getRepetition(),
        current.getName(),
        current.getOriginalType(),
        getType(pathSegments, prefixLength, schema));
  }
  return current;
}
Example 7
Source File: ValidatingRecordConsumer.java From parquet-mr with Apache License 2.0 | 5 votes |
private void validate(PrimitiveTypeName... ptypes) { Type currentType = types.peek().asGroupType().getType(fields.peek()); int c = fieldValueCount.pop() + 1; fieldValueCount.push(c); if (LOG.isDebugEnabled()) LOG.debug("validate " + Arrays.toString(ptypes) + " for " + currentType.getName()); switch (currentType.getRepetition()) { case OPTIONAL: case REQUIRED: if (c > 1) { throw new InvalidRecordException("repeated value when the type is not repeated in " + currentType); } break; case REPEATED: break; default: throw new InvalidRecordException("unknown repetition " + currentType.getRepetition() + " in " + currentType); } if (!currentType.isPrimitive()) { throw new InvalidRecordException( "expected type in " + Arrays.toString(ptypes) + " but got " + currentType); } for (PrimitiveTypeName p : ptypes) { if (currentType.asPrimitiveType().getPrimitiveTypeName() == p) { return; // type is valid } } throw new InvalidRecordException( "expected type in " + Arrays.toString(ptypes) + " but got " + currentType); }
Example 8
Source File: ParquetAsTextInputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 5 votes |
protected List<String> groupToStrings(SimpleGroup grp) { ArrayList<String> s = new ArrayList<>(); for (int n = 0; n < grp.getType().getFieldCount(); n ++) { Type field = grp.getType().getType(n); try { if (!field.isPrimitive()) s.addAll(groupToStrings((SimpleGroup) grp.getGroup(n, 0))); // array of groups not (yet) supported else if (field.getRepetition() == Type.Repetition.REPEATED) { boolean is_binary = field.asPrimitiveType().getPrimitiveTypeName() == PrimitiveType.PrimitiveTypeName.BINARY; StringBuilder sb = new StringBuilder("["); ArrayList<String> arr = new ArrayList<>(); for (int i = 0; i < grp.getFieldRepetitionCount(n); i ++) arr.add(is_binary ? "\"" + grp.getValueToString(n, i) + "\"" : grp.getValueToString(n, i)); sb.append(Joiner.on(", ").join(arr)); sb.append("]"); s.add(sb.toString()); } else s.add(grp.getValueToString(n, 0)); } catch (RuntimeException e) { if(e.getMessage().startsWith("not found") && field.getRepetition() == Type.Repetition.OPTIONAL) s.add(""); else throw e; } } return s; }
Example 9
Source File: ParquetAsJsonInputFormat.java From iow-hadoop-streaming with Apache License 2.0 | 4 votes |
private void groupToJson(JsonGenerator currentGenerator, SimpleGroup grp) throws IOException { GroupType gt = grp.getType(); currentGenerator.writeStartObject(); for(int i = 0; i < gt.getFieldCount(); i ++) { String field = gt.getFieldName(i); try { Type t = gt.getType(i); int repetition = 1; boolean repeated = false; if (t.getRepetition() == Type.Repetition.REPEATED) { repeated = true; repetition = grp.getFieldRepetitionCount(i); currentGenerator.writeArrayFieldStart(field); } else currentGenerator.writeFieldName(field); for(int j = 0; j < repetition; j ++) { if (t.isPrimitive()) { switch (t.asPrimitiveType().getPrimitiveTypeName()) { case BINARY: currentGenerator.writeString(grp.getString(i, j)); break; case INT32: currentGenerator.writeNumber(grp.getInteger(i, j)); break; case INT96: case INT64: // clumsy way - TODO - Subclass SimpleGroup or something like that currentGenerator.writeNumber(Long.parseLong(grp.getValueToString(i, j))); break; case DOUBLE: case FLOAT: currentGenerator.writeNumber(Double.parseDouble(grp.getValueToString(i, j))); break; case BOOLEAN: currentGenerator.writeBoolean(grp.getBoolean(i, j)); break; default: throw new RuntimeException("Can't handle type " + gt.getType(i)); } } else { groupToJson(currentGenerator, (SimpleGroup) grp.getGroup(i, j)); } } if (repeated) currentGenerator.writeEndArray(); } catch (Exception e) { if (e.getMessage().startsWith("not found") && gt.getType(i).getRepetition() == Type.Repetition.OPTIONAL) currentGenerator.writeNull(); else throw new RuntimeException(e); } } currentGenerator.writeEndObject(); }