org.apache.iceberg.PartitionField Java Examples

Source File: Projections.java From iceberg with Apache License 2.0

6 votes

/**
 * Builds the inclusive projection of a bound predicate onto the partition spec.
 *
 * <p>Every partition field derived from the predicate's source column contributes one
 * projected predicate; the non-null projections are combined with AND, starting from
 * {@code alwaysTrue()}.
 *
 * @param pred the bound predicate to project
 * @return {@code alwaysTrue()} when no partition field uses the predicate's column, otherwise
 *         the conjunction of all available projections
 */
@Override
@SuppressWarnings("unchecked")
public <T> Expression predicate(BoundPredicate<T> pred) {
  Collection<PartitionField> partitionFields = spec().getFieldsBySourceId(pred.ref().fieldId());
  if (partitionFields == null) {
    // no partition field is derived from the predicate's column
    return Expressions.alwaysTrue();
  }

  // Several partition fields may share one source column, e.g. bucket(7, d) and bucket(5, d).
  // For d = 2019-01-01 the projections are b1 = 5 and b2 = 0; a row with b1 != 5 or b2 != 0
  // cannot hold '2019-01-01', so all projections must hold at once.
  //
  // The same applies to day(ts) and hour(ts): ts = 2019-01-01T01:00:00 yields day=2019-01-01
  // and hour=2019-01-01-01, and AND-ing keeps the more restrictive hour projection in effect.
  Expression conjunction = Expressions.alwaysTrue();
  for (PartitionField partitionField : partitionFields) {
    Transform<T, ?> transform = (Transform<T, ?>) partitionField.transform();
    UnboundPredicate<?> projected = transform.project(partitionField.name(), pred);
    if (projected == null) {
      // this transform cannot project the predicate; it adds no constraint
      continue;
    }
    conjunction = Expressions.and(conjunction, projected);
  }

  return conjunction;
}

Source File: HiveTypeConverter.java From metacat with Apache License 2.0

6 votes

/**
 * Converts an iceberg schema into a list of field DTOs, one per schema column.
 *
 * <p>A column is flagged as a partition key when its name equals the name of the source
 * column of any of the given partition fields.
 *
 * @param schema          iceberg schema to convert
 * @param partitionFields partition fields of the table
 * @return list of field info, in schema column order
 */
public List<FieldInfo> icebergeSchemaTofieldDtos(final Schema schema,
                                                 final List<PartitionField> partitionFields) {
    // Resolve each partition field to the name of the schema column it is derived from.
    final List<String> partitionColumnNames = Lists.newArrayList();
    for (PartitionField partitionField : partitionFields) {
        partitionColumnNames.add(schema.findField(partitionField.sourceId()).name());
    }

    final List<FieldInfo> result = Lists.newArrayList();
    for (Types.NestedField column : schema.columns()) {
        final String columnName = column.name();
        final org.apache.iceberg.types.Type icebergType = column.type();

        final FieldInfo info = new FieldInfo();
        info.setName(columnName);
        info.setSourceType(icebergType.toString());
        info.setType(toMetacatType(fromIcebergToHiveType(icebergType)));
        info.setIsNullable(column.isOptional());
        info.setComment(column.doc());
        info.setPartitionKey(partitionColumnNames.contains(columnName));
        result.add(info);
    }

    return result;
}

Source File: PartitionKey.java From iceberg with Apache License 2.0

6 votes

/**
 * Creates a partition key for the given spec, resolving one accessor and one transform per
 * partition field.
 *
 * @param spec the partition spec whose fields define the key layout
 * @param inputSchema the schema passed to {@code buildAccessors} to create row accessors
 * @throws RuntimeException if no accessor exists for a partition field's source column
 */
@SuppressWarnings("unchecked")
PartitionKey(PartitionSpec spec, Schema inputSchema) {
  this.spec = spec;

  List<PartitionField> partitionFields = spec.fields();
  this.size = partitionFields.size();
  this.partitionTuple = new Object[size];
  this.transforms = new Transform[size];
  this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size);

  Schema specSchema = spec.schema();
  Map<Integer, Accessor<InternalRow>> accessorsBySourceId = buildAccessors(inputSchema);

  int position = 0;
  for (PartitionField partitionField : partitionFields) {
    int sourceId = partitionField.sourceId();
    Accessor<InternalRow> accessor = accessorsBySourceId.get(sourceId);
    if (accessor == null) {
      throw new RuntimeException(
          "Cannot build accessor for field: " + specSchema.findField(sourceId));
    }

    // accessors and transforms are stored at the same index as the partition field
    this.accessors[position] = accessor;
    this.transforms[position] = partitionField.transform();
    position += 1;
  }
}

Source File: Projections.java From iceberg with Apache License 2.0

6 votes

/**
 * Builds the strict projection of a bound predicate onto the partition spec.
 *
 * <p>Every partition field derived from the predicate's source column contributes one strict
 * projection; the non-null projections are combined with OR, starting from
 * {@code alwaysFalse()}.
 *
 * @param pred the bound predicate to project
 * @return {@code alwaysFalse()} when no partition field uses the predicate's column, otherwise
 *         the disjunction of all available strict projections
 */
@Override
@SuppressWarnings("unchecked")
public <T> Expression predicate(BoundPredicate<T> pred) {
  Collection<PartitionField> partitionFields = spec().getFieldsBySourceId(pred.ref().fieldId());
  if (partitionFields == null) {
    // no partition field is derived from the predicate's column
    return Expressions.alwaysFalse();
  }

  // Example: (ts > 2019-01-01T01:00:00) with day(ts) and hour(ts) projects to
  // d >= 2019-01-02 and h >= 2019-01-01-02 (note the inclusive bounds). A timestamp for which
  // either projected predicate is true must match the original predicate, so the projections
  // are OR-ed: ts = 2019-01-01T03:00:00 satisfies the hour projection but not the day
  // projection, and still matches the original predicate.
  Expression disjunction = Expressions.alwaysFalse();
  for (PartitionField partitionField : partitionFields) {
    Transform<T, ?> transform = (Transform<T, ?>) partitionField.transform();
    UnboundPredicate<?> projected = transform.projectStrict(partitionField.name(), pred);
    if (projected == null) {
      // this transform has no strict projection for the predicate
      continue;
    }
    disjunction = Expressions.or(disjunction, projected);
  }

  return disjunction;
}

Source File: ManifestsTable.java From presto with Apache License 2.0

6 votes

// Writes one entry per partition field summary: the contains-null flag followed by the
// human-readable lower and upper bounds. Summaries are matched to partition fields by position.
private static void writePartitionSummaries(BlockBuilder blockBuilder, List<PartitionFieldSummary> summaries, PartitionSpec partitionSpec)
{
    int position = 0;
    for (PartitionFieldSummary summary : summaries) {
        PartitionField partitionField = partitionSpec.fields().get(position);
        Type boundType = partitionSpec.partitionType().fields().get(position).type();

        BlockBuilder entryBuilder = blockBuilder.beginBlockEntry();
        BOOLEAN.writeBoolean(entryBuilder, summary.containsNull());

        // bounds are stored as byte buffers; decode them with the partition field's type first
        String lowerText = partitionField.transform().toHumanString(
                Conversions.fromByteBuffer(boundType, summary.lowerBound()));
        VARCHAR.writeString(entryBuilder, lowerText);

        String upperText = partitionField.transform().toHumanString(
                Conversions.fromByteBuffer(boundType, summary.upperBound()));
        VARCHAR.writeString(entryBuilder, upperText);

        blockBuilder.closeEntry();
        position++;
    }
}

Source File: IcebergUtil.java From presto with Apache License 2.0

5 votes

/**
 * Returns the partition fields whose transform prints as "identity", each mapped to its
 * position within the partition spec.
 */
public static Map<PartitionField, Integer> getIdentityPartitions(PartitionSpec partitionSpec)
{
    // TODO: expose transform information in Iceberg library
    ImmutableMap.Builder<PartitionField, Integer> identityColumns = ImmutableMap.builder();
    int position = 0;
    for (PartitionField partitionField : partitionSpec.fields()) {
        String transformName = partitionField.transform().toString();
        if (transformName.equals("identity")) {
            identityColumns.put(partitionField, position);
        }
        position++;
    }
    return identityColumns.build();
}

Source File: PartitionFields.java From presto with Apache License 2.0

5 votes

// Renders a partition field as text: the plain column name for identity, "<transform>(<column>)"
// for the time-based transforms, and "bucket(<column>, <n>)" / "truncate(<column>, <n>)" for
// transforms matched by the corresponding patterns. Anything else is rejected.
private static String toPartitionField(PartitionSpec spec, PartitionField field)
{
    String columnName = spec.schema().findColumnName(field.sourceId());
    String transformText = field.transform().toString();

    if (transformText.equals("identity")) {
        return columnName;
    }

    boolean timeBased = transformText.equals("year")
            || transformText.equals("month")
            || transformText.equals("day")
            || transformText.equals("hour");
    if (timeBased) {
        return format("%s(%s)", transformText, columnName);
    }

    Matcher bucketMatcher = ICEBERG_BUCKET_PATTERN.matcher(transformText);
    if (bucketMatcher.matches()) {
        return format("bucket(%s, %s)", columnName, bucketMatcher.group(1));
    }

    Matcher truncateMatcher = ICEBERG_TRUNCATE_PATTERN.matcher(transformText);
    if (truncateMatcher.matches()) {
        return format("truncate(%s, %s)", columnName, truncateMatcher.group(1));
    }

    throw new UnsupportedOperationException("Unsupported partition transform: " + field);
}

Source File: PartitionSpecVisitor.java From iceberg with Apache License 2.0

5 votes

/**
 * Visits every field of a partition spec and collects the visitor's results in field order.
 *
 * <p>The visitor callback is chosen by the field's transform. A field whose transform matches
 * none of the handled cases produces no result, so the returned list may be shorter than the
 * spec's field list.
 *
 * @param schema the schema used to look up source column names
 * @param spec the partition spec to traverse
 * @param visitor the visitor invoked once per recognized field
 * @return the visitor results, in partition field order
 */
static <R> List<R> visit(Schema schema, PartitionSpec spec, PartitionSpecVisitor<R> visitor) {
  List<PartitionField> partitionFields = spec.fields();
  List<R> collected = Lists.newArrayListWithExpectedSize(partitionFields.size());

  for (PartitionField partitionField : spec.fields()) {
    int sourceId = partitionField.sourceId();
    String columnName = schema.findColumnName(sourceId);
    Transform<?, ?> transform = partitionField.transform();

    if (transform instanceof Identity) {
      collected.add(visitor.identity(columnName, sourceId));

    } else if (transform instanceof Bucket) {
      int numBuckets = ((Bucket<?>) transform).numBuckets();
      collected.add(visitor.bucket(columnName, sourceId, numBuckets));

    } else if (transform instanceof Truncate) {
      int width = ((Truncate<?>) transform).width();
      collected.add(visitor.truncate(columnName, sourceId, width));

    } else if (transform == Dates.YEAR || transform == Timestamps.YEAR) {
      collected.add(visitor.year(columnName, sourceId));

    } else if (transform == Dates.MONTH || transform == Timestamps.MONTH) {
      collected.add(visitor.month(columnName, sourceId));

    } else if (transform == Dates.DAY || transform == Timestamps.DAY) {
      collected.add(visitor.day(columnName, sourceId));

    } else if (transform == Timestamps.HOUR) {
      collected.add(visitor.hour(columnName, sourceId));
    }
    // any other transform is skipped without adding a result
  }

  return collected;
}

Source File: PartitionTable.java From presto with Apache License 2.0

5 votes

// Builds one column per partition field: the column is named after the partition field and
// typed with the transform's result type for the source column, converted via toPrestoType.
private List<ColumnMetadata> getPartitionColumnsMetadata(List<PartitionField> fields, Schema schema)
{
    return fields.stream()
            .map(partitionField -> {
                String columnName = partitionField.name();
                return new ColumnMetadata(
                        columnName,
                        toPrestoType(
                                partitionField.transform().getResultType(schema.findType(partitionField.sourceId())),
                                typeManager));
            })
            .collect(toImmutableList());
}

Source File: IcebergPageSink.java From presto with Apache License 2.0

5 votes

/**
 * Creates a partition column description.
 *
 * <p>All reference arguments must be non-null; the checks run in declaration order, so the
 * first null argument determines the exception message.
 */
public PartitionColumn(PartitionField field, int sourceChannel, Type sourceType, Type resultType, Function<Block, Block> blockTransform)
{
    // validate everything up front, then assign
    requireNonNull(field, "field is null");
    requireNonNull(sourceType, "sourceType is null");
    requireNonNull(resultType, "resultType is null");
    requireNonNull(blockTransform, "blockTransform is null");

    this.field = field;
    this.sourceChannel = sourceChannel;
    this.sourceType = sourceType;
    this.resultType = resultType;
    this.blockTransform = blockTransform;
}

Source File: SparkTableUtil.java From iceberg with Apache License 2.0

5 votes

/**
 * Lists the files directly under a partition location and describes each as an Avro data file.
 *
 * <p>Entries rejected by {@code HIDDEN_PATH_FILTER} and entries that are not files are ignored.
 * Each data file gets a metrics object built from {@code -1L} and three nulls, and a partition
 * path of the form {@code name=value/...} built from the spec's field names and the matching
 * entries in {@code partitionPath}.
 *
 * @param partitionPath partition values keyed by partition field name
 * @param partitionUri location of the partition to list
 * @param spec partition spec of the table
 * @param conf Hadoop configuration used to resolve the file system
 * @return one data file per listed file
 */
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partitionDir = new Path(partitionUri);
    FileSystem fileSystem = partitionDir.getFileSystem(conf);
    FileStatus[] listing = fileSystem.listStatus(partitionDir, HIDDEN_PATH_FILTER);

    return Arrays.stream(listing)
        .filter(status -> status.isFile())
        .map(status -> {
          Metrics placeholderMetrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(field -> field.name())
              .map(fieldName -> String.format("%s=%s", fieldName, partitionPath.get(fieldName)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(status.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(status.getLen())
              .withMetrics(placeholderMetrics)
              .withPartitionPath(partitionKey)
              .build();
        })
        .collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}

Source File: SparkTableUtil.java From iceberg with Apache License 2.0

5 votes

/**
 * Lists the files directly under a partition location and describes each as an ORC data file.
 *
 * <p>Entries rejected by {@code HIDDEN_PATH_FILTER} and entries that are not files are ignored.
 * Metrics for each file come from {@code OrcMetrics.fromInputFile}; the partition path has the
 * form {@code name=value/...}, built from the spec's field names and the matching entries in
 * {@code partitionPath}.
 *
 * @param partitionPath partition values keyed by partition field name
 * @param partitionUri location of the partition to list
 * @param spec partition spec of the table
 * @param conf Hadoop configuration used to resolve the file system and open the files
 * @return one data file per listed file
 */
private static List<DataFile> listOrcPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partitionDir = new Path(partitionUri);
    FileSystem fileSystem = partitionDir.getFileSystem(conf);
    FileStatus[] listing = fileSystem.listStatus(partitionDir, HIDDEN_PATH_FILTER);

    return Arrays.stream(listing)
        .filter(status -> status.isFile())
        .map(status -> {
          // metrics are computed first, as in the listing order of the builder inputs
          Metrics fileMetrics = OrcMetrics.fromInputFile(HadoopInputFile.fromPath(status.getPath(), conf));
          String partitionKey = spec.fields().stream()
              .map(field -> field.name())
              .map(fieldName -> String.format("%s=%s", fieldName, partitionPath.get(fieldName)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(status.getPath().toString())
              .withFormat("orc")
              .withFileSizeInBytes(status.getLen())
              .withMetrics(fileMetrics)
              .withPartitionPath(partitionKey)
              .build();
        })
        .collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}

Source File: IcebergSplitSource.java From presto with Apache License 2.0

5 votes

// Extracts the identity-partition values of a scan task's file as strings, keyed by the source
// column id. A null partition value is kept as a null map value. The result is unmodifiable.
private static Map<Integer, String> getPartitionKeys(FileScanTask scanTask)
{
    StructLike partitionData = scanTask.file().partition();
    PartitionSpec spec = scanTask.spec();
    Map<PartitionField, Integer> identityPartitions = getIdentityPartitions(spec);
    Map<Integer, String> keysBySourceId = new HashMap<>();

    for (Map.Entry<PartitionField, Integer> entry : identityPartitions.entrySet()) {
        int sourceId = entry.getKey().sourceId();
        Type type = spec.schema().findType(sourceId);
        Class<?> javaClass = type.typeId().javaClass();
        Object value = partitionData.get(entry.getValue(), javaClass);

        String partitionValue;
        if (value == null) {
            partitionValue = null;
        }
        else if (type.typeId() == FIXED || type.typeId() == BINARY) {
            // this is safe because Iceberg PartitionData directly wraps the byte array
            partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
        }
        else {
            partitionValue = value.toString();
        }
        keysBySourceId.put(sourceId, partitionValue);
    }

    return Collections.unmodifiableMap(keysBySourceId);
}

Source File: SparkUtil.java From iceberg with Apache License 2.0

5 votes

/**
 * Verifies that a partition spec contains no unknown transforms, so it can be used for writes.
 *
 * @param spec a PartitionSpec
 * @throws UnsupportedOperationException if any field of the spec uses an
 *         {@code UnknownTransform}; the message lists the offending transforms
 */
public static void validatePartitionTransforms(PartitionSpec spec) {
  boolean allKnown = spec.fields().stream()
      .noneMatch(field -> field.transform() instanceof UnknownTransform);
  if (allKnown) {
    return;
  }

  // collect the unknown transforms for the error message
  String unsupported = spec.fields().stream()
      .map(field -> field.transform())
      .filter(transform -> transform instanceof UnknownTransform)
      .map(transform -> transform.toString())
      .collect(Collectors.joining(", "));

  throw new UnsupportedOperationException(
      String.format("Cannot write using unsupported transforms: %s", unsupported));
}

Source File: DataAdditionCmdHandler.java From dremio-oss with Apache License 2.0

5 votes

/**
 * Validates that an INSERT can proceed against the iceberg table.
 *
 * <p>Three checks run in order, each raising a validation error on mismatch: the current iceberg
 * schema against the schema from the KV store, the iceberg partition field names against the
 * table partition columns, and the query schema against the KV store schema restricted to
 * {@code fieldNames} (or the full KV store schema when no subset is available).
 *
 * @param fieldNames the field names selected for the insert
 */
public void validateIcebergSchemaForInsertCommand(List<String> fieldNames) {
  IcebergTableProps tableProps = icebergCreateTableEntry.getIcebergTableProps();
  Preconditions.checkState(tableProps.getIcebergOpType() == IcebergOperation.Type.INSERT,
    "unexpected state found");

  BatchSchema querySchema = tableProps.getFullSchema();
  org.apache.hadoop.fs.Path tableLocation = new org.apache.hadoop.fs.Path(tableProps.getTableLocation());
  IcebergTableOperations tableOps = new IcebergTableOperations(
    tableLocation, icebergCreateTableEntry.getPlugin().getFsConfCopy());

  BatchSchema icebergSchema = new SchemaConverter().fromIceberg(
    tableOps.current().schema());

  // this check can be removed once we support schema evolution in dremio.
  boolean schemaMatches = icebergSchema.equalsIgnoreCase(tableSchemaFromKVStore);
  if (!schemaMatches) {
    throw UserException.validationError().message("The schema for table %s does not match with the iceberg %s.",
      tableSchemaFromKVStore, icebergSchema).buildSilently();
  }

  List<String> icebergPartitionColumns = tableOps.current().spec().fields().stream()
    .map(partitionField -> partitionField.name())
    .collect(Collectors.toList());

  // this check can be removed once we support partition spec evolution in dremio.
  boolean partitionsMatch = comparePartitionColumnLists(icebergPartitionColumns);
  if (!partitionsMatch) {
    throw UserException.validationError().message("The table partition columns %s do not match with the iceberg partition columns %s.",
      partitionColumns.toString(), icebergPartitionColumns.toString()).buildSilently();
  }

  // fall back to the full KV store schema when the subset is not available
  BatchSchema expectedQuerySchema = tableSchemaFromKVStore.subset(fieldNames).orElse(tableSchemaFromKVStore);
  boolean queryMatches = querySchema.equalsIgnoreCase(expectedQuerySchema);
  if (!queryMatches) {
    throw UserException.validationError().message("Table %s doesn't match with query %s.",
        expectedQuerySchema, querySchema).buildSilently();
  }
}

Source File: IcebergTableWrapper.java From dremio-oss with Apache License 2.0

5 votes

// Populates partitionColumns with the schema column name of each partition field's source
// column, in partition spec order.
private void buildPartitionColumns() {
  partitionColumns = table.spec().fields().stream()
    .map(partitionField -> partitionField.sourceId()) // source column id of the partition field
    .map(schema::findColumnName) // column name from schema
    .collect(Collectors.toList());
}

Source File: PartitionTable.java From presto with Apache License 2.0

5 votes

// Computes, for each partition field in order, the result type of its transform applied to the
// source column type looked up in idToTypeMapping.
private List<Type> partitionTypes(List<PartitionField> partitionFields)
{
    ImmutableList.Builder<Type> resultTypes = ImmutableList.builder();
    for (PartitionField field : partitionFields) {
        Type.PrimitiveType sourceType = idToTypeMapping.get(field.sourceId());
        resultTypes.add(field.transform().getResultType(sourceType));
    }
    return resultTypes.build();
}

Source File: IcebergPageSink.java From presto with Apache License 2.0

4 votes

/**
 * Returns the partition field held by this object.
 */
public PartitionField getField()
{
    return this.field;
}

Source File: ResidualEvaluator.java From iceberg with Apache License 2.0

4 votes

/**
 * Decides whether a bound predicate can be resolved from partition data alone.
 *
 * <p>For each partition field derived from the predicate's source column, the strict and the
 * inclusive projections are evaluated against the partition data. The strict projection returns
 * true iff the original predicate would have returned true, so a true strict result eliminates
 * the predicate as {@code alwaysTrue()}. The inclusive projection returns false iff the original
 * predicate would have returned false, so a false inclusive result eliminates it as
 * {@code alwaysFalse()}.
 *
 * @param pred the bound predicate to evaluate
 * @return {@code alwaysTrue()}, {@code alwaysFalse()}, or the original predicate when neither
 *         projection is conclusive or the column has no partition field
 */
@Override
@SuppressWarnings("unchecked")
public <T> Expression predicate(BoundPredicate<T> pred) {
  List<PartitionField> partitionFields = spec.getFieldsBySourceId(pred.ref().fieldId());
  if (partitionFields == null) {
    // not associated with a partition field, can't be evaluated
    return pred;
  }

  for (PartitionField partitionField : partitionFields) {
    Transform<T, ?> transform = (Transform<T, ?>) partitionField.transform();

    // strict projection first: a true result means the predicate always holds
    Expression strictOutcome = evaluateProjection(transform.projectStrict(partitionField.name(), pred));
    if (strictOutcome != null && strictOutcome.op() == Expression.Operation.TRUE) {
      return Expressions.alwaysTrue();
    }

    // inclusive projection second: a false result means the predicate never holds
    Expression inclusiveOutcome = evaluateProjection(transform.project(partitionField.name(), pred));
    if (inclusiveOutcome != null && inclusiveOutcome.op() == Expression.Operation.FALSE) {
      return Expressions.alwaysFalse();
    }
  }

  // neither the strict nor the inclusive projection was conclusive; keep the original predicate
  return pred;
}

// Binds a projected predicate to the partition type and evaluates it. Returns null when there
// is no projection to evaluate.
private Expression evaluateProjection(UnboundPredicate<?> projection) {
  if (projection == null) {
    return null;
  }

  Expression bound = projection.bind(spec.partitionType(), caseSensitive);
  if (bound instanceof BoundPredicate) {
    return super.predicate((BoundPredicate<?>) bound);
  }

  // if the result is not a predicate, then it must be a constant like alwaysTrue or alwaysFalse
  return bound;
}

org.apache.iceberg.PartitionField Java Examples