Java Code Examples for org.apache.iceberg.Schema#findField()
The following examples show how to use org.apache.iceberg.Schema#findField(). Each comes from an open-source project; the source file and license are noted above the code.
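Before the project examples, here is a minimal standalone sketch of the findField overloads: lookup by full name (dotted paths reach nested fields), lookup by field ID, and the case-insensitive variant. The schema is a made-up example for illustration, not taken from any of the projects below; findField returns null when nothing matches.

import org.apache.iceberg.Schema;
import org.apache.iceberg.types.Types;

public class FindFieldSketch {
  public static void main(String[] args) {
    // illustrative schema: a top-level long and a nested struct
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "record", Types.StructType.of(
            Types.NestedField.optional(3, "data", Types.StringType.get()))));

    Types.NestedField id = schema.findField("id");             // lookup by name
    Types.NestedField data = schema.findField("record.data");  // dotted path into the struct
    Types.NestedField byId = schema.findField(3);              // lookup by field ID
    Types.NestedField ci = schema.caseInsensitiveFindField("ID");

    System.out.println(id.fieldId());                 // 1
    System.out.println(data.fieldId());               // 3
    System.out.println(byId.name());                  // data
    System.out.println(ci.fieldId());                 // 1
    System.out.println(schema.findField("missing"));  // null
  }
}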
Example 1
Source File: PartitionKey.java, from iceberg (Apache License 2.0)
@SuppressWarnings("unchecked") PartitionKey(PartitionSpec spec, Schema inputSchema) { this.spec = spec; List<PartitionField> fields = spec.fields(); this.size = fields.size(); this.partitionTuple = new Object[size]; this.transforms = new Transform[size]; this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size); Schema schema = spec.schema(); Map<Integer, Accessor<InternalRow>> newAccessors = buildAccessors(inputSchema); for (int i = 0; i < size; i += 1) { PartitionField field = fields.get(i); Accessor<InternalRow> accessor = newAccessors.get(field.sourceId()); if (accessor == null) { throw new RuntimeException( "Cannot build accessor for field: " + schema.findField(field.sourceId())); } this.accessors[i] = accessor; this.transforms[i] = field.transform(); } }
Example 2
Source File: NamedReference.java, from iceberg (Apache License 2.0)
@Override
public BoundReference<T> bind(Types.StructType struct, boolean caseSensitive) {
  Schema schema = new Schema(struct.fields());
  // exact or case-insensitive name lookup, depending on how the reference is bound
  Types.NestedField field = caseSensitive ?
      schema.findField(name) :
      schema.caseInsensitiveFindField(name);
  ValidationException.check(field != null,
      "Cannot find field '%s' in struct: %s", name, schema.asStruct());
  return new BoundReference<>(field, schema.accessorForField(field.fieldId()));
}
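The bind method above pairs findField with Schema#accessorForField, a common pattern for reading a looked-up field out of a row. A minimal sketch of that pairing, assuming the illustrative schema from the sketch at the top and a hypothetical StructLike row:

// assumes 'schema' from the introductory sketch and some StructLike 'row'
Types.NestedField field = schema.caseInsensitiveFindField("ID");
if (field == null) {
  throw new IllegalArgumentException("no field named 'ID'");
}
Accessor<StructLike> accessor = schema.accessorForField(field.fieldId());
Object value = accessor.get(row);  // reads the field's value out of the row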
Example 3
Source File: TestWriteMetricsConfig.java, from iceberg (Apache License 2.0)
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
Example 4
Source File: SchemaConverter.java, from dremio-oss (Apache License 2.0)
public static Schema getChildSchemaForStruct(Schema schema, String structName) {
  if (schema == null) {
    return null;
  }
  // findField returns null for unknown names, so structName must name an existing column
  NestedField structField = schema.findField(structName);
  if (!structField.type().isStructType()) {
    return null;
  }
  return new Schema(structField.type().asStructType().fields());
}
Example 5
Source File: SchemaConverter.java, from dremio-oss (Apache License 2.0)
public static Schema getChildSchemaForList(Schema schema, String listName) {
  if (schema == null) {
    return null;
  }
  NestedField listField = schema.findField(listName);
  if (!listField.type().isListType()) {
    return null;
  }
  // a list type carries a single element field, at index 0
  return new Schema(listField.type().asListType().fields().get(0));
}
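A short usage sketch of the two helpers above; the column names are assumptions about a schema that actually contains a struct column and a list column. Note that both helpers dereference findField's result without a null check, so an unknown name throws a NullPointerException rather than returning null:

// hypothetical schema with a struct column "address" and a list column "tags"
Schema addressSchema = SchemaConverter.getChildSchemaForStruct(schema, "address");
Schema tagsSchema = SchemaConverter.getChildSchemaForList(schema, "tags");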
Example 6
Source File: IcebergOrcFileWriter.java, from presto (Apache License 2.0)
private static Metrics computeMetrics(
    Schema icebergSchema,
    ColumnMetadata<OrcType> orcColumns,
    long fileRowCount,
    Optional<ColumnMetadata<ColumnStatistics>> columnStatistics) {
  if (columnStatistics.isEmpty()) {
    return new Metrics(fileRowCount, null, null, null, null, null);
  }
  // Columns that are descendants of LIST or MAP types are excluded because:
  // 1. Their stats are not used by Apache Iceberg to filter out data files
  // 2. Their record count can be larger than table-level row count. There's no good way to calculate nullCounts for them.
  // See https://github.com/apache/iceberg/pull/199#discussion_r429443627
  Set<OrcColumnId> excludedColumns = getExcludedColumns(orcColumns);

  ImmutableMap.Builder<Integer, Long> valueCountsBuilder = ImmutableMap.builder();
  ImmutableMap.Builder<Integer, Long> nullCountsBuilder = ImmutableMap.builder();
  ImmutableMap.Builder<Integer, ByteBuffer> lowerBoundsBuilder = ImmutableMap.builder();
  ImmutableMap.Builder<Integer, ByteBuffer> upperBoundsBuilder = ImmutableMap.builder();

  // OrcColumnId(0) is the root column that represents file-level schema
  for (int i = 1; i < orcColumns.size(); i++) {
    OrcColumnId orcColumnId = new OrcColumnId(i);
    if (excludedColumns.contains(orcColumnId)) {
      continue;
    }
    OrcType orcColumn = orcColumns.get(orcColumnId);
    ColumnStatistics orcColumnStats = columnStatistics.get().get(orcColumnId);
    int icebergId = getIcebergId(orcColumn);
    Types.NestedField icebergField = icebergSchema.findField(icebergId);
    verify(icebergField != null, "Cannot find Iceberg column with ID %s in schema %s", icebergId, icebergSchema);
    valueCountsBuilder.put(icebergId, fileRowCount);
    if (orcColumnStats.hasNumberOfValues()) {
      nullCountsBuilder.put(icebergId, fileRowCount - orcColumnStats.getNumberOfValues());
    }
    toIcebergMinMax(orcColumnStats, icebergField.type()).ifPresent(minMax -> {
      lowerBoundsBuilder.put(icebergId, minMax.getMin());
      upperBoundsBuilder.put(icebergId, minMax.getMax());
    });
  }
  Map<Integer, Long> valueCounts = valueCountsBuilder.build();
  Map<Integer, Long> nullCounts = nullCountsBuilder.build();
  Map<Integer, ByteBuffer> lowerBounds = lowerBoundsBuilder.build();
  Map<Integer, ByteBuffer> upperBounds = upperBoundsBuilder.build();
  return new Metrics(
      fileRowCount,
      null, // TODO: Add column size accounting to ORC column writers
      valueCounts.isEmpty() ? null : valueCounts,
      nullCounts.isEmpty() ? null : nullCounts,
      lowerBounds.isEmpty() ? null : lowerBounds,
      upperBounds.isEmpty() ? null : upperBounds);
}
Example 7
Source File: TestWriteMetricsConfig.java, from iceberg (Apache License 2.0)
@Test
public void testCustomMetricCollectionForNestedParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.builderFor(COMPLEX_SCHEMA)
      .identity("strCol")
      .build();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  properties.put("write.metadata.metrics.column.longCol", "counts");
  properties.put("write.metadata.metrics.column.record.id", "full");
  properties.put("write.metadata.metrics.column.record.data", "truncate(2)");
  Table table = tables.create(COMPLEX_SCHEMA, spec, properties, tableLocation);

  Iterable<InternalRow> rows = RandomData.generateSpark(COMPLEX_SCHEMA, 10, 0);
  JavaRDD<InternalRow> rdd = sc.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(COMPLEX_SCHEMA), false);

  df.coalesce(1).write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField longCol = schema.findField("longCol");
  Types.NestedField recordId = schema.findField("record.id");
  Types.NestedField recordData = schema.findField("record.data");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();

    Map<Integer, Long> nullValueCounts = file.nullValueCounts();
    Assert.assertEquals(3, nullValueCounts.size());
    Assert.assertTrue(nullValueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(nullValueCounts.containsKey(recordData.fieldId()));

    Map<Integer, Long> valueCounts = file.valueCounts();
    Assert.assertEquals(3, valueCounts.size());
    Assert.assertTrue(valueCounts.containsKey(longCol.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordId.fieldId()));
    Assert.assertTrue(valueCounts.containsKey(recordData.fieldId()));

    Map<Integer, ByteBuffer> lowerBounds = file.lowerBounds();
    Assert.assertEquals(2, lowerBounds.size());
    Assert.assertTrue(lowerBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataLowerBound = lowerBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataLowerBound).length);

    Map<Integer, ByteBuffer> upperBounds = file.upperBounds();
    Assert.assertEquals(2, upperBounds.size());
    Assert.assertTrue(upperBounds.containsKey(recordId.fieldId()));
    ByteBuffer recordDataUpperBound = upperBounds.get(recordData.fieldId());
    Assert.assertEquals(2, ByteBuffers.toByteArray(recordDataUpperBound).length);
  }
}
Example 8
Source File: IcebergTableHandler.java, from metacat (Apache License 2.0)
/**
 * Updates the iceberg schema if the provided tableInfo has updated field comments.
 *
 * @param tableInfo table information
 * @return true if an update is done
 */
public boolean update(final TableInfo tableInfo) {
  boolean result = false;
  final List<FieldInfo> fields = tableInfo.getFields();
  if (fields != null && !fields.isEmpty()
      // This parameter is only sent during data change and not during schema change.
      && Strings.isNullOrEmpty(tableInfo.getMetadata().get(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION))) {
    final QualifiedName tableName = tableInfo.getName();
    final String tableMetadataLocation = HiveTableUtil.getIcebergTableMetadataLocation(tableInfo);
    if (Strings.isNullOrEmpty(tableMetadataLocation)) {
      final String message = String.format("No metadata location specified for table %s", tableName);
      log.error(message);
      throw new MetacatBadRequestException(message);
    }
    final IcebergMetastoreTables icebergMetastoreTables = new IcebergMetastoreTables(
        new IcebergTableOps(conf, tableMetadataLocation, connectorContext.getConfig(), icebergTableOpsProxy));
    final Table table = icebergMetastoreTables.loadTable(
        HiveTableUtil.qualifiedNameToTableIdentifier(tableName));
    final UpdateSchema updateSchema = table.updateSchema();
    final Schema schema = table.schema();
    for (FieldInfo field : fields) {
      final Types.NestedField iField = schema.findField(field.getName());
      if (iField != null && !Objects.equals(field.getComment(), iField.doc())) {
        updateSchema.updateColumnDoc(field.getName(), field.getComment());
        result = true;
      }
    }
    if (result) {
      updateSchema.commit();
      final String newTableMetadataLocation = icebergMetastoreTables.getTableOps().currentMetadataLocation();
      if (!tableMetadataLocation.equalsIgnoreCase(newTableMetadataLocation)) {
        tableInfo.getMetadata().put(DirectSqlTable.PARAM_PREVIOUS_METADATA_LOCATION, tableMetadataLocation);
        tableInfo.getMetadata().put(DirectSqlTable.PARAM_METADATA_LOCATION, newTableMetadataLocation);
      }
    }
  }
  return result;
}
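Stripped of the Metacat plumbing, the core findField pattern in this last example is: look up each incoming column by name, and stage a doc update only when the field exists and the comment actually changed. A minimal sketch, assuming a loaded Iceberg Table and a hypothetical Map<String, String> of column comments:

UpdateSchema updateSchema = table.updateSchema();
Schema schema = table.schema();
boolean changed = false;
for (Map.Entry<String, String> entry : comments.entrySet()) {
  Types.NestedField field = schema.findField(entry.getKey());
  // skip unknown columns and unchanged comments
  if (field != null && !Objects.equals(entry.getValue(), field.doc())) {
    updateSchema.updateColumnDoc(entry.getKey(), entry.getValue());
    changed = true;
  }
}
if (changed) {
  updateSchema.commit();  // writes a new metadata version
}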