org.apache.iceberg.Table Java Examples
The following examples show how to use org.apache.iceberg.Table.
Each example notes the original open-source project, source file, and license above the code.
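For orientation before the individual examples, here is a minimal sketch of the pattern most of them build on: loading a Table through HadoopTables and reading its metadata. The warehouse path is a placeholder, and the snippet assumes an Iceberg table already exists at that location.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

public class LoadTableSketch {
  public static void main(String[] args) {
    // Placeholder location; point this at an existing Iceberg table directory.
    String tableLocation = "file:///tmp/warehouse/my_table";
    HadoopTables tables = new HadoopTables(new Configuration());
    Table table = tables.load(tableLocation);

    // Table exposes the metadata used throughout the examples below:
    // schema, partition spec, and the current snapshot.
    System.out.println(table.schema());
    System.out.println(table.spec());
    System.out.println(table.currentSnapshot());
  }
}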
Example #1
Source File: TestSnapshotSelection.java From iceberg with Apache License 2.0
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, tableLocation);

  List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
  firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  long timestamp = System.currentTimeMillis();
  long snapshotId = table.currentSnapshot().snapshotId();
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", snapshotId)
      .option("as-of-timestamp", timestamp)
      .load(tableLocation);

  df.collectAsList();
}
Example #2
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0
@Test
public void testStaleMetadata() throws Exception {
  Table tableCopy = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  // prepare changes on the copy without committing
  UpdateSchema updateCopy = tableCopy.updateSchema()
      .addColumn("m", Types.IntegerType.get());
  updateCopy.apply();

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertNotEquals("Unmodified copy should be out of date after update",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  // update the table
  tableCopy.refresh();

  Assert.assertEquals("Copy should be back in sync",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  AssertHelpers.assertThrows("Should fail with stale base metadata",
      CommitFailedException.class, "based on stale table metadata", updateCopy::commit);

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
Example #3
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0
RewriteManifestsAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  this.table = table;
  this.spec = table.spec();
  this.targetManifestSizeBytes = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.MANIFEST_TARGET_SIZE_BYTES,
      TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
  this.fileIO = SparkUtil.serializableFileIO(table);

  // default the staging location to the metadata location
  TableOperations ops = ((HasTableOperations) table).operations();
  Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
  this.stagingLocation = metadataFilePath.getParent().toString();

  // use the current table format version for new manifests
  this.formatVersion = ops.current().formatVersion();
}
Example #4
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // Do not combine or split files because the tests expect a split per partition.
  // A target split size of 2048 helps us achieve that.
  byId.updateProperties().set("read.split.target-size", "2048").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
Example #5
Source File: IcebergSplitManager.java From presto with Apache License 2.0
@Override
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session,
    ConnectorTableHandle handle, SplitSchedulingStrategy splitSchedulingStrategy) {
  IcebergTableHandle table = (IcebergTableHandle) handle;
  HiveMetastore metastore = transactionManager.get(transaction).getMetastore();
  Table icebergTable = getIcebergTable(metastore, hdfsEnvironment, session, table.getSchemaTableName());

  TableScan tableScan = getTableScan(session, table.getPredicate(), table.getSnapshotId(), icebergTable);

  // TODO Use residual. Right now there is no way to propagate residual to presto but at least we can
  //  propagate it at split level so the parquet pushdown can leverage it.
  IcebergSplitSource splitSource = new IcebergSplitSource(tableScan.planTasks());

  return new ClassLoaderSafeConnectorSplitSource(splitSource, Thread.currentThread().getContextClassLoader());
}
Example #6
Source File: RowDataRewriter.java From iceberg with Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
    Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
Example #7
Source File: SnapshotsTable.java From presto with Apache License 2.0
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable) {
  PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata);
  TimeZoneKey timeZoneKey = session.getTimeZoneKey();

  icebergTable.snapshots().forEach(snapshot -> {
    pagesBuilder.beginRow();
    pagesBuilder.appendTimestamp(packDateTimeWithZone(snapshot.timestampMillis(), timeZoneKey));
    pagesBuilder.appendBigint(snapshot.snapshotId());
    if (checkNonNull(snapshot.parentId(), pagesBuilder)) {
      pagesBuilder.appendBigint(snapshot.parentId());
    }
    if (checkNonNull(snapshot.operation(), pagesBuilder)) {
      pagesBuilder.appendVarchar(snapshot.operation());
    }
    if (checkNonNull(snapshot.manifestListLocation(), pagesBuilder)) {
      pagesBuilder.appendVarchar(snapshot.manifestListLocation());
    }
    if (checkNonNull(snapshot.summary(), pagesBuilder)) {
      pagesBuilder.appendVarcharVarcharMap(snapshot.summary());
    }
    pagesBuilder.endRow();
  });

  return pagesBuilder.build();
}
Example #8
Source File: HadoopTables.java From iceberg with Apache License 2.0
/**
 * Loads the table location from a FileSystem path location.
 *
 * @param location a path URI (e.g. hdfs:///warehouse/my_table/)
 * @return table implementation
 */
@Override
public Table load(String location) {
  TableOperations ops = newTableOps(location);
  if (ops.current() == null) {
    // try to resolve a metadata table, which we encode as URI fragments
    // e.g. hdfs:///warehouse/my_table#snapshots
    int hashIndex = location.lastIndexOf('#');
    if (hashIndex != -1 && location.length() - 1 != hashIndex) {
      // we found char '#', and it is not the last char of location
      String baseTable = location.substring(0, hashIndex);
      String metaTable = location.substring(hashIndex + 1);
      MetadataTableType type = MetadataTableType.from(metaTable);
      if (type != null) {
        return loadMetadataTable(baseTable, type);
      } else {
        throw new NoSuchTableException("Table does not exist at location: " + location);
      }
    } else {
      throw new NoSuchTableException("Table does not exist at location: " + location);
    }
  }

  return new BaseTable(ops, location);
}
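As a usage note for the load method above, the URI-fragment form described in its Javadoc can be exercised as in the following sketch; the path is a placeholder and assumes a table already exists there.

// Load the "snapshots" metadata table for an existing table by appending a URI fragment.
// The base path below is a placeholder.
HadoopTables tables = new HadoopTables(new Configuration());
Table snapshotsTable = tables.load("hdfs:///warehouse/my_table#snapshots");
System.out.println(snapshotsTable.schema());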
Example #9
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
Example #10
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testReplaceTableTxnTableModifiedConcurrently() {
  Table table = catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
  Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, false);

  // update the table concurrently
  table.updateProperties()
      .set("another-prop", "another-value")
      .commit();

  txn.updateProperties()
      .set("prop", "value")
      .commit();
  txn.commitTransaction();

  // the replace should still succeed
  table = catalog.loadTable(TABLE_IDENTIFIER);
  Assert.assertNull("Table props should be updated", table.properties().get("another-prop"));
  Assert.assertEquals("Table props should match", "value", table.properties().get("prop"));
}
Example #11
Source File: IcebergSourceFlatDataBenchmark.java From iceberg with Apache License 2.0
@Override
protected final Table initTable() {
  Schema schema = new Schema(
      required(1, "longCol", Types.LongType.get()),
      required(2, "intCol", Types.IntegerType.get()),
      required(3, "floatCol", Types.FloatType.get()),
      optional(4, "doubleCol", Types.DoubleType.get()),
      optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
      optional(6, "dateCol", Types.DateType.get()),
      optional(7, "timestampCol", Types.TimestampType.withZone()),
      optional(8, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
Example #12
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testSnapshotReads() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  table.newAppend()
      .appendFile(writeFile(table, null, format, expectedRecords))
      .commit();
  long snapshotId = table.currentSnapshot().snapshotId();
  table.newAppend()
      .appendFile(writeFile(table, null, format, RandomGenericData.generate(table.schema(), 1, 0L)))
      .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .snapshotId(snapshotId);
  validate(job, expectedRecords);
}
Example #13
Source File: TestHadoopCatalog.java From iceberg with Apache License 2.0
@Test
public void testCreateAndDropTableWithoutNamespace() throws Exception {
  Configuration conf = new Configuration();
  String warehousePath = temp.newFolder().getAbsolutePath();
  HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);

  TableIdentifier testTable = TableIdentifier.of("tbl");
  Table table = catalog.createTable(testTable, SCHEMA, PartitionSpec.unpartitioned());

  Assert.assertEquals(table.schema().toString(), TABLE_SCHEMA.toString());
  Assert.assertEquals("hadoop.tbl", table.toString());

  String metaLocation = catalog.defaultWarehouseLocation(testTable);
  FileSystem fs = Util.getFs(new Path(metaLocation), conf);
  Assert.assertTrue(fs.isDirectory(new Path(metaLocation)));

  catalog.dropTable(testTable);
  Assert.assertFalse(fs.isDirectory(new Path(metaLocation)));
}
Example #14
Source File: IcebergSource.java From iceberg with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
    DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
Example #15
Source File: IcebergStorage.java From iceberg with Apache License 2.0
private Table load(String location, Job job) throws IOException {
  if (iceberg == null) {
    Class<?> tablesImpl = job.getConfiguration().getClass(PIG_ICEBERG_TABLES_IMPL, HadoopTables.class);
    LOG.info("Initializing iceberg tables implementation: {}", tablesImpl);
    iceberg = (Tables) ReflectionUtils.newInstance(tablesImpl, job.getConfiguration());
  }

  Table result = tables.get(location);

  if (result == null) {
    try {
      LOG.info("[{}]: Loading table for location: {}", signature, location);
      result = iceberg.load(location);
      tables.put(location, result);
    } catch (Exception e) {
      throw new FrontendException("Failed to instantiate tables implementation", e);
    }
  }

  return result;
}
Example #16
Source File: HiveTableTest.java From iceberg with Apache License 2.0
@Test
public void testExistingTableUpdate() throws TException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
  // add a column
  icebergTable.updateSchema().addColumn("data", Types.LongType.get()).commit();

  icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  // Only 2 snapshotFile Should exist and no manifests should exist
  Assert.assertEquals(2, metadataVersionFiles(TABLE_NAME).size());
  Assert.assertEquals(0, manifestFiles(TABLE_NAME).size());
  Assert.assertEquals(altered.asStruct(), icebergTable.schema().asStruct());

  final org.apache.hadoop.hive.metastore.api.Table table = metastoreClient.getTable(DB_NAME, TABLE_NAME);
  final List<String> hiveColumns = table.getSd().getCols().stream()
      .map(FieldSchema::getName)
      .collect(Collectors.toList());
  final List<String> icebergColumns = altered.columns().stream()
      .map(Types.NestedField::name)
      .collect(Collectors.toList());
  Assert.assertEquals(icebergColumns, hiveColumns);
}
Example #17
Source File: TestIcebergCTASWithPartition.java From dremio-oss with Apache License 2.0
private void verifyPartitionValue(String tableFolder, Class expectedClass, Object expectedValue) {
  Table table = new HadoopTables(new Configuration()).load(tableFolder);
  for (FileScanTask fileScanTask : table.newScan().planFiles()) {
    StructLike structLike = fileScanTask.file().partition();
    Assert.assertEquals(structLike.get(0, expectedClass), expectedValue);
  }
}
Example #18
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testPartitionedByIdStartsWith() {
  Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of(
      "path", table.location()));

  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, new StringStartsWith("data", "junc"));
  Batch scan = builder.build().toBatch();

  Assert.assertEquals(1, scan.planInputPartitions().length);
}
Example #19
Source File: SparkTable.java From iceberg with Apache License 2.0
public SparkTable(Table icebergTable, StructType requestedSchema) {
  this.icebergTable = icebergTable;
  this.requestedSchema = requestedSchema;

  if (requestedSchema != null) {
    // convert the requested schema to throw an exception if any requested fields are unknown
    SparkSchemaUtil.convert(icebergTable.schema(), requestedSchema);
  }
}
Example #20
Source File: SchemaEvolutionTest.java From iceberg with Apache License 2.0
@Test
public void widenDecimalPrecision() throws IOException {
  // Set up a new table to test this conversion
  Schema schema = new Schema(optional(1, "decimal", Types.DecimalType.of(2, 2)));
  File location = Files.createTempDirectory("temp").toFile();
  HadoopTables tables = new HadoopTables(spark.sparkContext().hadoopConfiguration());
  Table decimalTable = tables.create(schema, location.toString());

  decimalTable.updateSchema().updateColumn("decimal", Types.DecimalType.of(4, 2)).commit();

  log.info("Widen decimal type:\n" + decimalTable.schema().toString());
}
Example #21
Source File: TestRemoveOrphanFilesAction.java From iceberg with Apache License 2.0
@Test
public void testOlderThanTimestamp() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")
  );
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);

  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  Thread.sleep(1000);

  long timestamp = System.currentTimeMillis();

  Thread.sleep(1000);

  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  Actions actions = Actions.forTable(table);

  List<String> result = actions.removeOrphanFiles()
      .olderThan(timestamp)
      .execute();

  Assert.assertEquals("Should delete only 2 files", 2, result.size());
}
Example #22
Source File: TestRewriteDataFilesAction.java From iceberg with Apache License 2.0
@Test
public void testRewriteDataFilesEmptyTable() {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  Assert.assertNull("Table must be empty", table.currentSnapshot());

  Actions actions = Actions.forTable(table);
  actions.rewriteDataFiles().execute();

  Assert.assertNull("Table must stay empty", table.currentSnapshot());
}
Example #23
Source File: TestSparkTableUtilWithInMemoryCatalog.java From iceberg with Apache License 2.0
@Test
public void testImportPartitions() throws IOException {
  Table table = TABLES.create(SCHEMA, SPEC, tableLocation);

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
    inputDF.select("id", "data").write()
        .format("parquet")
        .mode("append")
        .option("path", parquetTableLocation)
        .partitionBy("data")
        .saveAsTable("parquet_table");

    File stagingDir = temp.newFolder("staging-dir");
    List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());

    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));

    List<SimpleRecord> actualRecords = spark.read()
        .format("iceberg")
        .load(tableLocation)
        .orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
Example #24
Source File: IcebergMetastoreTables.java From metacat with Apache License 2.0
@Override
public Table createTable(final TableIdentifier identifier,
                         final Schema schema,
                         final PartitionSpec spec,
                         final String location,
                         final Map<String, String> properties) {
  throw new MetacatNotSupportedException("not supported");
}
Example #25
Source File: IcebergSource.java From iceberg with Apache License 2.0
private Table getTableAndResolveHadoopConfiguration(DataSourceOptions options, Configuration conf) {
  // Overwrite configurations from the Spark Context with configurations from the options.
  mergeIcebergHadoopConfs(conf, options.asMap());
  Table table = findTable(options, conf);
  // Set confs from table properties
  mergeIcebergHadoopConfs(conf, table.properties());
  // Re-overwrite values set in options and table properties but were not in the environment.
  mergeIcebergHadoopConfs(conf, options.asMap());
  return table;
}
Example #26
Source File: TestIcebergPartitionData.java From dremio-oss with Apache License 2.0
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData,
    String columnName, Class expectedClass, Object expectedValue) throws Exception {
  File tableFolder = new File(folder.getRoot(), "icebergPartitionTest");
  try {
    tableFolder.mkdir();
    File dataFile = new File(folder.getRoot(), "a.parquet");
    dataFile.createNewFile();

    DataFile d1 = DataFiles.builder(partitionSpec)
        .withInputFile(Files.localInput(dataFile))
        .withRecordCount(50)
        .withFormat(FileFormat.PARQUET)
        .withPartition(partitionData)
        .build();

    IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(
        Path.of(tableFolder.toPath().toString()),
        (new SchemaConverter()).fromIceberg(schema),
        Lists.newArrayList(columnName),
        new Configuration());
    committer.consumeData(Lists.newArrayList(d1));
    committer.commit();

    Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
      StructLike structLike = fileScanTask.file().partition();
      if (expectedClass == ByteBuffer.class) {
        Assert.assertEquals(structLike.get(0, expectedClass).hashCode(),
            ByteBuffer.wrap((byte[]) expectedValue).hashCode());
      } else {
        Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
      }
    }
  } finally {
    tableFolder.delete();
  }
}
Example #27
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testPartitionedByDataStartsWithFilter() {
  Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");
  CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));

  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, new StringStartsWith("data", "junc"));
  Batch scan = builder.build().toBatch();

  Assert.assertEquals(1, scan.planInputPartitions().length);
}
Example #28
Source File: RewriteDataFilesAction.java From iceberg with Apache License 2.0
RewriteDataFilesAction(SparkSession spark, Table table) {
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.table = table;
  this.spec = table.spec();
  this.filter = Expressions.alwaysTrue();
  this.caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive", "false"));

  long splitSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_SIZE,
      TableProperties.SPLIT_SIZE_DEFAULT);
  long targetFileSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetSizeInBytes = Math.min(splitSize, targetFileSize);

  this.splitLookback = PropertyUtil.propertyAsInt(
      table.properties(),
      TableProperties.SPLIT_LOOKBACK,
      TableProperties.SPLIT_LOOKBACK_DEFAULT);
  this.splitOpenFileCost = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_OPEN_FILE_COST,
      TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT);

  this.fileIO = SparkUtil.serializableFileIO(table);
  this.encryptionManager = table.encryption();
}
Example #29
Source File: IcebergSource.java From iceberg with Apache License 2.0
protected Table findTable(Map<String, String> options, Configuration conf) {
  Preconditions.checkArgument(options.containsKey("path"), "Cannot open table: path is not set");
  String path = options.get("path");

  if (path.contains("/")) {
    HadoopTables tables = new HadoopTables(conf);
    return tables.load(path);
  } else {
    HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
    TableIdentifier tableIdentifier = TableIdentifier.parse(path);
    return hiveCatalog.loadTable(tableIdentifier);
  }
}
Example #30
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testFailedResidualFiltering() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");
  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile1)
      .commit();

  Job jobShouldFail1 = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(jobShouldFail1);
  configBuilder.useHiveRows().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of HIVE",
      UnsupportedOperationException.class,
      "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail1, expectedRecords));

  Job jobShouldFail2 = Job.getInstance(conf);
  configBuilder = IcebergInputFormat.configure(jobShouldFail2);
  configBuilder.usePigTuples().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of PIG",
      UnsupportedOperationException.class,
      "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail2, expectedRecords));
}