org.apache.iceberg.AppendFiles Java Examples
The following examples show how to use
org.apache.iceberg.AppendFiles.
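Before the full examples, here is a minimal sketch of the core call pattern, assuming an existing Table named table (the path, file size, and record count below are illustrative placeholders):

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.Table;

// Describe a data file that has already been written to storage.
DataFile dataFile = DataFiles.builder(table.spec())
    .withPath("/path/to/data-a.parquet")  // placeholder path
    .withFileSizeInBytes(1024L)           // placeholder size
    .withRecordCount(100L)                // placeholder count
    .build();

// Stage the file and commit; each commit() produces a new table snapshot.
AppendFiles append = table.newAppend();
append.appendFile(dataFile);
append.commit();

An AppendFiles can also be obtained from a Transaction via txn.newAppend(), as several of the examples below do, in which case the appended files become visible only when commitTransaction() is called.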
Example #1
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
Example #2
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testIdentityPartitionProjections() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(LOG_SCHEMA, IDENTITY_PARTITION_SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), location.toString());
  List<Record> inputRecords = RandomGenericData.generate(LOG_SCHEMA, 10, 0);

  Integer idx = 0;
  AppendFiles append = table.newAppend();
  for (Record record : inputRecords) {
    record.set(1, "2020-03-2" + idx);
    record.set(2, idx.toString());
    append.appendFile(writeFile(
        table, Row.of("2020-03-2" + idx, idx.toString()), format, ImmutableList.of(record)));
    idx += 1;
  }
  append.commit();

  // individual fields
  validateIdentityPartitionProjections(location.toString(), withColumns("date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("id"), inputRecords);

  // field pairs
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "level"), inputRecords);

  // out-of-order pairs
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "date"), inputRecords);

  // full projection
  validateIdentityPartitionProjections(location.toString(), LOG_SCHEMA, inputRecords);

  // out-of-order triplets
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "level", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "date", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "message", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "message", "date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "date", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "level", "date"), inputRecords);
}
Example #3
Source File: IcebergMetadata.java From presto with Apache License 2.0
@Override
public Optional<ConnectorOutputMetadata> finishInsert(
    ConnectorSession session,
    ConnectorInsertTableHandle insertHandle,
    Collection<Slice> fragments,
    Collection<ComputedStatistics> computedStatistics)
{
    IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle;
    org.apache.iceberg.Table icebergTable = transaction.table();

    List<CommitTaskData> commitTasks = fragments.stream()
            .map(slice -> commitTaskCodec.fromJson(slice.getBytes()))
            .collect(toImmutableList());

    Type[] partitionColumnTypes = icebergTable.spec().fields().stream()
            .map(field -> field.transform().getResultType(
                    icebergTable.schema().findType(field.sourceId())))
            .toArray(Type[]::new);

    AppendFiles appendFiles = transaction.newFastAppend();

    for (CommitTaskData task : commitTasks) {
        HdfsContext context = new HdfsContext(session, table.getSchemaName(), table.getTableName());
        Configuration configuration = hdfsEnvironment.getConfiguration(context, new Path(task.getPath()));

        DataFiles.Builder builder = DataFiles.builder(icebergTable.spec())
                .withInputFile(HadoopInputFile.fromLocation(task.getPath(), configuration))
                .withFormat(table.getFileFormat())
                .withMetrics(task.getMetrics().metrics());

        if (!icebergTable.spec().fields().isEmpty()) {
            String partitionDataJson = task.getPartitionDataJson()
                    .orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
            builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
        }

        appendFiles.appendFile(builder.build());
    }

    appendFiles.commit();
    transaction.commitTransaction();

    return Optional.of(new HiveWrittenPartitions(commitTasks.stream()
            .map(CommitTaskData::getPath)
            .collect(toImmutableList())));
}
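Note the append mode: unlike most examples here, this one uses transaction.newFastAppend(). A fast append always writes a new manifest for the incoming files instead of merging them into existing manifests, which keeps commit latency low on the write path at the cost of more manifests to maintain; plain newAppend() applies the table's normal manifest-merging behavior. Both return the same AppendFiles interface, so the rest of the call pattern is unchanged.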
Example #4
Source File: TestIcebergPartitions.java From dremio-oss with Apache License 2.0
@Test
public void testNonIdentityPartitions() throws Exception {
  File root = tempDir.newFolder();
  HadoopTables tables = new HadoopTables(conf);
  PartitionSpec partitionSpec = PartitionSpec
      .builderFor(schema)
      .bucket(NAME, 2)
      .build();
  Table table = tables.create(schema, partitionSpec, root.getAbsolutePath());

  // Append some data files.
  Transaction transaction = table.newTransaction();
  AppendFiles appendFiles = transaction.newAppend();
  appendFiles.appendFile(createDataFile(root, "d1", 1, "jack", 100));
  appendFiles.appendFile(createDataFile(root, "d2", 1, "jack", 200));
  appendFiles.appendFile(createDataFile(root, "d3", 2, "jill", 300));
  appendFiles.appendFile(createDataFile(root, "d4", 2, "jill", 400));
  appendFiles.appendFile(createDataFile(root, "d5", 2, "jill", 500));
  appendFiles.commit();
  transaction.commitTransaction();

  try {
    IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(),
        HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
    fail("Expected error while reading metadata of iceberg table with non-identity partition field");
  } catch (Exception ex) {
    Assert.assertTrue("UserException expected", ex instanceof UserException);
    UserException uex = (UserException) ex;
    Assert.assertEquals("Invalid ErrorType. Expected "
        + UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION
        + " but got " + uex.getErrorType(),
        UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION, uex.getErrorType());
    String expectedErrorMsg = "Column values and partition values are not same for [name] column";
    Assert.assertTrue("Expected message to contain " + expectedErrorMsg + " but was "
        + uex.getOriginalMessage() + " instead",
        uex.getOriginalMessage().contains(expectedErrorMsg));
  }
}
Example #5
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
/**
 * Import files from given partitions to an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions partitions to import
 * @param targetTable an Iceberg table where to import the data
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(
    SparkSession spark, List<SparkPartition> partitions, Table targetTable,
    PartitionSpec spec, String stagingDir) {
  Configuration conf = spark.sessionState().newHadoopConf();
  SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
  int parallelism = Math.min(partitions.size(),
      spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
  int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());

  JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);

  Dataset<SparkPartition> partitionDS = spark.createDataset(
      partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class));

  List<ManifestFile> manifests = partitionDS
      .flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition ->
              listPartition(sparkPartition, spec, serializableConf, metricsConfig).iterator(),
          Encoders.javaSerialization(DataFile.class))
      .repartition(numShufflePartitions)
      .map((MapFunction<DataFile, Tuple2<String, DataFile>>) file ->
              Tuple2.apply(file.path().toString(), file),
          Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
      .orderBy(col("_1"))
      .mapPartitions(
          (MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple ->
              buildManifest(serializableConf, spec, stagingDir, fileTuple),
          Encoders.javaSerialization(ManifestFile.class))
      .collectAsList();

  try {
    boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(
        targetTable.properties(),
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);

    AppendFiles append = targetTable.newAppend();
    manifests.forEach(append::appendManifest);
    append.commit();

    if (!snapshotIdInheritanceEnabled) {
      // delete original manifests as they were rewritten before the commit
      deleteManifests(targetTable.io(), manifests);
    }
  } catch (Throwable e) {
    deleteManifests(targetTable.io(), manifests);
    throw e;
  }
}
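The import path above appends staged manifest files with appendManifest rather than individual data files. A minimal sketch of that variant, assuming an existing Table named table and an already-written ManifestFile named manifest:

AppendFiles append = table.newAppend();
// One call adds every data file tracked by the manifest. Unless snapshot ID
// inheritance is enabled on the table, the commit rewrites the manifest,
// which is why the method above deletes the staged originals afterward.
append.appendManifest(manifest);
append.commit();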
Example #6
Source File: TestCreateTable.java From dremio-oss with Apache License 2.0
@Test
public void testDroppingOfMapTypeColumn() throws Exception {
  String table1 = "iceberg_map_test";
  try {
    File table1Folder = new File(getDfsTestTmpSchemaLocation(), table1);
    HadoopTables hadoopTables = new HadoopTables(new Configuration());

    Schema schema = new Schema(
        Types.NestedField.optional(1, "col1",
            Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get())),
        Types.NestedField.optional(2, "col2", Types.IntegerType.get())
    );
    PartitionSpec spec = PartitionSpec
        .builderFor(schema)
        .build();
    Table table = hadoopTables.create(schema, spec, table1Folder.getPath());

    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    final String testWorkingPath = TestTools.getWorkingPath() + "/src/test/resources/iceberg/mapTest";
    final String parquetFile = "iceberg_map_test.parquet";
    File dataFile = new File(testWorkingPath, parquetFile);
    appendFiles.appendFile(
        DataFiles.builder(spec)
            .withInputFile(Files.localInput(dataFile))
            .withRecordCount(1)
            .withFormat(FileFormat.PARQUET)
            .build()
    );
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select * from dfs_test.iceberg_map_test")
        .unOrdered()
        .baselineColumns("col2")
        .baselineValues(1)
        .build()
        .run();

    // sleeps ensure the table mtime advances between operations
    Thread.sleep(1001);
    String insertCommandSql = "insert into dfs_test.iceberg_map_test select * from (values(2))";
    test(insertCommandSql);
    Thread.sleep(1001);

    testBuilder()
        .sqlQuery("select * from dfs_test.iceberg_map_test")
        .unOrdered()
        .baselineColumns("col2")
        .baselineValues(1)
        .baselineValues(2)
        .build()
        .run();
  } finally {
    FileUtils.deleteQuietly(new File(getDfsTestTmpSchemaLocation(), table1));
  }
}
Example #7
Source File: TestIcebergTableDrop.java From dremio-oss with Apache License 2.0
@Test
public void testDropTable() throws Exception {
  try (AutoCloseable c = enableIcebergTables()) {
    Path rootPath = Paths.get(getDfsTestTmpSchemaLocation(), "iceberg", "nation");
    Files.createDirectories(rootPath);
    String root = rootPath.toString();
    String tableName = "dfs_test.iceberg.nation";

    HadoopTables tables = new HadoopTables(conf);
    Table table = tables.create(schema, null, root);
    IcebergTableInfo tableInfo =
        new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs), conf, root)
            .getTableInfo();
    assertEquals(tableInfo.getRecordCount(), 0);

    // Append some data files.
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d1"));
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(25L)
        .build()
        .run();

    testBuilder()
        .sqlQuery("DROP TABLE " + tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Table [%s] dropped", tableName))
        .build()
        .run();

    errorMsgTestHelper(
        "select count(*) c from " + tableName,
        "Table '" + tableName + "' not found");
  }
}
Example #8
Source File: TestRefresh.java From dremio-oss with Apache License 2.0
@Test
public void testRefresh() throws Exception {
  try (AutoCloseable c = enableIcebergTables()) {
    Path rootPath = Paths.get(getDfsTestTmpSchemaLocation(), "iceberg", "metadata_refresh");
    Files.createDirectories(rootPath);
    String root = rootPath.toString();
    String tableName = "dfs_test.iceberg.metadata_refresh";

    HadoopTables tables = new HadoopTables(conf);
    Table table = tables.create(schema, null, root);
    IcebergTableInfo tableInfo =
        new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs), conf, root)
            .getTableInfo();
    assertEquals(tableInfo.getRecordCount(), 0);

    // Append some data files.
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d1"));
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(25L)
        .build()
        .run();

    // to detect an mtime change.
    Thread.sleep(1000);

    // refresh without an update
    testBuilder()
        .sqlQuery("ALTER TABLE " + tableName + " REFRESH METADATA")
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(
            true,
            String.format(
                "Table '%s' read signature reviewed but source stated metadata is unchanged, no refresh occurred.",
                tableName))
        .build()
        .run();

    // Do another append
    transaction = table.newTransaction();
    appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d2"));
    appendFiles.commit();
    transaction.commitTransaction();

    // refresh
    testBuilder()
        .sqlQuery("ALTER TABLE " + tableName + " REFRESH METADATA")
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Metadata for table '%s' refreshed.", tableName))
        .build()
        .run();

    // validate increased row count
    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(50L)
        .build()
        .run();
  }
}
Example #9
Source File: TestIcebergPartitions.java From dremio-oss with Apache License 2.0
@Test
public void testPartitions() throws Exception {
  File root = tempDir.newFolder();
  HadoopTables tables = new HadoopTables(conf);
  Table table = tables.create(schema, spec, root.getAbsolutePath());

  // test empty table.
  IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(),
      HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
  assertEquals(tableInfo.getRecordCount(), 0);
  List<String> expectedColumns = Arrays.asList(ID, NAME);
  assertEquals(expectedColumns, tableInfo.getPartitionColumns());
  assertEquals(0, ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()).size());

  // Append some data files.
  Transaction transaction = table.newTransaction();
  AppendFiles appendFiles = transaction.newAppend();
  appendFiles.appendFile(createDataFile(root, "d1", 1, "jack", 100));
  appendFiles.appendFile(createDataFile(root, "d2", 1, "jack", 200));
  appendFiles.appendFile(createDataFile(root, "d3", 2, "jill", 300));
  appendFiles.appendFile(createDataFile(root, "d4", 2, "jill", 400));
  appendFiles.appendFile(createDataFile(root, "d5", 2, "jill", 500));
  appendFiles.commit();
  transaction.commitTransaction();

  tableInfo = new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs),
      conf, root.getAbsolutePath()).getTableInfo();
  assertEquals(1500, tableInfo.getRecordCount());
  assertEquals(2, ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()).size());

  // validate first partition
  final AtomicLong recordCount = new AtomicLong(0);
  PartitionChunk p1 = findPartition(
      ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()), 1, "jack");
  assertNotNull(p1);
  assertEquals(2, p1.getSplitCount());
  p1.getSplits().iterator().forEachRemaining(x -> recordCount.addAndGet(x.getRecordCount()));
  assertEquals(300, recordCount.intValue());

  // validate second partition
  PartitionChunk p2 = findPartition(
      ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()), 2, "jill");
  assertNotNull(p2);
  assertEquals(3, p2.getSplitCount());
  recordCount.set(0);
  p2.getSplits().iterator().forEachRemaining(x -> recordCount.addAndGet(x.getRecordCount()));
  assertEquals(1200, recordCount.intValue());
}