Java Code Examples for org.apache.iceberg.Table#refresh()
The following examples show how to use org.apache.iceberg.Table#refresh().
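Before the examples, here is a minimal sketch of why refresh() matters: a Table handle caches the metadata that was current when it was loaded, so after another process (or another handle on the same table) commits, the stale handle must be refreshed before it reflects the latest snapshot. The table location below is a placeholder for illustration only; it is not taken from any of the example files.

  // Minimal sketch (placeholder location; not from the examples below).
  HadoopTables tables = new HadoopTables(new Configuration());
  Table table = tables.load("/tmp/warehouse/db/sample_table");

  // ... some other writer commits new data or schema changes here ...

  // Until refresh() is called, this handle still sees the metadata it originally loaded.
  table.refresh();
  System.out.println("Current snapshot: " + table.currentSnapshot().snapshotId());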
Example 1
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0

@Test
public void testStaleMetadata() throws Exception {
  Table tableCopy = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  // prepare changes on the copy without committing
  UpdateSchema updateCopy = tableCopy.updateSchema()
      .addColumn("m", Types.IntegerType.get());
  updateCopy.apply();

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertNotEquals("Unmodified copy should be out of date after update",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  // update the table
  tableCopy.refresh();

  Assert.assertEquals("Copy should be back in sync",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  AssertHelpers.assertThrows("Should fail with stale base metadata",
      CommitFailedException.class, "based on stale table metadata", updateCopy::commit);

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
Example 2
Source File: TestDataFrameWrites.java From iceberg with Apache License 2.0

private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException {
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  Iterable<Record> expected = RandomData.generate(tableSchema, 100, 0L);
  writeData(expected, tableSchema, location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> actual = result.collectAsList();

  Iterator<Record> expectedIter = expected.iterator();
  Iterator<Row> actualIter = actual.iterator();
  while (expectedIter.hasNext() && actualIter.hasNext()) {
    assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next());
  }
  Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext());

  table.currentSnapshot().addedFiles().forEach(dataFile ->
      Assert.assertTrue(
          String.format(
              "File should have the parent directory %s, but has: %s.",
              expectedDataDir.getAbsolutePath(),
              dataFile.path()),
          URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath())));
}
Example 3
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0

@Test
public void testStaleVersionHint() throws Exception {
  Table stale = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  Assert.assertNotEquals("Stale table schema should not match",
      UPDATED_SCHEMA.asStruct(), stale.schema().asStruct());

  // roll the version hint back to 1
  replaceVersionHint(1);

  Table reloaded = TABLES.load(tableLocation);
  Assert.assertEquals("Updated schema for newly loaded table should match",
      UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct());

  stale.refresh();
  Assert.assertEquals("Refreshed schema for stale table should match",
      UPDATED_SCHEMA.asStruct(), stale.schema().asStruct());
}
Example 4
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 5
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table table = TABLES.create(SCHEMA, spec, location.toString());

  // Do not combine or split files because the tests expect a split per partition.
  // A target split size of 2048 helps us achieve that.
  table.updateProperties().set("read.split.target-size", "2048").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(table.location());

  table.refresh();

  return table;
}
Example 6
Source File: TestHiveTableConcurrency.java From iceberg with Apache License 2.0

@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
Example 7
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 8
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testAllEntriesTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "all_entries");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // delete the first file to test that not only live files are listed
  table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit();

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // ensure table data isn't stale
  table.refresh();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "all_entries"))
      .orderBy("snapshot_id")
      .collectAsList();

  List<GenericData.Record> expected = Lists.newArrayList();
  for (ManifestFile manifest : Iterables.concat(Iterables.transform(table.snapshots(), Snapshot::allManifests))) {
    InputFile in = table.io().newInputFile(manifest.path());
    try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
      // each row must inherit snapshot_id and sequence_number
      rows.forEach(row -> {
        row.put(2, 0L);
        GenericData.Record file = (GenericData.Record) row.get("data_file");
        file.put(0, FileContent.DATA.id());
        expected.add(row);
      });
    }
  }

  expected.sort(Comparator.comparing(o -> (Long) o.get("snapshot_id")));

  Assert.assertEquals("Entries table should have 3 rows", 3, expected.size());
  Assert.assertEquals("Actual results should have 3 rows", 3, actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i));
  }
}
Example 9
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testPartitionsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table partitionsTable = loadTable(tableIdentifier, "partitions");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  long firstCommitId = table.currentSnapshot().snapshotId();

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "partitions"))
      .orderBy("partition.id")
      .collectAsList();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      partitionsTable.schema(), "partitions"));
  GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      partitionsTable.schema().findType("partition").asStructType(), "partition"));
  List<GenericData.Record> expected = Lists.newArrayList();
  expected.add(builder
      .set("partition", partitionBuilder.set("id", 1).build())
      .set("record_count", 1L)
      .set("file_count", 1)
      .build());
  expected.add(builder
      .set("partition", partitionBuilder.set("id", 2).build())
      .set("record_count", 1L)
      .set("file_count", 1)
      .build());

  Assert.assertEquals("Partitions table should have two rows", 2, expected.size());
  Assert.assertEquals("Actual results should have two rows", 2, actual.size());
  for (int i = 0; i < 2; i += 1) {
    TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i));
  }

  // check time travel
  List<Row> actualAfterFirstCommit = spark.read()
      .format("iceberg")
      .option("snapshot-id", String.valueOf(firstCommitId))
      .load(loadLocation(tableIdentifier, "partitions"))
      .orderBy("partition.id")
      .collectAsList();

  Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size());
  TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0));
}
Example 10
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testAllManifestsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table manifestTable = loadTable(tableIdentifier, "all_manifests");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);

  List<ManifestFile> manifests = Lists.newArrayList();

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  manifests.addAll(table.currentSnapshot().allManifests());

  table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();

  manifests.addAll(table.currentSnapshot().allManifests());

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "all_manifests"))
      .orderBy("path")
      .collectAsList();

  table.refresh();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema(), "manifests"));
  GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary"));
  List<GenericData.Record> expected = Lists.newArrayList(Iterables.transform(manifests, manifest ->
      builder.set("path", manifest.path())
          .set("length", manifest.length())
          .set("partition_spec_id", manifest.partitionSpecId())
          .set("added_snapshot_id", manifest.snapshotId())
          .set("added_data_files_count", manifest.addedFilesCount())
          .set("existing_data_files_count", manifest.existingFilesCount())
          .set("deleted_data_files_count", manifest.deletedFilesCount())
          .set("partition_summaries", Lists.transform(manifest.partitions(), partition ->
              summaryBuilder
                  .set("contains_null", false)
                  .set("lower_bound", "1")
                  .set("upper_bound", "1")
                  .build()
          ))
          .build()
  ));

  expected.sort(Comparator.comparing(o -> o.get("path").toString()));

  Assert.assertEquals("Manifests table should have two manifest rows", 2, actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i));
  }
}
Example 11
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testManifestsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table manifestTable = loadTable(tableIdentifier, "manifests");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "manifests"))
      .collectAsList();

  table.refresh();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema(), "manifests"));
  GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary"));
  List<GenericData.Record> expected = Lists.transform(table.currentSnapshot().allManifests(), manifest ->
      builder.set("path", manifest.path())
          .set("length", manifest.length())
          .set("partition_spec_id", manifest.partitionSpecId())
          .set("added_snapshot_id", manifest.snapshotId())
          .set("added_data_files_count", manifest.addedFilesCount())
          .set("existing_data_files_count", manifest.existingFilesCount())
          .set("deleted_data_files_count", manifest.deletedFilesCount())
          .set("partition_summaries", Lists.transform(manifest.partitions(), partition ->
              summaryBuilder
                  .set("contains_null", false)
                  .set("lower_bound", "1")
                  .set("upper_bound", "1")
                  .build()
          ))
          .build()
  );

  Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size());
  TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0));
}
Example 12
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testSnapshotsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "snapshots_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table snapTable = loadTable(tableIdentifier, "snapshots");

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));
  Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis();
  long firstSnapshotId = table.currentSnapshot().snapshotId();
  String firstManifestList = table.currentSnapshot().manifestListLocation();

  table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();

  long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis();
  long secondSnapshotId = table.currentSnapshot().snapshotId();
  String secondManifestList = table.currentSnapshot().manifestListLocation();

  // rollback the table state to the first snapshot
  table.rollback().toSnapshotId(firstSnapshotId).commit();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "snapshots"))
      .collectAsList();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots"));
  List<GenericData.Record> expected = Lists.newArrayList(
      builder.set("committed_at", firstSnapshotTimestamp * 1000)
          .set("snapshot_id", firstSnapshotId)
          .set("parent_id", null)
          .set("operation", "append")
          .set("manifest_list", firstManifestList)
          .set("summary", ImmutableMap.of(
              "added-records", "1",
              "added-data-files", "1",
              "changed-partition-count", "1",
              "total-data-files", "1",
              "total-records", "1"
          ))
          .build(),
      builder.set("committed_at", secondSnapshotTimestamp * 1000)
          .set("snapshot_id", secondSnapshotId)
          .set("parent_id", firstSnapshotId)
          .set("operation", "delete")
          .set("manifest_list", secondManifestList)
          .set("summary", ImmutableMap.of(
              "deleted-records", "1",
              "deleted-data-files", "1",
              "changed-partition-count", "1",
              "total-records", "0",
              "total-data-files", "0"
          ))
          .build()
  );

  Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size());
  TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0));
  TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(1), actual.get(1));
}
Example 13
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testWriteProjectionWithMiddle() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Schema schema = new Schema(
      optional(1, "c1", Types.IntegerType.get()),
      optional(2, "c2", Types.StringType.get()),
      optional(3, "c3", Types.StringType.get())
  );
  Table table = tables.create(schema, spec, location.toString());

  List<ThreeColumnRecord> expected = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "hello"),
      new ThreeColumnRecord(2, null, "world"),
      new ThreeColumnRecord(3, null, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, ThreeColumnRecord.class);

  df.select("c1", "c3").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<ThreeColumnRecord> actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 14
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testFilesUnpartitionedTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_files_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "entries");
  Table filesTable = loadTable(tableIdentifier, "files");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedFiles());

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // delete the first file to test that only live files are listed
  table.newDelete().deleteFile(toDelete).commit();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "files"))
      .collectAsList();

  List<GenericData.Record> expected = Lists.newArrayList();
  for (ManifestFile manifest : table.currentSnapshot().dataManifests()) {
    InputFile in = table.io().newInputFile(manifest.path());
    try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
      for (GenericData.Record record : rows) {
        if ((Integer) record.get("status") < 2 /* added or existing */) {
          GenericData.Record file = (GenericData.Record) record.get("data_file");
          file.put(0, FileContent.DATA.id());
          expected.add(file);
        }
      }
    }
  }

  Assert.assertEquals("Files table should have one row", 1, expected.size());
  Assert.assertEquals("Actual results should have one row", 1, actual.size());
  TestHelpers.assertEqualsSafe(filesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
Example 15
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testEntriesTableWithSnapshotIdInheritance() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_inheritance_test");
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build();
  Table table = createTable(tableIdentifier, SCHEMA, spec);

  table.updateProperties()
      .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true")
      .commit();

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b")
  );

  Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
  inputDF.select("id", "data").write()
      .format("parquet")
      .mode("overwrite")
      .partitionBy("id")
      .saveAsTable("parquet_table");

  try {
    String stagingLocation = table.location() + "/metadata";
    SparkTableUtil.importSparkTable(
        spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation);

    List<Row> actual = spark.read()
        .format("iceberg")
        .load(loadLocation(tableIdentifier, "entries"))
        .select("sequence_number", "snapshot_id", "data_file")
        .collectAsList();

    table.refresh();

    long snapshotId = table.currentSnapshot().snapshotId();

    Assert.assertEquals("Entries table should have 2 rows", 2, actual.size());
    Assert.assertEquals("Sequence number must match", 0, actual.get(0).getLong(0));
    Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(0).getLong(1));
    Assert.assertEquals("Sequence number must match", 0, actual.get(1).getLong(0));
    Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(1).getLong(1));
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
Example 16
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testEntriesTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "entries");

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));

  Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "entries"))
      .collectAsList();

  Snapshot snapshot = table.currentSnapshot();

  Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests().size());

  InputFile manifest = table.io().newInputFile(snapshot.allManifests().get(0).path());
  List<GenericData.Record> expected = Lists.newArrayList();
  try (CloseableIterable<GenericData.Record> rows = Avro.read(manifest).project(entriesTable.schema()).build()) {
    // each row must inherit snapshot_id and sequence_number
    rows.forEach(row -> {
      row.put(2, 0L);
      GenericData.Record file = (GenericData.Record) row.get("data_file");
      file.put(0, FileContent.DATA.id());
      expected.add(row);
    });
  }

  Assert.assertEquals("Entries table should have one row", 1, expected.size());
  Assert.assertEquals("Actual results should have one row", 1, actual.size());
  TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
Example 17
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  table.updateProperties()
      .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger
      .commit();

  List<SimpleRecord> expected = Lists.newArrayListWithCapacity(4000);
  for (int i = 0; i < 4000; i++) {
    expected.add(new SimpleRecord(i, "a"));
  }

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);

  List<DataFile> files = Lists.newArrayList();
  for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
    for (DataFile file : ManifestFiles.read(manifest, table.io())) {
      files.add(file);
    }
  }
  // TODO: ORC does not support the target file size yet
  if (!format.equals(FileFormat.ORC)) {
    Assert.assertEquals("Should have 4 DataFiles", 4, files.size());
    Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000));
  }
}
Example 18
Source File: TestRewriteDataFilesAction.java From iceberg with Apache License 2.0

@Test
public void testRewriteDataFilesPartitionedTable() {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
      .identity("c1")
      .truncate("c2", 2)
      .build();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"),
      new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")
  );
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"),
      new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")
  );
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"),
      new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")
  );
  writeRecords(records4);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());

  Actions actions = Actions.forTable(table);

  RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
  Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 4 data files", 4, result.addedDataFiles().size());

  table.refresh();

  CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
  List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
  Assert.assertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Example 19
Source File: TestRewriteManifestsAction.java From iceberg with Apache License 2.0

@Test
public void testRewriteSmallManifestsPartitionedTable() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
      .identity("c1")
      .truncate("c2", 2)
      .build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"),
      new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")
  );
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"),
      new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")
  );
  writeRecords(records4);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 4 manifests before rewrite", 4, manifests.size());

  Actions actions = Actions.forTable(table);

  // we will expect to have 2 manifests with 4 entries in each after rewrite
  long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests);
  long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes);

  table.updateProperties()
      .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes))
      .commit();

  RewriteManifestsActionResult result = actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .execute();

  Assert.assertEquals("Action should rewrite 4 manifests", 4, result.deletedManifests().size());
  Assert.assertEquals("Action should add 2 manifests", 2, result.addedManifests().size());

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());

  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  Assert.assertEquals(4, (long) newManifests.get(1).existingFilesCount());
  Assert.assertFalse(newManifests.get(1).hasAddedFiles());
  Assert.assertFalse(newManifests.get(1).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Example 20
Source File: TestRewriteManifestsAction.java From iceberg with Apache License 2.0

@Test
public void testRewriteSmallManifestsNonPartitionedTable() throws IOException {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size());

  Actions actions = Actions.forTable(table);

  RewriteManifestsActionResult result = actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .execute();

  Assert.assertEquals("Action should rewrite 2 manifests", 2, result.deletedManifests().size());
  Assert.assertEquals("Action should add 1 manifest", 1, result.addedManifests().size());

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest after rewrite", 1, newManifests.size());

  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}