Java Code Examples for org.apache.iceberg.Table#refresh()
The following examples show how to use org.apache.iceberg.Table#refresh().
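Before the examples, here is a minimal sketch of why refresh() matters: a Table handle caches the metadata that was current when it was loaded, so after another process (or another handle on the same table) commits, the stale handle must be refreshed before it reflects the latest snapshot. The table location below is a placeholder for illustration only; it is not taken from any of the example files.

  // Minimal sketch (placeholder location; not from the examples below).
  HadoopTables tables = new HadoopTables(new Configuration());
  Table table = tables.load("/tmp/warehouse/db/sample_table");

  // ... some other writer commits new data or schema changes here ...

  // Until refresh() is called, this handle still sees the metadata it originally loaded.
  table.refresh();
  System.out.println("Current snapshot: " + table.currentSnapshot().snapshotId());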
Example 1
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0

@Test
public void testStaleMetadata() throws Exception {
  Table tableCopy = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  // prepare changes on the copy without committing
  UpdateSchema updateCopy = tableCopy.updateSchema()
      .addColumn("m", Types.IntegerType.get());
  updateCopy.apply();

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertNotEquals("Unmodified copy should be out of date after update",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  // update the table
  tableCopy.refresh();

  Assert.assertEquals("Copy should be back in sync",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  AssertHelpers.assertThrows("Should fail with stale base metadata",
      CommitFailedException.class, "based on stale table metadata", updateCopy::commit);

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
Example 2
Source File: TestDataFrameWrites.java From iceberg with Apache License 2.0

private void writeAndValidateWithLocations(Table table, File location, File expectedDataDir) throws IOException {
  Schema tableSchema = table.schema(); // use the table schema because ids are reassigned

  table.updateProperties().set(TableProperties.DEFAULT_FILE_FORMAT, format).commit();

  Iterable<Record> expected = RandomData.generate(tableSchema, 100, 0L);
  writeData(expected, tableSchema, location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<Row> actual = result.collectAsList();

  Iterator<Record> expectedIter = expected.iterator();
  Iterator<Row> actualIter = actual.iterator();
  while (expectedIter.hasNext() && actualIter.hasNext()) {
    assertEqualsSafe(tableSchema.asStruct(), expectedIter.next(), actualIter.next());
  }
  Assert.assertEquals("Both iterators should be exhausted", expectedIter.hasNext(), actualIter.hasNext());

  table.currentSnapshot().addedFiles().forEach(dataFile ->
      Assert.assertTrue(
          String.format(
              "File should have the parent directory %s, but has: %s.",
              expectedDataDir.getAbsolutePath(),
              dataFile.path()),
          URI.create(dataFile.path().toString()).getPath().startsWith(expectedDataDir.getAbsolutePath())));
}
Example 3
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0

@Test
public void testStaleVersionHint() throws Exception {
  Table stale = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  Assert.assertNotEquals("Stale table schema should not match",
      UPDATED_SCHEMA.asStruct(), stale.schema().asStruct());

  // roll the version hint back to 1
  replaceVersionHint(1);

  Table reloaded = TABLES.load(tableLocation);
  Assert.assertEquals("Updated schema for newly loaded table should match",
      UPDATED_SCHEMA.asStruct(), reloaded.schema().asStruct());

  stale.refresh();
  Assert.assertEquals("Refreshed schema for stale table should match",
      UPDATED_SCHEMA.asStruct(), stale.schema().asStruct());
}
Example 4
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 5
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private Table buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table table = TABLES.create(SCHEMA, spec, location.toString());

  // Do not combine or split files because the tests expect a split per partition.
  // A target split size of 2048 helps us achieve that.
  table.updateProperties().set("read.split.target-size", "2048").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(table.location());

  table.refresh();

  return table;
}
Example 6
Source File: TestHiveTableConcurrency.java From iceberg with Apache License 2.0

@Test
public synchronized void testConcurrentFastAppends() {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  String fileName = UUID.randomUUID().toString();
  DataFile file = DataFiles.builder(icebergTable.spec())
      .withPath(FileFormat.PARQUET.addExtension(fileName))
      .withRecordCount(2)
      .withFileSizeInBytes(0)
      .build();

  ExecutorService executorService = MoreExecutors.getExitingExecutorService(
      (ThreadPoolExecutor) Executors.newFixedThreadPool(2));

  AtomicInteger barrier = new AtomicInteger(0);
  Tasks.range(2)
      .stopOnFailure().throwFailureWhenFinished()
      .executeWith(executorService)
      .run(index -> {
        for (int numCommittedFiles = 0; numCommittedFiles < 10; numCommittedFiles++) {
          while (barrier.get() < numCommittedFiles * 2) {
            try {
              Thread.sleep(10);
            } catch (InterruptedException e) {
              throw new RuntimeException(e);
            }
          }

          icebergTable.newFastAppend().appendFile(file).commit();
          barrier.incrementAndGet();
        }
      });

  icebergTable.refresh();
  Assert.assertEquals(20, icebergTable.currentSnapshot().allManifests().size());
}
Example 7
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 8
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testAllEntriesTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "all_entries");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // delete the first file to test that not only live files are listed
  table.newDelete().deleteFromRowFilter(Expressions.equal("id", 1)).commit();

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // ensure table data isn't stale
  table.refresh();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "all_entries"))
      .orderBy("snapshot_id")
      .collectAsList();

  List<GenericData.Record> expected = Lists.newArrayList();
  for (ManifestFile manifest : Iterables.concat(Iterables.transform(table.snapshots(), Snapshot::allManifests))) {
    InputFile in = table.io().newInputFile(manifest.path());
    try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
      // each row must inherit snapshot_id and sequence_number
      rows.forEach(row -> {
        row.put(2, 0L);
        GenericData.Record file = (GenericData.Record) row.get("data_file");
        file.put(0, FileContent.DATA.id());
        expected.add(row);
      });
    }
  }

  expected.sort(Comparator.comparing(o -> (Long) o.get("snapshot_id")));

  Assert.assertEquals("Entries table should have 3 rows", 3, expected.size());
  Assert.assertEquals("Actual results should have 3 rows", 3, actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(i), actual.get(i));
  }
}
Example 9
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testPartitionsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "partitions_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table partitionsTable = loadTable(tableIdentifier, "partitions");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  long firstCommitId = table.currentSnapshot().snapshotId();

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "partitions"))
      .orderBy("partition.id")
      .collectAsList();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      partitionsTable.schema(), "partitions"));
  GenericRecordBuilder partitionBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      partitionsTable.schema().findType("partition").asStructType(), "partition"));
  List<GenericData.Record> expected = Lists.newArrayList();
  expected.add(builder
      .set("partition", partitionBuilder.set("id", 1).build())
      .set("record_count", 1L)
      .set("file_count", 1)
      .build());
  expected.add(builder
      .set("partition", partitionBuilder.set("id", 2).build())
      .set("record_count", 1L)
      .set("file_count", 1)
      .build());

  Assert.assertEquals("Partitions table should have two rows", 2, expected.size());
  Assert.assertEquals("Actual results should have two rows", 2, actual.size());
  for (int i = 0; i < 2; i += 1) {
    TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(i), actual.get(i));
  }

  // check time travel
  List<Row> actualAfterFirstCommit = spark.read()
      .format("iceberg")
      .option("snapshot-id", String.valueOf(firstCommitId))
      .load(loadLocation(tableIdentifier, "partitions"))
      .orderBy("partition.id")
      .collectAsList();

  Assert.assertEquals("Actual results should have one row", 1, actualAfterFirstCommit.size());
  TestHelpers.assertEqualsSafe(partitionsTable.schema().asStruct(), expected.get(0), actualAfterFirstCommit.get(0));
}
Example 10
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testAllManifestsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table manifestTable = loadTable(tableIdentifier, "all_manifests");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);

  List<ManifestFile> manifests = Lists.newArrayList();

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  manifests.addAll(table.currentSnapshot().allManifests());

  table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();

  manifests.addAll(table.currentSnapshot().allManifests());

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "all_manifests"))
      .orderBy("path")
      .collectAsList();

  table.refresh();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema(), "manifests"));
  GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary"));
  List<GenericData.Record> expected = Lists.newArrayList(Iterables.transform(manifests, manifest ->
      builder.set("path", manifest.path())
          .set("length", manifest.length())
          .set("partition_spec_id", manifest.partitionSpecId())
          .set("added_snapshot_id", manifest.snapshotId())
          .set("added_data_files_count", manifest.addedFilesCount())
          .set("existing_data_files_count", manifest.existingFilesCount())
          .set("deleted_data_files_count", manifest.deletedFilesCount())
          .set("partition_summaries", Lists.transform(manifest.partitions(), partition ->
              summaryBuilder
                  .set("contains_null", false)
                  .set("lower_bound", "1")
                  .set("upper_bound", "1")
                  .build()
          ))
          .build()
  ));

  expected.sort(Comparator.comparing(o -> o.get("path").toString()));

  Assert.assertEquals("Manifests table should have two manifest rows", 2, actual.size());
  for (int i = 0; i < expected.size(); i += 1) {
    TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(i), actual.get(i));
  }
}
Example 11
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testManifestsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "manifests_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.builderFor(SCHEMA).identity("id").build());
  Table manifestTable = loadTable(tableIdentifier, "manifests");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "manifests"))
      .collectAsList();

  table.refresh();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema(), "manifests"));
  GenericRecordBuilder summaryBuilder = new GenericRecordBuilder(AvroSchemaUtil.convert(
      manifestTable.schema().findType("partition_summaries.element").asStructType(), "partition_summary"));
  List<GenericData.Record> expected = Lists.transform(table.currentSnapshot().allManifests(), manifest ->
      builder.set("path", manifest.path())
          .set("length", manifest.length())
          .set("partition_spec_id", manifest.partitionSpecId())
          .set("added_snapshot_id", manifest.snapshotId())
          .set("added_data_files_count", manifest.addedFilesCount())
          .set("existing_data_files_count", manifest.existingFilesCount())
          .set("deleted_data_files_count", manifest.deletedFilesCount())
          .set("partition_summaries", Lists.transform(manifest.partitions(), partition ->
              summaryBuilder
                  .set("contains_null", false)
                  .set("lower_bound", "1")
                  .set("upper_bound", "1")
                  .build()
          ))
          .build()
  );

  Assert.assertEquals("Manifests table should have one manifest row", 1, actual.size());
  TestHelpers.assertEqualsSafe(manifestTable.schema().asStruct(), expected.get(0), actual.get(0));
}
Example 12
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testSnapshotsTable() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "snapshots_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table snapTable = loadTable(tableIdentifier, "snapshots");

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));
  Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  long firstSnapshotTimestamp = table.currentSnapshot().timestampMillis();
  long firstSnapshotId = table.currentSnapshot().snapshotId();
  String firstManifestList = table.currentSnapshot().manifestListLocation();

  table.newDelete().deleteFromRowFilter(Expressions.alwaysTrue()).commit();

  long secondSnapshotTimestamp = table.currentSnapshot().timestampMillis();
  long secondSnapshotId = table.currentSnapshot().snapshotId();
  String secondManifestList = table.currentSnapshot().manifestListLocation();

  // rollback the table state to the first snapshot
  table.rollback().toSnapshotId(firstSnapshotId).commit();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "snapshots"))
      .collectAsList();

  GenericRecordBuilder builder = new GenericRecordBuilder(AvroSchemaUtil.convert(snapTable.schema(), "snapshots"));
  List<GenericData.Record> expected = Lists.newArrayList(
      builder.set("committed_at", firstSnapshotTimestamp * 1000)
          .set("snapshot_id", firstSnapshotId)
          .set("parent_id", null)
          .set("operation", "append")
          .set("manifest_list", firstManifestList)
          .set("summary", ImmutableMap.of(
              "added-records", "1",
              "added-data-files", "1",
              "changed-partition-count", "1",
              "total-data-files", "1",
              "total-records", "1"
          ))
          .build(),
      builder.set("committed_at", secondSnapshotTimestamp * 1000)
          .set("snapshot_id", secondSnapshotId)
          .set("parent_id", firstSnapshotId)
          .set("operation", "delete")
          .set("manifest_list", secondManifestList)
          .set("summary", ImmutableMap.of(
              "deleted-records", "1",
              "deleted-data-files", "1",
              "changed-partition-count", "1",
              "total-records", "0",
              "total-data-files", "0"
          ))
          .build()
  );

  Assert.assertEquals("Snapshots table should have a row for each snapshot", 2, actual.size());
  TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(0), actual.get(0));
  TestHelpers.assertEqualsSafe(snapTable.schema().asStruct(), expected.get(1), actual.get(1));
}
Example 13
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testWriteProjectionWithMiddle() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Schema schema = new Schema(
      optional(1, "c1", Types.IntegerType.get()),
      optional(2, "c2", Types.StringType.get()),
      optional(3, "c3", Types.StringType.get())
  );
  Table table = tables.create(schema, spec, location.toString());

  List<ThreeColumnRecord> expected = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "hello"),
      new ThreeColumnRecord(2, null, "world"),
      new ThreeColumnRecord(3, null, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, ThreeColumnRecord.class);

  df.select("c1", "c3").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<ThreeColumnRecord> actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 14
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testFilesUnpartitionedTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "unpartitioned_files_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "entries");
  Table filesTable = loadTable(tableIdentifier, "files");
  Dataset<Row> df1 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(1, "a")), SimpleRecord.class);
  Dataset<Row> df2 = spark.createDataFrame(Lists.newArrayList(new SimpleRecord(2, "b")), SimpleRecord.class);

  df1.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();
  DataFile toDelete = Iterables.getOnlyElement(table.currentSnapshot().addedFiles());

  // add a second file
  df2.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  // delete the first file to test that only live files are listed
  table.newDelete().deleteFile(toDelete).commit();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "files"))
      .collectAsList();

  List<GenericData.Record> expected = Lists.newArrayList();
  for (ManifestFile manifest : table.currentSnapshot().dataManifests()) {
    InputFile in = table.io().newInputFile(manifest.path());
    try (CloseableIterable<GenericData.Record> rows = Avro.read(in).project(entriesTable.schema()).build()) {
      for (GenericData.Record record : rows) {
        if ((Integer) record.get("status") < 2 /* added or existing */) {
          GenericData.Record file = (GenericData.Record) record.get("data_file");
          file.put(0, FileContent.DATA.id());
          expected.add(file);
        }
      }
    }
  }

  Assert.assertEquals("Files table should have one row", 1, expected.size());
  Assert.assertEquals("Actual results should have one row", 1, actual.size());
  TestHelpers.assertEqualsSafe(filesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
Example 15
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testEntriesTableWithSnapshotIdInheritance() {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_inheritance_test");
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA).identity("id").build();
  Table table = createTable(tableIdentifier, SCHEMA, spec);

  table.updateProperties()
      .set(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, "true")
      .commit();

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b")
  );

  Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
  inputDF.select("id", "data").write()
      .format("parquet")
      .mode("overwrite")
      .partitionBy("id")
      .saveAsTable("parquet_table");

  try {
    String stagingLocation = table.location() + "/metadata";
    SparkTableUtil.importSparkTable(
        spark, new org.apache.spark.sql.catalyst.TableIdentifier("parquet_table"), table, stagingLocation);

    List<Row> actual = spark.read()
        .format("iceberg")
        .load(loadLocation(tableIdentifier, "entries"))
        .select("sequence_number", "snapshot_id", "data_file")
        .collectAsList();

    table.refresh();

    long snapshotId = table.currentSnapshot().snapshotId();

    Assert.assertEquals("Entries table should have 2 rows", 2, actual.size());
    Assert.assertEquals("Sequence number must match", 0, actual.get(0).getLong(0));
    Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(0).getLong(1));
    Assert.assertEquals("Sequence number must match", 0, actual.get(1).getLong(0));
    Assert.assertEquals("Snapshot id must match", snapshotId, actual.get(1).getLong(1));
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
Example 16
Source File: TestIcebergSourceTablesBase.java From iceberg with Apache License 2.0

@Test
public void testEntriesTable() throws Exception {
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "entries_test");
  Table table = createTable(tableIdentifier, SCHEMA, PartitionSpec.unpartitioned());
  Table entriesTable = loadTable(tableIdentifier, "entries");

  List<SimpleRecord> records = Lists.newArrayList(new SimpleRecord(1, "1"));

  Dataset<Row> inputDf = spark.createDataFrame(records, SimpleRecord.class);
  inputDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(loadLocation(tableIdentifier));

  table.refresh();

  List<Row> actual = spark.read()
      .format("iceberg")
      .load(loadLocation(tableIdentifier, "entries"))
      .collectAsList();

  Snapshot snapshot = table.currentSnapshot();

  Assert.assertEquals("Should only contain one manifest", 1, snapshot.allManifests().size());

  InputFile manifest = table.io().newInputFile(snapshot.allManifests().get(0).path());
  List<GenericData.Record> expected = Lists.newArrayList();
  try (CloseableIterable<GenericData.Record> rows = Avro.read(manifest).project(entriesTable.schema()).build()) {
    // each row must inherit snapshot_id and sequence_number
    rows.forEach(row -> {
      row.put(2, 0L);
      GenericData.Record file = (GenericData.Record) row.get("data_file");
      file.put(0, FileContent.DATA.id());
      expected.add(row);
    });
  }

  Assert.assertEquals("Entries table should have one row", 1, expected.size());
  Assert.assertEquals("Actual results should have one row", 1, actual.size());
  TestHelpers.assertEqualsSafe(entriesTable.schema().asStruct(), expected.get(0), actual.get(0));
}
Example 17
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedCreateWithTargetFileSizeViaTableProperties() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  table.updateProperties()
      .set(TableProperties.WRITE_TARGET_FILE_SIZE_BYTES, "4") // ~4 bytes; low enough to trigger
      .commit();

  List<SimpleRecord> expected = Lists.newArrayListWithCapacity(4000);
  for (int i = 0; i < 4000; i++) {
    expected.add(new SimpleRecord(i, "a"));
  }

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();

  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);

  List<DataFile> files = Lists.newArrayList();
  for (ManifestFile manifest : table.currentSnapshot().allManifests()) {
    for (DataFile file : ManifestFiles.read(manifest, table.io())) {
      files.add(file);
    }
  }
  // TODO: ORC does not support the target file size yet
  if (!format.equals(FileFormat.ORC)) {
    Assert.assertEquals("Should have 4 DataFiles", 4, files.size());
    Assert.assertTrue("All DataFiles contain 1000 rows", files.stream().allMatch(d -> d.recordCount() == 1000));
  }
}
Example 18
Source File: TestRewriteDataFilesAction.java From iceberg with Apache License 2.0

@Test
public void testRewriteDataFilesPartitionedTable() {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
      .identity("c1")
      .truncate("c2", 2)
      .build();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA"),
      new ThreeColumnRecord(1, "AAAAAAAAAA", "CCCC")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "DDDD")
  );
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(2, "AAAAAAAAAA", "EEEE"),
      new ThreeColumnRecord(2, "AAAAAAAAAA", "GGGG")
  );
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(2, "BBBBBBBBBB", "FFFF"),
      new ThreeColumnRecord(2, "BBBBBBBBBB", "HHHH")
  );
  writeRecords(records4);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 8 data files before rewrite", 8, dataFiles.size());

  Actions actions = Actions.forTable(table);

  RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
  Assert.assertEquals("Action should rewrite 8 data files", 8, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 4 data files", 4, result.addedDataFiles().size());

  table.refresh();

  CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
  List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
  Assert.assertEquals("Should have 4 data files after rewrite", 4, dataFiles1.size());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2", "c3")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Example 19
Source File: TestRewriteManifestsAction.java From iceberg with Apache License 2.0

@Test
public void testRewriteSmallManifestsPartitionedTable() throws IOException {
  PartitionSpec spec = PartitionSpec.builderFor(SCHEMA)
      .identity("c1")
      .truncate("c2", 2)
      .build();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  List<ThreeColumnRecord> records3 = Lists.newArrayList(
      new ThreeColumnRecord(3, "EEEEEEEEEE", "EEEE"),
      new ThreeColumnRecord(3, "FFFFFFFFFF", "FFFF")
  );
  writeRecords(records3);

  List<ThreeColumnRecord> records4 = Lists.newArrayList(
      new ThreeColumnRecord(4, "GGGGGGGGGG", "GGGG"),
      new ThreeColumnRecord(4, "HHHHHHHHHG", "HHHH")
  );
  writeRecords(records4);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 4 manifests before rewrite", 4, manifests.size());

  Actions actions = Actions.forTable(table);

  // we will expect to have 2 manifests with 4 entries in each after rewrite
  long manifestEntrySizeBytes = computeManifestEntrySizeBytes(manifests);
  long targetManifestSizeBytes = (long) (1.05 * 4 * manifestEntrySizeBytes);

  table.updateProperties()
      .set(TableProperties.MANIFEST_TARGET_SIZE_BYTES, String.valueOf(targetManifestSizeBytes))
      .commit();

  RewriteManifestsActionResult result = actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .execute();

  Assert.assertEquals("Action should rewrite 4 manifests", 4, result.deletedManifests().size());
  Assert.assertEquals("Action should add 2 manifests", 2, result.addedManifests().size());

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests after rewrite", 2, newManifests.size());

  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  Assert.assertEquals(4, (long) newManifests.get(1).existingFilesCount());
  Assert.assertFalse(newManifests.get(1).hasAddedFiles());
  Assert.assertFalse(newManifests.get(1).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);
  expectedRecords.addAll(records3);
  expectedRecords.addAll(records4);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Example 20
Source File: TestRewriteManifestsAction.java From iceberg with Apache License 2.0

@Test
public void testRewriteSmallManifestsNonPartitionedTable() throws IOException {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 2 manifests before rewrite", 2, manifests.size());

  Actions actions = Actions.forTable(table);

  RewriteManifestsActionResult result = actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .execute();

  Assert.assertEquals("Action should rewrite 2 manifests", 2, result.deletedManifests().size());
  Assert.assertEquals("Action should add 1 manifest", 1, result.addedManifests().size());

  table.refresh();

  List<ManifestFile> newManifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest after rewrite", 1, newManifests.size());

  Assert.assertEquals(4, (long) newManifests.get(0).existingFilesCount());
  Assert.assertFalse(newManifests.get(0).hasAddedFiles());
  Assert.assertFalse(newManifests.get(0).hasDeletedFiles());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}