org.apache.iceberg.AppendFiles Java Examples
The following examples show how to use
org.apache.iceberg.AppendFiles.
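Before the full examples, here is a minimal sketch of the core call pattern, assuming an existing Table named table (the path, file size, and record count below are illustrative placeholders):

import org.apache.iceberg.AppendFiles;
import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.Table;

// Describe a data file that has already been written to storage.
DataFile dataFile = DataFiles.builder(table.spec())
    .withPath("/path/to/data-a.parquet")  // placeholder path
    .withFileSizeInBytes(1024L)           // placeholder size
    .withRecordCount(100L)                // placeholder count
    .build();

// Stage the file and commit; each commit() produces a new table snapshot.
AppendFiles append = table.newAppend();
append.appendFile(dataFile);
append.commit();

An AppendFiles can also be obtained from a Transaction via txn.newAppend(), as several of the examples below do, in which case the appended files become visible only when commitTransaction() is called.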
Example #1
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
Example #2
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testIdentityPartitionProjections() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(LOG_SCHEMA, IDENTITY_PARTITION_SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()), location.toString());
  List<Record> inputRecords = RandomGenericData.generate(LOG_SCHEMA, 10, 0);

  Integer idx = 0;
  AppendFiles append = table.newAppend();
  for (Record record : inputRecords) {
    record.set(1, "2020-03-2" + idx);
    record.set(2, idx.toString());
    append.appendFile(writeFile(
        table, Row.of("2020-03-2" + idx, idx.toString()), format, ImmutableList.of(record)));
    idx += 1;
  }
  append.commit();

  // individual fields
  validateIdentityPartitionProjections(location.toString(), withColumns("date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("id"), inputRecords);

  // field pairs
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "level"), inputRecords);

  // out-of-order pairs
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "date"), inputRecords);

  // full projection
  validateIdentityPartitionProjections(location.toString(), LOG_SCHEMA, inputRecords);

  // out-of-order triplets
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "level", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "date", "message"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("date", "message", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("level", "message", "date"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "date", "level"), inputRecords);
  validateIdentityPartitionProjections(location.toString(), withColumns("message", "level", "date"), inputRecords);
}
Example #3
Source File: IcebergMetadata.java From presto with Apache License 2.0
@Override
public Optional<ConnectorOutputMetadata> finishInsert(
    ConnectorSession session,
    ConnectorInsertTableHandle insertHandle,
    Collection<Slice> fragments,
    Collection<ComputedStatistics> computedStatistics)
{
    IcebergWritableTableHandle table = (IcebergWritableTableHandle) insertHandle;
    org.apache.iceberg.Table icebergTable = transaction.table();

    List<CommitTaskData> commitTasks = fragments.stream()
            .map(slice -> commitTaskCodec.fromJson(slice.getBytes()))
            .collect(toImmutableList());

    Type[] partitionColumnTypes = icebergTable.spec().fields().stream()
            .map(field -> field.transform().getResultType(
                    icebergTable.schema().findType(field.sourceId())))
            .toArray(Type[]::new);

    AppendFiles appendFiles = transaction.newFastAppend();

    for (CommitTaskData task : commitTasks) {
        HdfsContext context = new HdfsContext(session, table.getSchemaName(), table.getTableName());
        Configuration configuration = hdfsEnvironment.getConfiguration(context, new Path(task.getPath()));

        DataFiles.Builder builder = DataFiles.builder(icebergTable.spec())
                .withInputFile(HadoopInputFile.fromLocation(task.getPath(), configuration))
                .withFormat(table.getFileFormat())
                .withMetrics(task.getMetrics().metrics());

        if (!icebergTable.spec().fields().isEmpty()) {
            String partitionDataJson = task.getPartitionDataJson()
                    .orElseThrow(() -> new VerifyException("No partition data for partitioned table"));
            builder.withPartition(PartitionData.fromJson(partitionDataJson, partitionColumnTypes));
        }

        appendFiles.appendFile(builder.build());
    }

    appendFiles.commit();
    transaction.commitTransaction();

    return Optional.of(new HiveWrittenPartitions(commitTasks.stream()
            .map(CommitTaskData::getPath)
            .collect(toImmutableList())));
}
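Note the append mode: unlike most examples here, this one uses transaction.newFastAppend(). A fast append always writes a new manifest for the incoming files instead of merging them into existing manifests, which keeps commit latency low on the write path at the cost of more manifests to maintain; plain newAppend() applies the table's normal manifest-merging behavior. Both return the same AppendFiles interface, so the rest of the call pattern is unchanged.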
Example #4
Source File: TestIcebergPartitions.java From dremio-oss with Apache License 2.0
@Test
public void testNonIdentityPartitions() throws Exception {
  File root = tempDir.newFolder();
  HadoopTables tables = new HadoopTables(conf);
  PartitionSpec partitionSpec = PartitionSpec
      .builderFor(schema)
      .bucket(NAME, 2)
      .build();
  Table table = tables.create(schema, partitionSpec, root.getAbsolutePath());

  // Append some data files.
  Transaction transaction = table.newTransaction();
  AppendFiles appendFiles = transaction.newAppend();
  appendFiles.appendFile(createDataFile(root, "d1", 1, "jack", 100));
  appendFiles.appendFile(createDataFile(root, "d2", 1, "jack", 200));
  appendFiles.appendFile(createDataFile(root, "d3", 2, "jill", 300));
  appendFiles.appendFile(createDataFile(root, "d4", 2, "jill", 400));
  appendFiles.appendFile(createDataFile(root, "d5", 2, "jill", 500));
  appendFiles.commit();
  transaction.commitTransaction();

  try {
    IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(),
        HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
    fail("Expected error while reading metadata of iceberg table with non-identity partition field");
  } catch (Exception ex) {
    Assert.assertTrue("UserException expected", ex instanceof UserException);
    UserException uex = (UserException) ex;
    Assert.assertEquals("Invalid ErrorType. Expected "
        + UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION
        + " but got " + uex.getErrorType(),
        UserBitShared.DremioPBError.ErrorType.UNSUPPORTED_OPERATION, uex.getErrorType());
    String expectedErrorMsg = "Column values and partition values are not same for [name] column";
    Assert.assertTrue("Expected message to contain " + expectedErrorMsg + " but was "
        + uex.getOriginalMessage() + " instead",
        uex.getOriginalMessage().contains(expectedErrorMsg));
  }
}
Example #5
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
/**
 * Import files from given partitions to an Iceberg table.
 *
 * @param spark a Spark session
 * @param partitions partitions to import
 * @param targetTable an Iceberg table where to import the data
 * @param spec a partition spec
 * @param stagingDir a staging directory to store temporary manifest files
 */
public static void importSparkPartitions(
    SparkSession spark, List<SparkPartition> partitions, Table targetTable,
    PartitionSpec spec, String stagingDir) {
  Configuration conf = spark.sessionState().newHadoopConf();
  SerializableConfiguration serializableConf = new SerializableConfiguration(conf);
  int parallelism = Math.min(partitions.size(),
      spark.sessionState().conf().parallelPartitionDiscoveryParallelism());
  int numShufflePartitions = spark.sessionState().conf().numShufflePartitions();
  MetricsConfig metricsConfig = MetricsConfig.fromProperties(targetTable.properties());

  JavaSparkContext sparkContext = JavaSparkContext.fromSparkContext(spark.sparkContext());
  JavaRDD<SparkPartition> partitionRDD = sparkContext.parallelize(partitions, parallelism);

  Dataset<SparkPartition> partitionDS = spark.createDataset(
      partitionRDD.rdd(), Encoders.javaSerialization(SparkPartition.class));

  List<ManifestFile> manifests = partitionDS
      .flatMap((FlatMapFunction<SparkPartition, DataFile>) sparkPartition ->
              listPartition(sparkPartition, spec, serializableConf, metricsConfig).iterator(),
          Encoders.javaSerialization(DataFile.class))
      .repartition(numShufflePartitions)
      .map((MapFunction<DataFile, Tuple2<String, DataFile>>) file ->
              Tuple2.apply(file.path().toString(), file),
          Encoders.tuple(Encoders.STRING(), Encoders.javaSerialization(DataFile.class)))
      .orderBy(col("_1"))
      .mapPartitions(
          (MapPartitionsFunction<Tuple2<String, DataFile>, ManifestFile>) fileTuple ->
              buildManifest(serializableConf, spec, stagingDir, fileTuple),
          Encoders.javaSerialization(ManifestFile.class))
      .collectAsList();

  try {
    boolean snapshotIdInheritanceEnabled = PropertyUtil.propertyAsBoolean(
        targetTable.properties(),
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED,
        TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED_DEFAULT);

    AppendFiles append = targetTable.newAppend();
    manifests.forEach(append::appendManifest);
    append.commit();

    if (!snapshotIdInheritanceEnabled) {
      // delete original manifests as they were rewritten before the commit
      deleteManifests(targetTable.io(), manifests);
    }
  } catch (Throwable e) {
    deleteManifests(targetTable.io(), manifests);
    throw e;
  }
}
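The import path above appends staged manifest files with appendManifest rather than individual data files. A minimal sketch of that variant, assuming an existing Table named table and an already-written ManifestFile named manifest:

AppendFiles append = table.newAppend();
// One call adds every data file tracked by the manifest. Unless snapshot ID
// inheritance is enabled on the table, the commit rewrites the manifest,
// which is why the method above deletes the staged originals afterward.
append.appendManifest(manifest);
append.commit();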
Example #6
Source File: TestCreateTable.java From dremio-oss with Apache License 2.0
@Test
public void testDroppingOfMapTypeColumn() throws Exception {
  String table1 = "iceberg_map_test";
  try {
    File table1Folder = new File(getDfsTestTmpSchemaLocation(), table1);
    HadoopTables hadoopTables = new HadoopTables(new Configuration());

    Schema schema = new Schema(
        Types.NestedField.optional(1, "col1",
            Types.MapType.ofOptional(1, 2, Types.IntegerType.get(), Types.StringType.get())),
        Types.NestedField.optional(2, "col2", Types.IntegerType.get())
    );
    PartitionSpec spec = PartitionSpec
        .builderFor(schema)
        .build();
    Table table = hadoopTables.create(schema, spec, table1Folder.getPath());

    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    final String testWorkingPath = TestTools.getWorkingPath() + "/src/test/resources/iceberg/mapTest";
    final String parquetFile = "iceberg_map_test.parquet";
    File dataFile = new File(testWorkingPath, parquetFile);
    appendFiles.appendFile(
        DataFiles.builder(spec)
            .withInputFile(Files.localInput(dataFile))
            .withRecordCount(1)
            .withFormat(FileFormat.PARQUET)
            .build()
    );
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select * from dfs_test.iceberg_map_test")
        .unOrdered()
        .baselineColumns("col2")
        .baselineValues(1)
        .build()
        .run();

    // sleeps ensure the table mtime advances between operations
    Thread.sleep(1001);
    String insertCommandSql = "insert into dfs_test.iceberg_map_test select * from (values(2))";
    test(insertCommandSql);
    Thread.sleep(1001);

    testBuilder()
        .sqlQuery("select * from dfs_test.iceberg_map_test")
        .unOrdered()
        .baselineColumns("col2")
        .baselineValues(1)
        .baselineValues(2)
        .build()
        .run();
  } finally {
    FileUtils.deleteQuietly(new File(getDfsTestTmpSchemaLocation(), table1));
  }
}
Example #7
Source File: TestIcebergTableDrop.java From dremio-oss with Apache License 2.0
@Test
public void testDropTable() throws Exception {
  try (AutoCloseable c = enableIcebergTables()) {
    Path rootPath = Paths.get(getDfsTestTmpSchemaLocation(), "iceberg", "nation");
    Files.createDirectories(rootPath);
    String root = rootPath.toString();
    String tableName = "dfs_test.iceberg.nation";

    HadoopTables tables = new HadoopTables(conf);
    Table table = tables.create(schema, null, root);
    IcebergTableInfo tableInfo =
        new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs), conf, root)
            .getTableInfo();
    assertEquals(tableInfo.getRecordCount(), 0);

    // Append some data files.
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d1"));
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(25L)
        .build()
        .run();

    testBuilder()
        .sqlQuery("DROP TABLE " + tableName)
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Table [%s] dropped", tableName))
        .build()
        .run();

    errorMsgTestHelper(
        "select count(*) c from " + tableName,
        "Table '" + tableName + "' not found");
  }
}
Example #8
Source File: TestRefresh.java From dremio-oss with Apache License 2.0
@Test
public void testRefresh() throws Exception {
  try (AutoCloseable c = enableIcebergTables()) {
    Path rootPath = Paths.get(getDfsTestTmpSchemaLocation(), "iceberg", "metadata_refresh");
    Files.createDirectories(rootPath);
    String root = rootPath.toString();
    String tableName = "dfs_test.iceberg.metadata_refresh";

    HadoopTables tables = new HadoopTables(conf);
    Table table = tables.create(schema, null, root);
    IcebergTableInfo tableInfo =
        new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs), conf, root)
            .getTableInfo();
    assertEquals(tableInfo.getRecordCount(), 0);

    // Append some data files.
    Transaction transaction = table.newTransaction();
    AppendFiles appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d1"));
    appendFiles.commit();
    transaction.commitTransaction();

    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(25L)
        .build()
        .run();

    // to detect an mtime change.
    Thread.sleep(1000);

    // refresh without an update
    testBuilder()
        .sqlQuery("ALTER TABLE " + tableName + " REFRESH METADATA")
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(
            true,
            String.format(
                "Table '%s' read signature reviewed but source stated metadata is unchanged, no refresh occurred.",
                tableName))
        .build()
        .run();

    // Do another append
    transaction = table.newTransaction();
    appendFiles = transaction.newAppend();
    appendFiles.appendFile(createDataFile(rootPath.toFile(), "d2"));
    appendFiles.commit();
    transaction.commitTransaction();

    // refresh
    testBuilder()
        .sqlQuery("ALTER TABLE " + tableName + " REFRESH METADATA")
        .unOrdered()
        .baselineColumns("ok", "summary")
        .baselineValues(true, String.format("Metadata for table '%s' refreshed.", tableName))
        .build()
        .run();

    // validate increased row count
    testBuilder()
        .sqlQuery("select count(*) c from " + tableName)
        .unOrdered()
        .baselineColumns("c")
        .baselineValues(50L)
        .build()
        .run();
  }
}
Example #9
Source File: TestIcebergPartitions.java From dremio-oss with Apache License 2.0
@Test
public void testPartitions() throws Exception {
  File root = tempDir.newFolder();
  HadoopTables tables = new HadoopTables(conf);
  Table table = tables.create(schema, spec, root.getAbsolutePath());

  // test empty table.
  IcebergTableInfo tableInfo = new IcebergTableWrapper(getSabotContext(),
      HadoopFileSystem.get(fs), conf, root.getAbsolutePath()).getTableInfo();
  assertEquals(tableInfo.getRecordCount(), 0);
  List<String> expectedColumns = Arrays.asList(ID, NAME);
  assertEquals(expectedColumns, tableInfo.getPartitionColumns());
  assertEquals(0, ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()).size());

  // Append some data files.
  Transaction transaction = table.newTransaction();
  AppendFiles appendFiles = transaction.newAppend();
  appendFiles.appendFile(createDataFile(root, "d1", 1, "jack", 100));
  appendFiles.appendFile(createDataFile(root, "d2", 1, "jack", 200));
  appendFiles.appendFile(createDataFile(root, "d3", 2, "jill", 300));
  appendFiles.appendFile(createDataFile(root, "d4", 2, "jill", 400));
  appendFiles.appendFile(createDataFile(root, "d5", 2, "jill", 500));
  appendFiles.commit();
  transaction.commitTransaction();

  tableInfo = new IcebergTableWrapper(getSabotContext(), HadoopFileSystem.get(fs),
      conf, root.getAbsolutePath()).getTableInfo();
  assertEquals(1500, tableInfo.getRecordCount());
  assertEquals(2, ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()).size());

  // validate first partition
  final AtomicLong recordCount = new AtomicLong(0);
  PartitionChunk p1 = findPartition(
      ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()), 1, "jack");
  assertNotNull(p1);
  assertEquals(2, p1.getSplitCount());
  p1.getSplits().iterator().forEachRemaining(x -> recordCount.addAndGet(x.getRecordCount()));
  assertEquals(300, recordCount.intValue());

  // validate second partition
  PartitionChunk p2 = findPartition(
      ImmutableList.copyOf(tableInfo.getPartitionChunkListing().iterator()), 2, "jill");
  assertNotNull(p2);
  assertEquals(3, p2.getSplitCount());
  recordCount.set(0);
  p2.getSplits().iterator().forEachRemaining(x -> recordCount.addAndGet(x.getRecordCount()));
  assertEquals(1200, recordCount.intValue());
}