org.apache.iceberg.Table Java Examples
The following examples show how to use org.apache.iceberg.Table.
Each example notes the original open-source project, source file, and license above the code.
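For orientation before the individual examples, here is a minimal sketch of the pattern most of them build on: loading a Table through HadoopTables and reading its metadata. The warehouse path is a placeholder, and the snippet assumes an Iceberg table already exists at that location.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;

public class LoadTableSketch {
  public static void main(String[] args) {
    // Placeholder location; point this at an existing Iceberg table directory.
    String tableLocation = "file:///tmp/warehouse/my_table";
    HadoopTables tables = new HadoopTables(new Configuration());
    Table table = tables.load(tableLocation);

    // Table exposes the metadata used throughout the examples below:
    // schema, partition spec, and the current snapshot.
    System.out.println(table.schema());
    System.out.println(table.spec());
    System.out.println(table.currentSnapshot());
  }
}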
Example #1
Source File: TestSnapshotSelection.java From iceberg with Apache License 2.0
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, tableLocation);

  List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
  firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  long timestamp = System.currentTimeMillis();
  long snapshotId = table.currentSnapshot().snapshotId();
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", snapshotId)
      .option("as-of-timestamp", timestamp)
      .load(tableLocation);

  df.collectAsList();
}
Example #2
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0
@Test
public void testStaleMetadata() throws Exception {
  Table tableCopy = TABLES.load(tableLocation);

  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  // prepare changes on the copy without committing
  UpdateSchema updateCopy = tableCopy.updateSchema()
      .addColumn("m", Types.IntegerType.get());
  updateCopy.apply();

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertNotEquals("Unmodified copy should be out of date after update",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  // update the table
  tableCopy.refresh();

  Assert.assertEquals("Copy should be back in sync",
      table.schema().asStruct(), tableCopy.schema().asStruct());

  AssertHelpers.assertThrows("Should fail with stale base metadata",
      CommitFailedException.class, "based on stale table metadata", updateCopy::commit);

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
Example #3
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0
RewriteManifestsAction(SparkSession spark, Table table) {
  this.spark = spark;
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.manifestEncoder = Encoders.javaSerialization(ManifestFile.class);
  this.table = table;
  this.spec = table.spec();
  this.targetManifestSizeBytes = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.MANIFEST_TARGET_SIZE_BYTES,
      TableProperties.MANIFEST_TARGET_SIZE_BYTES_DEFAULT);
  this.fileIO = SparkUtil.serializableFileIO(table);

  // default the staging location to the metadata location
  TableOperations ops = ((HasTableOperations) table).operations();
  Path metadataFilePath = new Path(ops.metadataFileLocation("file"));
  this.stagingLocation = metadataFilePath.getParent().toString();

  // use the current table format version for new manifests
  this.formatVersion = ops.current().formatVersion();
}
Example #4
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
private File buildPartitionedTable(String desc, PartitionSpec spec, String udf, String partitionColumn) {
  File location = new File(parent, desc);
  Table byId = TABLES.create(SCHEMA, spec, location.toString());

  // Do not combine or split files because the tests expect a split per partition.
  // A target split size of 2048 helps us achieve that.
  byId.updateProperties().set("read.split.target-size", "2048").commit();

  // copy the unpartitioned table into the partitioned table to produce the partitioned data
  Dataset<Row> allRows = spark.read()
      .format("iceberg")
      .load(unpartitioned.toString());

  allRows
      .coalesce(1) // ensure only 1 file per partition is written
      .withColumn("part", callUDF(udf, column(partitionColumn)))
      .sortWithinPartitions("part")
      .drop("part")
      .write()
      .format("iceberg")
      .mode("append")
      .save(byId.location());

  return location;
}
Example #5
Source File: IcebergSplitManager.java From presto with Apache License 2.0
@Override
public ConnectorSplitSource getSplits(ConnectorTransactionHandle transaction, ConnectorSession session,
    ConnectorTableHandle handle, SplitSchedulingStrategy splitSchedulingStrategy) {
  IcebergTableHandle table = (IcebergTableHandle) handle;
  HiveMetastore metastore = transactionManager.get(transaction).getMetastore();
  Table icebergTable = getIcebergTable(metastore, hdfsEnvironment, session, table.getSchemaTableName());

  TableScan tableScan = getTableScan(session, table.getPredicate(), table.getSnapshotId(), icebergTable);

  // TODO Use residual. Right now there is no way to propagate residual to presto but at least we can
  //  propagate it at split level so the parquet pushdown can leverage it.
  IcebergSplitSource splitSource = new IcebergSplitSource(tableScan.planTasks());

  return new ClassLoaderSafeConnectorSplitSource(splitSource, Thread.currentThread().getContextClassLoader());
}
Example #6
Source File: RowDataRewriter.java From iceberg with Apache License 2.0
public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
    Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;

  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
Example #7
Source File: SnapshotsTable.java From presto with Apache License 2.0
private static List<Page> buildPages(ConnectorTableMetadata tableMetadata, ConnectorSession session, Table icebergTable) {
  PageListBuilder pagesBuilder = PageListBuilder.forTable(tableMetadata);
  TimeZoneKey timeZoneKey = session.getTimeZoneKey();

  icebergTable.snapshots().forEach(snapshot -> {
    pagesBuilder.beginRow();
    pagesBuilder.appendTimestamp(packDateTimeWithZone(snapshot.timestampMillis(), timeZoneKey));
    pagesBuilder.appendBigint(snapshot.snapshotId());
    if (checkNonNull(snapshot.parentId(), pagesBuilder)) {
      pagesBuilder.appendBigint(snapshot.parentId());
    }
    if (checkNonNull(snapshot.operation(), pagesBuilder)) {
      pagesBuilder.appendVarchar(snapshot.operation());
    }
    if (checkNonNull(snapshot.manifestListLocation(), pagesBuilder)) {
      pagesBuilder.appendVarchar(snapshot.manifestListLocation());
    }
    if (checkNonNull(snapshot.summary(), pagesBuilder)) {
      pagesBuilder.appendVarcharVarcharMap(snapshot.summary());
    }
    pagesBuilder.endRow();
  });

  return pagesBuilder.build();
}
Example #8
Source File: HadoopTables.java From iceberg with Apache License 2.0
/**
 * Loads the table location from a FileSystem path location.
 *
 * @param location a path URI (e.g. hdfs:///warehouse/my_table/)
 * @return table implementation
 */
@Override
public Table load(String location) {
  TableOperations ops = newTableOps(location);
  if (ops.current() == null) {
    // try to resolve a metadata table, which we encode as URI fragments
    // e.g. hdfs:///warehouse/my_table#snapshots
    int hashIndex = location.lastIndexOf('#');
    if (hashIndex != -1 && location.length() - 1 != hashIndex) {
      // we found char '#', and it is not the last char of location
      String baseTable = location.substring(0, hashIndex);
      String metaTable = location.substring(hashIndex + 1);
      MetadataTableType type = MetadataTableType.from(metaTable);
      if (type != null) {
        return loadMetadataTable(baseTable, type);
      } else {
        throw new NoSuchTableException("Table does not exist at location: " + location);
      }
    } else {
      throw new NoSuchTableException("Table does not exist at location: " + location);
    }
  }

  return new BaseTable(ops, location);
}
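As a usage note for the load method above, the URI-fragment form described in its Javadoc can be exercised as in the following sketch; the path is a placeholder and assumes a table already exists there.

// Load the "snapshots" metadata table for an existing table by appending a URI fragment.
// The base path below is a placeholder.
HadoopTables tables = new HadoopTables(new Configuration());
Table snapshotsTable = tables.load("hdfs:///warehouse/my_table#snapshots");
System.out.println(snapshotsTable.schema());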
Example #9
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
Example #10
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testReplaceTableTxnTableModifiedConcurrently() {
  Table table = catalog.createTable(TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());
  Assert.assertTrue("Table should exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newReplaceTableTransaction(TABLE_IDENTIFIER, SCHEMA, SPEC, false);

  // update the table concurrently
  table.updateProperties()
      .set("another-prop", "another-value")
      .commit();

  txn.updateProperties()
      .set("prop", "value")
      .commit();
  txn.commitTransaction();

  // the replace should still succeed
  table = catalog.loadTable(TABLE_IDENTIFIER);
  Assert.assertNull("Table props should be updated", table.properties().get("another-prop"));
  Assert.assertEquals("Table props should match", "value", table.properties().get("prop"));
}
Example #11
Source File: IcebergSourceFlatDataBenchmark.java From iceberg with Apache License 2.0
@Override
protected final Table initTable() {
  Schema schema = new Schema(
      required(1, "longCol", Types.LongType.get()),
      required(2, "intCol", Types.IntegerType.get()),
      required(3, "floatCol", Types.FloatType.get()),
      optional(4, "doubleCol", Types.DoubleType.get()),
      optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
      optional(6, "dateCol", Types.DateType.get()),
      optional(7, "timestampCol", Types.TimestampType.withZone()),
      optional(8, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
Example #12
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testSnapshotReads() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  table.newAppend()
      .appendFile(writeFile(table, null, format, expectedRecords))
      .commit();
  long snapshotId = table.currentSnapshot().snapshotId();
  table.newAppend()
      .appendFile(writeFile(table, null, format, RandomGenericData.generate(table.schema(), 1, 0L)))
      .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .snapshotId(snapshotId);
  validate(job, expectedRecords);
}
Example #13
Source File: TestHadoopCatalog.java From iceberg with Apache License 2.0
@Test
public void testCreateAndDropTableWithoutNamespace() throws Exception {
  Configuration conf = new Configuration();
  String warehousePath = temp.newFolder().getAbsolutePath();
  HadoopCatalog catalog = new HadoopCatalog(conf, warehousePath);

  TableIdentifier testTable = TableIdentifier.of("tbl");
  Table table = catalog.createTable(testTable, SCHEMA, PartitionSpec.unpartitioned());

  Assert.assertEquals(table.schema().toString(), TABLE_SCHEMA.toString());
  Assert.assertEquals("hadoop.tbl", table.toString());

  String metaLocation = catalog.defaultWarehouseLocation(testTable);
  FileSystem fs = Util.getFs(new Path(metaLocation), conf);
  Assert.assertTrue(fs.isDirectory(new Path(metaLocation)));

  catalog.dropTable(testTable);
  Assert.assertFalse(fs.isDirectory(new Path(metaLocation)));
}
Example #14
Source File: IcebergSource.java From iceberg with Apache License 2.0
@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
    DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
Example #15
Source File: IcebergStorage.java From iceberg with Apache License 2.0
private Table load(String location, Job job) throws IOException {
  if (iceberg == null) {
    Class<?> tablesImpl = job.getConfiguration().getClass(PIG_ICEBERG_TABLES_IMPL, HadoopTables.class);
    LOG.info("Initializing iceberg tables implementation: {}", tablesImpl);
    iceberg = (Tables) ReflectionUtils.newInstance(tablesImpl, job.getConfiguration());
  }

  Table result = tables.get(location);

  if (result == null) {
    try {
      LOG.info("[{}]: Loading table for location: {}", signature, location);
      result = iceberg.load(location);
      tables.put(location, result);
    } catch (Exception e) {
      throw new FrontendException("Failed to instantiate tables implementation", e);
    }
  }

  return result;
}
Example #16
Source File: HiveTableTest.java From iceberg with Apache License 2.0
@Test
public void testExistingTableUpdate() throws TException {
  Table icebergTable = catalog.loadTable(TABLE_IDENTIFIER);
  // add a column
  icebergTable.updateSchema().addColumn("data", Types.LongType.get()).commit();

  icebergTable = catalog.loadTable(TABLE_IDENTIFIER);

  // Only 2 snapshotFile Should exist and no manifests should exist
  Assert.assertEquals(2, metadataVersionFiles(TABLE_NAME).size());
  Assert.assertEquals(0, manifestFiles(TABLE_NAME).size());
  Assert.assertEquals(altered.asStruct(), icebergTable.schema().asStruct());

  final org.apache.hadoop.hive.metastore.api.Table table = metastoreClient.getTable(DB_NAME, TABLE_NAME);
  final List<String> hiveColumns = table.getSd().getCols().stream()
      .map(FieldSchema::getName)
      .collect(Collectors.toList());
  final List<String> icebergColumns = altered.columns().stream()
      .map(Types.NestedField::name)
      .collect(Collectors.toList());
  Assert.assertEquals(icebergColumns, hiveColumns);
}
Example #17
Source File: TestIcebergCTASWithPartition.java From dremio-oss with Apache License 2.0
private void verifyPartitionValue(String tableFolder, Class expectedClass, Object expectedValue) {
  Table table = new HadoopTables(new Configuration()).load(tableFolder);
  for (FileScanTask fileScanTask : table.newScan().planFiles()) {
    StructLike structLike = fileScanTask.file().partition();
    Assert.assertEquals(structLike.get(0, expectedClass), expectedValue);
  }
}
Example #18
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testPartitionedByIdStartsWith() {
  Table table = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of(
      "path", table.location()));

  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, new StringStartsWith("data", "junc"));
  Batch scan = builder.build().toBatch();

  Assert.assertEquals(1, scan.planInputPartitions().length);
}
Example #19
Source File: SparkTable.java From iceberg with Apache License 2.0
public SparkTable(Table icebergTable, StructType requestedSchema) {
  this.icebergTable = icebergTable;
  this.requestedSchema = requestedSchema;

  if (requestedSchema != null) {
    // convert the requested schema to throw an exception if any requested fields are unknown
    SparkSchemaUtil.convert(icebergTable.schema(), requestedSchema);
  }
}
Example #20
Source File: SchemaEvolutionTest.java From iceberg with Apache License 2.0
@Test
public void widenDecimalPrecision() throws IOException {
  // Set up a new table to test this conversion
  Schema schema = new Schema(optional(1, "decimal", Types.DecimalType.of(2, 2)));
  File location = Files.createTempDirectory("temp").toFile();
  HadoopTables tables = new HadoopTables(spark.sparkContext().hadoopConfiguration());
  Table decimalTable = tables.create(schema, location.toString());

  decimalTable.updateSchema().updateColumn("decimal", Types.DecimalType.of(4, 2)).commit();

  log.info("Widen decimal type:\n" + decimalTable.schema().toString());
}
Example #21
Source File: TestRemoveOrphanFilesAction.java From iceberg with Apache License 2.0
@Test
public void testOlderThanTimestamp() throws InterruptedException {
  Table table = TABLES.create(SCHEMA, SPEC, Maps.newHashMap(), tableLocation);

  List<ThreeColumnRecord> records = Lists.newArrayList(
      new ThreeColumnRecord(1, "AAAAAAAAAA", "AAAA")
  );
  Dataset<Row> df = spark.createDataFrame(records, ThreeColumnRecord.class).coalesce(1);

  df.select("c1", "c2", "c3")
      .write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");
  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  Thread.sleep(1000);

  long timestamp = System.currentTimeMillis();

  Thread.sleep(1000);

  df.write().mode("append").parquet(tableLocation + "/data/c2_trunc=AA/c3=AAAA");

  Actions actions = Actions.forTable(table);

  List<String> result = actions.removeOrphanFiles()
      .olderThan(timestamp)
      .execute();

  Assert.assertEquals("Should delete only 2 files", 2, result.size());
}
Example #22
Source File: TestRewriteDataFilesAction.java From iceberg with Apache License 2.0
@Test
public void testRewriteDataFilesEmptyTable() {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  Assert.assertNull("Table must be empty", table.currentSnapshot());

  Actions actions = Actions.forTable(table);
  actions.rewriteDataFiles().execute();

  Assert.assertNull("Table must stay empty", table.currentSnapshot());
}
Example #23
Source File: TestSparkTableUtilWithInMemoryCatalog.java From iceberg with Apache License 2.0
@Test
public void testImportPartitions() throws IOException {
  Table table = TABLES.create(SCHEMA, SPEC, tableLocation);

  List<SimpleRecord> records = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  File parquetTableDir = temp.newFolder("parquet_table");
  String parquetTableLocation = parquetTableDir.toURI().toString();

  try {
    Dataset<Row> inputDF = spark.createDataFrame(records, SimpleRecord.class);
    inputDF.select("id", "data").write()
        .format("parquet")
        .mode("append")
        .option("path", parquetTableLocation)
        .partitionBy("data")
        .saveAsTable("parquet_table");

    File stagingDir = temp.newFolder("staging-dir");
    List<SparkPartition> partitions = SparkTableUtil.getPartitionsByFilter(spark, "parquet_table", "data = 'a'");
    SparkTableUtil.importSparkPartitions(spark, partitions, table, table.spec(), stagingDir.toString());

    List<SimpleRecord> expectedRecords = Lists.newArrayList(new SimpleRecord(1, "a"));

    List<SimpleRecord> actualRecords = spark.read()
        .format("iceberg")
        .load(tableLocation)
        .orderBy("id")
        .as(Encoders.bean(SimpleRecord.class))
        .collectAsList();

    Assert.assertEquals("Result rows should match", expectedRecords, actualRecords);
  } finally {
    spark.sql("DROP TABLE parquet_table");
  }
}
Example #24
Source File: IcebergMetastoreTables.java From metacat with Apache License 2.0
@Override
public Table createTable(final TableIdentifier identifier,
                         final Schema schema,
                         final PartitionSpec spec,
                         final String location,
                         final Map<String, String> properties) {
  throw new MetacatNotSupportedException("not supported");
}
Example #25
Source File: IcebergSource.java From iceberg with Apache License 2.0
private Table getTableAndResolveHadoopConfiguration(DataSourceOptions options, Configuration conf) {
  // Overwrite configurations from the Spark Context with configurations from the options.
  mergeIcebergHadoopConfs(conf, options.asMap());
  Table table = findTable(options, conf);
  // Set confs from table properties
  mergeIcebergHadoopConfs(conf, table.properties());
  // Re-overwrite values set in options and table properties but were not in the environment.
  mergeIcebergHadoopConfs(conf, options.asMap());
  return table;
}
Example #26
Source File: TestIcebergPartitionData.java From dremio-oss with Apache License 2.0
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData,
    String columnName, Class expectedClass, Object expectedValue) throws Exception {
  File tableFolder = new File(folder.getRoot(), "icebergPartitionTest");
  try {
    tableFolder.mkdir();
    File dataFile = new File(folder.getRoot(), "a.parquet");
    dataFile.createNewFile();

    DataFile d1 = DataFiles.builder(partitionSpec)
        .withInputFile(Files.localInput(dataFile))
        .withRecordCount(50)
        .withFormat(FileFormat.PARQUET)
        .withPartition(partitionData)
        .build();

    IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(
        Path.of(tableFolder.toPath().toString()),
        (new SchemaConverter()).fromIceberg(schema),
        Lists.newArrayList(columnName),
        new Configuration());
    committer.consumeData(Lists.newArrayList(d1));
    committer.commit();

    Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
      StructLike structLike = fileScanTask.file().partition();
      if (expectedClass == ByteBuffer.class) {
        Assert.assertEquals(structLike.get(0, expectedClass).hashCode(),
            ByteBuffer.wrap((byte[]) expectedValue).hashCode());
      } else {
        Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
      }
    }
  } finally {
    tableFolder.delete();
  }
}
Example #27
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testPartitionedByDataStartsWithFilter() {
  Table table = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");
  CaseInsensitiveStringMap options = new CaseInsensitiveStringMap(ImmutableMap.of("path", table.location()));

  SparkScanBuilder builder = new SparkScanBuilder(spark, TABLES.load(options.get("path")), options);

  pushFilters(builder, new StringStartsWith("data", "junc"));
  Batch scan = builder.build().toBatch();

  Assert.assertEquals(1, scan.planInputPartitions().length);
}
Example #28
Source File: RewriteDataFilesAction.java From iceberg with Apache License 2.0
RewriteDataFilesAction(SparkSession spark, Table table) {
  this.sparkContext = new JavaSparkContext(spark.sparkContext());
  this.table = table;
  this.spec = table.spec();
  this.filter = Expressions.alwaysTrue();
  this.caseSensitive = Boolean.parseBoolean(spark.conf().get("spark.sql.caseSensitive", "false"));

  long splitSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_SIZE,
      TableProperties.SPLIT_SIZE_DEFAULT);
  long targetFileSize = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES,
      TableProperties.WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetSizeInBytes = Math.min(splitSize, targetFileSize);

  this.splitLookback = PropertyUtil.propertyAsInt(
      table.properties(),
      TableProperties.SPLIT_LOOKBACK,
      TableProperties.SPLIT_LOOKBACK_DEFAULT);
  this.splitOpenFileCost = PropertyUtil.propertyAsLong(
      table.properties(),
      TableProperties.SPLIT_OPEN_FILE_COST,
      TableProperties.SPLIT_OPEN_FILE_COST_DEFAULT);

  this.fileIO = SparkUtil.serializableFileIO(table);
  this.encryptionManager = table.encryption();
}
Example #29
Source File: IcebergSource.java From iceberg with Apache License 2.0
protected Table findTable(Map<String, String> options, Configuration conf) {
  Preconditions.checkArgument(options.containsKey("path"), "Cannot open table: path is not set");
  String path = options.get("path");

  if (path.contains("/")) {
    HadoopTables tables = new HadoopTables(conf);
    return tables.load(path);
  } else {
    HiveCatalog hiveCatalog = HiveCatalogs.loadCatalog(conf);
    TableIdentifier tableIdentifier = TableIdentifier.parse(path);
    return hiveCatalog.loadTable(tableIdentifier);
  }
}
Example #30
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testFailedResidualFiltering() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");
  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile1)
      .commit();

  Job jobShouldFail1 = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(jobShouldFail1);
  configBuilder.useHiveRows().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of HIVE",
      UnsupportedOperationException.class,
      "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail1, expectedRecords));

  Job jobShouldFail2 = Job.getInstance(conf);
  configBuilder = IcebergInputFormat.configure(jobShouldFail2);
  configBuilder.usePigTuples().readFrom(location.toString())
      .filter(Expressions.and(
          Expressions.equal("date", "2020-03-20"),
          Expressions.equal("id", 0)));
  AssertHelpers.assertThrows(
      "Residuals are not evaluated today for Iceberg Generics In memory model of PIG",
      UnsupportedOperationException.class,
      "Filter expression ref(name=\"id\") == 0 is not completely satisfied.",
      () -> validate(jobShouldFail2, expectedRecords));
}