Java Code Examples for org.apache.iceberg.PartitionSpec#unpartitioned()
The following examples show how to use org.apache.iceberg.PartitionSpec#unpartitioned().
You can go to the original project or source file by following the links above each example.
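Before the individual examples, here is a minimal, self-contained sketch of the pattern most of them share: creating an unpartitioned table through HadoopTables. The schema, field IDs, and table location below are illustrative placeholders, not values taken from any of the projects listed here.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class UnpartitionedTableExample {
  public static void main(String[] args) {
    // Illustrative schema; field IDs and names are placeholders.
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    // unpartitioned() returns a spec with no partition fields.
    PartitionSpec spec = PartitionSpec.unpartitioned();

    // Create the table with HadoopTables, mirroring the test examples below.
    // The location is a hypothetical local path.
    HadoopTables tables = new HadoopTables(new Configuration());
    Table table = tables.create(schema, spec, "file:///tmp/iceberg/unpartitioned_table");

    System.out.println("Created table with spec: " + table.spec());
  }
}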
Example 1
Source File: TestSnapshotSelection.java From iceberg with Apache License 2.0 | 7 votes |
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionBySnapshotIdAndTimestamp() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, tableLocation);

  List<SimpleRecord> firstBatchRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> firstDf = spark.createDataFrame(firstBatchRecords, SimpleRecord.class);
  firstDf.select("id", "data").write().format("iceberg").mode("append").save(tableLocation);

  long timestamp = System.currentTimeMillis();
  long snapshotId = table.currentSnapshot().snapshotId();
  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", snapshotId)
      .option("as-of-timestamp", timestamp)
      .load(tableLocation);

  df.collectAsList();
}
Example 2
Source File: IcebergSourceNestedDataBenchmark.java From iceberg with Apache License 2.0 | 6 votes |
@Override
protected final Table initTable() {
  Schema schema = new Schema(
      required(0, "id", Types.LongType.get()),
      optional(4, "nested", Types.StructType.of(
          required(1, "col1", Types.StringType.get()),
          required(2, "col2", Types.DoubleType.get()),
          required(3, "col3", Types.LongType.get())
      ))
  );
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.METADATA_COMPRESSION, "gzip");
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
Example 3
Source File: VectorizedReadFlatParquetDataBenchmark.java From iceberg with Apache License 2.0 | 6 votes |
@Override
protected Table initTable() {
  Schema schema = new Schema(
      optional(1, "longCol", Types.LongType.get()),
      optional(2, "intCol", Types.IntegerType.get()),
      optional(3, "floatCol", Types.FloatType.get()),
      optional(4, "doubleCol", Types.DoubleType.get()),
      optional(5, "decimalCol", Types.DecimalType.of(20, 5)),
      optional(6, "dateCol", Types.DateType.get()),
      optional(7, "timestampCol", Types.TimestampType.withZone()),
      optional(8, "stringCol", Types.StringType.get()));
  PartitionSpec partitionSpec = PartitionSpec.unpartitioned();
  HadoopTables tables = new HadoopTables(hadoopConf());
  Map<String, String> properties = parquetWriteProps();
  return tables.create(schema, partitionSpec, properties, newTableLocation());
}
Example 4
Source File: TestRewriteManifestsAction.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testRewriteManifestsEmptyTable() throws IOException {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.SNAPSHOT_ID_INHERITANCE_ENABLED, snapshotIdInheritanceEnabled);
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  Assert.assertNull("Table must be empty", table.currentSnapshot());

  Actions actions = Actions.forTable(table);

  actions.rewriteManifests()
      .rewriteIf(manifest -> true)
      .stagingLocation(temp.newFolder().toString())
      .execute();

  Assert.assertNull("Table must stay empty", table.currentSnapshot());
}
Example 5
Source File: HadoopTables.java From iceberg with Apache License 2.0 | 6 votes |
/**
 * Create a table using the FileSystem implementation resolved from location.
 *
 * @param schema iceberg schema used to create the table
 * @param spec partitioning spec, if null the table will be unpartitioned
 * @param properties a string map of table properties, initialized to empty if null
 * @param location a path URI (e.g. hdfs:///warehouse/my_table)
 * @return newly created table implementation
 */
@Override
public Table create(Schema schema, PartitionSpec spec, Map<String, String> properties,
                    String location) {
  Preconditions.checkNotNull(schema, "A table schema is required");

  TableOperations ops = newTableOps(location);
  if (ops.current() != null) {
    throw new AlreadyExistsException("Table already exists at location: " + location);
  }

  Map<String, String> tableProps = properties == null ? ImmutableMap.of() : properties;
  PartitionSpec partitionSpec = spec == null ? PartitionSpec.unpartitioned() : spec;
  TableMetadata metadata = TableMetadata.newTableMetadata(schema, partitionSpec, location, tableProps);
  ops.commit(null, metadata);

  return new BaseTable(ops, location);
}
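As a brief, hypothetical usage sketch of the create method above: passing a null spec (and null properties) relies on the documented fallback to PartitionSpec.unpartitioned(). The schema and table location here are made up for illustration.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.Schema;
import org.apache.iceberg.Table;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.types.Types;

public class NullSpecCreateExample {
  public static void main(String[] args) {
    HadoopTables tables = new HadoopTables(new Configuration());
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()));

    // A null spec falls back to PartitionSpec.unpartitioned(); null properties become an empty map.
    // The location is a hypothetical local path.
    Table created = tables.create(schema, null, null, "file:///tmp/iceberg/null_spec_table");

    // The resulting table has no partition fields.
    System.out.println(created.spec().fields().isEmpty()); // expected: true
  }
}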
Example 6
Source File: TestSparkSchema.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testSparkReadSchemaCombinedWithProjection() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema = new StructType(
      new StructField[] {
          new StructField("id", DataTypes.IntegerType, true, Metadata.empty()),
          new StructField("data", DataTypes.StringType, true, Metadata.empty())
      }
  );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation)
      .select("id");

  Row[] results = (Row[]) resultDf.collect();
  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Example 7
Source File: TestSparkSchema.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testSparkReadSchemaIsHonored() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema = new StructType(
      new StructField[] {
          new StructField("id", DataTypes.IntegerType, true, Metadata.empty())
      }
  );

  Dataset<Row> resultDf = spark.read()
      .schema(sparkReadSchema)
      .format("iceberg")
      .load(tableLocation);

  Row[] results = (Row[]) resultDf.collect();
  Assert.assertEquals("Result size matches", 1, results.length);
  Assert.assertEquals("Row length matches with sparkReadSchema", 1, results[0].length());
  Assert.assertEquals("Row content matches data", 1, results[0].getInt(0));
}
Example 8
Source File: TestDataSourceOptions.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testDefaultMetadataSplitSize() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  int splitSize = (int) TableProperties.METADATA_SPLIT_SIZE_DEFAULT; // 32MB split size
  int expectedSplits = ((int) tables.load(tableLocation + "#entries")
      .currentSnapshot().allManifests().get(0).length() + splitSize - 1) / splitSize;

  Dataset<Row> metadataDf = spark.read()
      .format("iceberg")
      .load(tableLocation + "#entries");

  int partitionNum = metadataDf.javaRDD().getNumPartitions();
  Assert.assertEquals("Spark partitions should match", expectedSplits, partitionNum);
}
Example 9
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testUnpartitionedOverwrite() throws IOException {
  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  // overwrite with the same data; should not produce two copies
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("overwrite")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 10
Source File: TestSparkSchema.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testFailIfSparkReadSchemaIsOff() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, null, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  StructType sparkReadSchema = new StructType(
      new StructField[] {
          new StructField("idd", DataTypes.IntegerType, true, Metadata.empty()) // wrong field name
      }
  );

  AssertHelpers.assertThrows("Iceberg should not allow a projection that contains unknown fields",
      java.lang.IllegalArgumentException.class, "Field idd not found in source schema",
      () ->
          spark.read()
              .schema(sparkReadSchema)
              .format("iceberg")
              .load(tableLocation)
  );
}
Example 11
Source File: TestDataSourceOptions.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testWriteFormatOptionOverridesTableProperties() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.PARQUET, fileFormat);
    });
  }
}
Example 12
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testWriteProjection() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Table table = tables.create(SCHEMA, spec, location.toString());

  List<SimpleRecord> expected = Lists.newArrayList(
      new SimpleRecord(1, null),
      new SimpleRecord(2, null),
      new SimpleRecord(3, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, SimpleRecord.class);

  df.select("id").write() // select only id column
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<SimpleRecord> actual = result.orderBy("id").as(Encoders.bean(SimpleRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 13
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example 14
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
Example 15
Source File: TestSnapshotSelection.java From iceberg with Apache License 2.0 | 5 votes |
@Test(expected = IllegalArgumentException.class)
public void testSnapshotSelectionByInvalidSnapshotId() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  tables.create(SCHEMA, spec, tableLocation);

  Dataset<Row> df = spark.read()
      .format("iceberg")
      .option("snapshot-id", -10)
      .load(tableLocation);

  df.collectAsList();
}
Example 16
Source File: ResidualEvaluator.java From iceberg with Apache License 2.0 | 4 votes |
UnpartitionedResidualEvaluator(Expression expr) {
  super(PartitionSpec.unpartitioned(), expr, false);
  this.expr = expr;
}
Example 17
Source File: TestRewriteDataFilesAction.java From iceberg with Apache License 2.0 | 4 votes |
@Test
public void testRewriteDataFilesUnpartitionedTable() {
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = TABLES.create(SCHEMA, spec, options, tableLocation);

  List<ThreeColumnRecord> records1 = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "AAAA"),
      new ThreeColumnRecord(1, "BBBBBBBBBB", "BBBB")
  );
  writeRecords(records1);

  List<ThreeColumnRecord> records2 = Lists.newArrayList(
      new ThreeColumnRecord(2, "CCCCCCCCCC", "CCCC"),
      new ThreeColumnRecord(2, "DDDDDDDDDD", "DDDD")
  );
  writeRecords(records2);

  table.refresh();

  CloseableIterable<FileScanTask> tasks = table.newScan().planFiles();
  List<DataFile> dataFiles = Lists.newArrayList(CloseableIterable.transform(tasks, FileScanTask::file));
  Assert.assertEquals("Should have 4 data files before rewrite", 4, dataFiles.size());

  Actions actions = Actions.forTable(table);

  RewriteDataFilesActionResult result = actions.rewriteDataFiles().execute();
  Assert.assertEquals("Action should rewrite 4 data files", 4, result.deletedDataFiles().size());
  Assert.assertEquals("Action should add 1 data file", 1, result.addedDataFiles().size());

  table.refresh();

  CloseableIterable<FileScanTask> tasks1 = table.newScan().planFiles();
  List<DataFile> dataFiles1 = Lists.newArrayList(CloseableIterable.transform(tasks1, FileScanTask::file));
  Assert.assertEquals("Should have 1 data file after rewrite", 1, dataFiles1.size());

  List<ThreeColumnRecord> expectedRecords = Lists.newArrayList();
  expectedRecords.addAll(records1);
  expectedRecords.addAll(records2);

  Dataset<Row> resultDF = spark.read().format("iceberg").load(tableLocation);
  List<ThreeColumnRecord> actualRecords = resultDF.sort("c1", "c2")
      .as(Encoders.bean(ThreeColumnRecord.class))
      .collectAsList();

  Assert.assertEquals("Rows must match", expectedRecords, actualRecords);
}
Example 18
Source File: TestDataSourceOptions.java From iceberg with Apache License 2.0 | 4 votes |
@Test
public void testMetadataSplitSizeOptionOverrideTableProperties() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();
  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b")
  );
  Dataset<Row> originalDf = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  // produce 1st manifest
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);
  // produce 2nd manifest
  originalDf.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();

  Assert.assertEquals("Must be 2 manifests", 2, manifests.size());

  // set the target metadata split size so each manifest ends up in a separate split
  table.updateProperties()
      .set(TableProperties.METADATA_SPLIT_SIZE, String.valueOf(manifests.get(0).length()))
      .commit();

  Dataset<Row> entriesDf = spark.read()
      .format("iceberg")
      .load(tableLocation + "#entries");
  Assert.assertEquals("Num partitions must match", 2, entriesDf.javaRDD().getNumPartitions());

  // override the table property using options
  entriesDf = spark.read()
      .format("iceberg")
      .option("split-size", String.valueOf(128 * 1024 * 1024))
      .load(tableLocation + "#entries");
  Assert.assertEquals("Num partitions must match", 1, entriesDf.javaRDD().getNumPartitions());
}
Example 19
Source File: TestSparkDataWrite.java From iceberg with Apache License 2.0 | 4 votes |
@Test
public void testWriteProjectionWithMiddle() throws IOException {
  Assume.assumeTrue(
      "Not supported in Spark 3.0; analysis requires all columns are present",
      spark.version().startsWith("2"));

  File parent = temp.newFolder(format.toString());
  File location = new File(parent, "test");

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Schema schema = new Schema(
      optional(1, "c1", Types.IntegerType.get()),
      optional(2, "c2", Types.StringType.get()),
      optional(3, "c3", Types.StringType.get())
  );
  Table table = tables.create(schema, spec, location.toString());

  List<ThreeColumnRecord> expected = Lists.newArrayList(
      new ThreeColumnRecord(1, null, "hello"),
      new ThreeColumnRecord(2, null, "world"),
      new ThreeColumnRecord(3, null, null)
  );

  Dataset<Row> df = spark.createDataFrame(expected, ThreeColumnRecord.class);

  df.select("c1", "c3").write()
      .format("iceberg")
      .option("write-format", format.toString())
      .mode("append")
      .save(location.toString());

  table.refresh();

  Dataset<Row> result = spark.read()
      .format("iceberg")
      .load(location.toString());

  List<ThreeColumnRecord> actual = result.orderBy("c1").as(Encoders.bean(ThreeColumnRecord.class)).collectAsList();
  Assert.assertEquals("Number of rows should match", expected.size(), actual.size());
  Assert.assertEquals("Result rows should match", expected, actual);
}
Example 20
Source File: Spark3Util.java From iceberg with Apache License 2.0 | 4 votes |
/**
 * Converts Spark transforms into a {@link PartitionSpec}.
 *
 * @param schema the table schema
 * @param partitioning Spark Transforms
 * @return a PartitionSpec
 */
public static PartitionSpec toPartitionSpec(Schema schema, Transform[] partitioning) {
  if (partitioning == null || partitioning.length == 0) {
    return PartitionSpec.unpartitioned();
  }

  PartitionSpec.Builder builder = PartitionSpec.builderFor(schema);

  for (Transform transform : partitioning) {
    Preconditions.checkArgument(transform.references().length == 1,
        "Cannot convert transform with more than one column reference: %s", transform);
    String colName = DOT.join(transform.references()[0].fieldNames());
    switch (transform.name()) {
      case "identity":
        builder.identity(colName);
        break;
      case "bucket":
        builder.bucket(colName, findWidth(transform));
        break;
      case "years":
        builder.year(colName);
        break;
      case "months":
        builder.month(colName);
        break;
      case "date":
      case "days":
        builder.day(colName);
        break;
      case "date_hour":
      case "hours":
        builder.hour(colName);
        break;
      case "truncate":
        builder.truncate(colName, findWidth(transform));
        break;
      default:
        throw new UnsupportedOperationException("Transform is not supported: " + transform);
    }
  }

  return builder.build();
}
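A short, hypothetical usage sketch of the converter above: when no Spark partitioning transforms are supplied, the early-return branch yields PartitionSpec.unpartitioned(). The schema is made up for illustration, and this assumes the Spark 3 module's Spark3Util and Spark's Transform interface are on the classpath.

import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Schema;
import org.apache.iceberg.spark.Spark3Util;
import org.apache.iceberg.types.Types;
import org.apache.spark.sql.connector.expressions.Transform;

public class ToPartitionSpecExample {
  public static void main(String[] args) {
    // Illustrative schema; field IDs and names are placeholders.
    Schema schema = new Schema(
        Types.NestedField.required(1, "id", Types.LongType.get()),
        Types.NestedField.optional(2, "data", Types.StringType.get()));

    // An empty transform array takes the early-return branch shown above.
    PartitionSpec spec = Spark3Util.toPartitionSpec(schema, new Transform[0]);

    // The returned spec has no partition fields, i.e. the layout is unpartitioned.
    System.out.println(spec.fields().isEmpty()); // expected: true
  }
}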