org.apache.iceberg.DataFile Java Examples
The following examples show how to use org.apache.iceberg.DataFile.
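Most of the examples below follow the same basic pattern: describe an already-written data file with DataFiles.builder and commit it to a table through an append operation. The sketch below illustrates that pattern in isolation; it is not taken from any of the projects listed here, and the table handle, file path, size, and record count are placeholder assumptions.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;

public class DataFileAppendSketch {

  // Registers an already-written Parquet file with an unpartitioned table.
  // The path, size, and record count are placeholder values; in real code they
  // come from whatever writer produced the file.
  static void appendExistingFile(Table table) {
    DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned())
        .withPath("/path/to/data-00000.parquet")  // hypothetical location
        .withFormat(FileFormat.PARQUET)
        .withFileSizeInBytes(1024L)
        .withRecordCount(100L)
        .build();

    table.newAppend()
        .appendFile(dataFile)
        .commit();
  }
}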
Example #1
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
Example #2
Source File: WriterCommitterOperator.java From dremio-oss with Apache License 2.0
@Override
public void consumeData(int records) throws Exception {
  project.consumeData(records);
  if (icebergTableCommitter) {
    List<DataFile> icebergDatafiles = new ArrayList<>();
    for (int i = 0; i < records; ++i) {
      DataFile dataFile = IcebergSerDe.deserializeDataFile(icebergMetadataVector.get(i));
      icebergDatafiles.add(dataFile);
    }
    if (icebergDatafiles.size() > 0) {
      try (AutoCloseable ac = OperatorStats.getWaitRecorder(context.getStats())) {
        icebergOpCommitter.consumeData(icebergDatafiles);
      }
    }
  }
  recordCount += records;
}
Example #3
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnWithGlobalTableLocation() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap());
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots()));
}
Example #4
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
Example #5
Source File: BaseWriter.java From iceberg with Apache License 2.0
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();
    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
Example #6
Source File: RowDataReader.java From iceberg with Apache License 2.0
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema or rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
Example #7
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
Example #8
Source File: RewriteDataFilesAction.java From iceberg with Apache License 2.0
private void replaceDataFiles(Iterable<DataFile> deletedDataFiles, Iterable<DataFile> addedDataFiles) {
  try {
    RewriteFiles rewriteFiles = table.newRewrite();
    rewriteFiles.rewriteFiles(Sets.newHashSet(deletedDataFiles), Sets.newHashSet(addedDataFiles));
    commit(rewriteFiles);
  } catch (Exception e) {
    Tasks.foreach(Iterables.transform(addedDataFiles, f -> f.path().toString()))
        .noRetry()
        .suppressFailureWhenFinished()
        .onFailure((location, exc) -> LOG.warn("Failed to delete: {}", location, exc))
        .run(fileIO::deleteFile);
    throw e;
  }
}
Example #9
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testCustomCatalog() throws Exception {
  conf = new Configuration();
  conf.set("warehouse.location", temp.newFolder("hadoop_catalog").getAbsolutePath());

  Catalog catalog = new HadoopCatalogFunc().apply(conf);
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "t");
  Table table = catalog.createTable(tableIdentifier, SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()));
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .catalogFunc(HadoopCatalogFunc.class)
      .readFrom(tableIdentifier.toString());
  validate(job, expectedRecords);
}
Example #10
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }
  return iterable;
}
Example #11
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, null, format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
Example #12
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testPartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
Example #13
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testFilterExp() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");
  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  DataFile dataFile2 = writeFile(table, Row.of("2020-03-21", 0), format,
      RandomGenericData.generate(table.schema(), 2, 0L));
  table.newAppend()
      .appendFile(dataFile1)
      .appendFile(dataFile2)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString())
      .filter(Expressions.equal("date", "2020-03-20"));
  validate(job, expectedRecords);
}
Example #14
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
Example #15
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example #16
Source File: TestIcebergTableDrop.java From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
      "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
      .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
      .withRecordCount(25)
      .withFormat(FileFormat.PARQUET)
      .build();
}
Example #17
Source File: TestIcebergManifests.java From dremio-oss with Apache License 2.0
List<DataFile> getDataFiles(PartitionSpec partitionSpec, int partitionValueSize,
                            int dataFilesCount, String columnName) {
  List<DataFile> dataFiles = new ArrayList<>();
  for (int i = 0; i < dataFilesCount; ++i) {
    String partitionValue = RandomStringUtils.randomAlphanumeric(partitionValueSize);
    String datafileName = RandomStringUtils.randomAlphanumeric(64);
    dataFiles.add(DataFiles.builder(partitionSpec)
        .withInputFile(Files.localInput(datafileName + ".parquet"))
        .withRecordCount(50)
        .withFormat(FileFormat.PARQUET)
        .withPartitionPath(columnName + "=" + partitionValue)
        .build());
  }
  return dataFiles;
}
Example #18
Source File: RowDataRewriter.java From iceberg with Apache License 2.0
public List<DataFile> rewriteDataForTasks(JavaRDD<CombinedScanTask> taskRDD) {
  JavaRDD<TaskResult> taskCommitRDD = taskRDD.map(this::rewriteDataForTask);

  return taskCommitRDD.collect().stream()
      .flatMap(taskCommit -> Arrays.stream(taskCommit.files()))
      .collect(Collectors.toList());
}
Example #19
Source File: TestSparkDataFile.java From iceberg with Apache License 2.0
private void checkSparkDataFile(Table table) throws IOException {
  Iterable<InternalRow> rows = RandomData.generateSpark(table.schema(), 200, 0);
  JavaRDD<InternalRow> rdd = sparkContext.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd),
      SparkSchemaUtil.convert(table.schema()), false);

  df.write().format("iceberg").mode("append").save(tableLocation);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest", 1, manifests.size());

  List<DataFile> dataFiles = Lists.newArrayList();
  try (ManifestReader<DataFile> reader = ManifestFiles.read(manifests.get(0), table.io())) {
    reader.forEach(dataFile -> dataFiles.add(dataFile.copy()));
  }

  Dataset<Row> dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files");

  // reorder columns to test arbitrary projections
  List<Column> columns = Arrays.stream(dataFileDF.columns())
      .map(ColumnName::new)
      .collect(Collectors.toList());
  Collections.shuffle(columns);

  List<Row> sparkDataFiles = dataFileDF
      .select(Iterables.toArray(columns, Column.class))
      .collectAsList();

  Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size());

  Types.StructType dataFileType = DataFile.getType(table.spec().partitionType());
  StructType sparkDataFileType = sparkDataFiles.get(0).schema();
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkDataFileType);

  for (int i = 0; i < dataFiles.size(); i++) {
    checkDataFile(dataFiles.get(i), wrapper.wrap(sparkDataFiles.get(i)));
  }
}
Example #20
Source File: TestSparkDataFile.java From iceberg with Apache License 2.0
private void checkDataFile(DataFile expected, DataFile actual) {
  Assert.assertEquals("Path must match", expected.path(), actual.path());
  Assert.assertEquals("Format must match", expected.format(), actual.format());
  Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount());
  Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes());
  Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts());
  Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts());
  Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds());
  Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds());
  Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata());
  Assert.assertEquals("Split offsets must match", expected.splitOffsets(), actual.splitOffsets());

  checkStructLike(expected.partition(), actual.partition());
}
Example #21
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(2, file.lowerBounds().size());
    Assert.assertEquals(2, file.upperBounds().size());
  }
}
Example #22
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example #23
Source File: IcebergCatalog.java From dremio-oss with Apache License 2.0
public void consumeData(List<DataFile> filesList) {
  Preconditions.checkState(transaction != null, "Transaction was not started");
  Preconditions.checkState(appendFiles != null, "Transaction was not started");
  filesList
      .stream()
      .forEach(x -> appendFiles.appendFile(x));
  // adds the current update to the transaction. It will be marked as
  // pending commit inside transaction. Final commit on transaction in end method
  // makes these files become part of the table
}
Example #24
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0
private void replacePartitions(WriterCommitMessage[] messages) {
  ReplacePartitions dynamicOverwrite = table.newReplacePartitions();

  int numFiles = 0;
  for (DataFile file : files(messages)) {
    numFiles += 1;
    dynamicOverwrite.addFile(file);
  }

  commitOperation(dynamicOverwrite, numFiles, "dynamic partition overwrite");
}
Example #25
Source File: TestRefresh.java From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
      "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
      .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
      .withRecordCount(25)
      .withFormat(FileFormat.PARQUET)
      .build();
}
Example #26
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

  ManifestWriter writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
Example #27
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0
private void overwrite(WriterCommitMessage[] messages) {
  OverwriteFiles overwriteFiles = table.newOverwrite();
  overwriteFiles.overwriteByRowFilter(overwriteExpr);

  int numFiles = 0;
  for (DataFile file : files(messages)) {
    numFiles += 1;
    overwriteFiles.addFile(file);
  }

  commitOperation(overwriteFiles, numFiles, "overwrite by filter");
}
Example #28
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0
protected Iterable<DataFile> files(WriterCommitMessage[] messages) {
  if (messages.length > 0) {
    return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null
        ? ImmutableList.copyOf(((TaskCommit) message).files())
        : ImmutableList.of()));
  }
  return ImmutableList.of();
}
Example #29
Source File: TestIcebergSerDe.java From dremio-oss with Apache License 2.0
@Test
public void testDataFileSerDe() throws Exception {
  File dataFile = new File(folder.getRoot(), "a.parquet");
  dataFile.createNewFile();

  PartitionSpec partitionSpec = PartitionSpec
      .builderFor(schema)
      .identity("i")
      .identity("data")
      .build();

  IcebergPartitionData icebergPartitionData = new IcebergPartitionData(partitionSpec.partitionType());
  icebergPartitionData.set(0, Integer.valueOf(10));
  icebergPartitionData.set(1, "def");

  DataFile d1 = DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(dataFile))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartition(icebergPartitionData)
      .build();

  long d1RecordCount = d1.recordCount();
  byte[] dataFileBytes = IcebergSerDe.serializeDataFile(d1);
  DataFile d2 = IcebergSerDe.deserializeDataFile(dataFileBytes);
  long d2RecordCount = d2.recordCount();

  Assert.assertEquals(d1RecordCount, d2RecordCount);
  Assert.assertEquals((Integer) (d2.partition().get(0, Integer.class)), Integer.valueOf(10));
  Assert.assertEquals((String) (d2.partition().get(1, String.class)), "def");
}
Example #30
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();
        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}