org.apache.iceberg.DataFile Java Examples
The following examples show how to use org.apache.iceberg.DataFile.
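Most of the examples below follow the same basic pattern: describe an already-written data file with DataFiles.builder and commit it to a table through an append operation. The sketch below illustrates that pattern in isolation; it is not taken from any of the projects listed here, and the table handle, file path, size, and record count are placeholder assumptions.

import org.apache.iceberg.DataFile;
import org.apache.iceberg.DataFiles;
import org.apache.iceberg.FileFormat;
import org.apache.iceberg.PartitionSpec;
import org.apache.iceberg.Table;

public class DataFileAppendSketch {

  // Registers an already-written Parquet file with an unpartitioned table.
  // The path, size, and record count are placeholder values; in real code they
  // come from whatever writer produced the file.
  static void appendExistingFile(Table table) {
    DataFile dataFile = DataFiles.builder(PartitionSpec.unpartitioned())
        .withPath("/path/to/data-00000.parquet")  // hypothetical location
        .withFormat(FileFormat.PARQUET)
        .withFileSizeInBytes(1024L)
        .withRecordCount(100L)
        .build();

    table.newAppend()
        .appendFile(dataFile)
        .commit();
  }
}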
Example #1
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testProjection() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Schema projectedSchema = TypeUtil.select(SCHEMA, ImmutableSet.of(1));
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> inputRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, inputRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .readFrom(location.toString())
      .project(projectedSchema);
  List<Record> outputRecords = readRecords(job.getConfiguration());
  Assert.assertEquals(inputRecords.size(), outputRecords.size());
  Assert.assertEquals(projectedSchema.asStruct(), outputRecords.get(0).struct());
}
Example #2
Source File: WriterCommitterOperator.java From dremio-oss with Apache License 2.0
@Override
public void consumeData(int records) throws Exception {
  project.consumeData(records);
  if (icebergTableCommitter) {
    List<DataFile> icebergDatafiles = new ArrayList<>();
    for (int i = 0; i < records; ++i) {
      DataFile dataFile = IcebergSerDe.deserializeDataFile(icebergMetadataVector.get(i));
      icebergDatafiles.add(dataFile);
    }
    if (icebergDatafiles.size() > 0) {
      try (AutoCloseable ac = OperatorStats.getWaitRecorder(context.getStats())) {
        icebergOpCommitter.consumeData(icebergDatafiles);
      }
    }
  }
  recordCount += records;
}
Example #3
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnWithGlobalTableLocation() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, "file:///" + tableLocation, Maps.newHashMap());
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Assert.assertEquals("Write should succeed", 1, Iterables.size(table.snapshots()));
}
Example #4
Source File: HiveCreateReplaceTableTest.java From iceberg with Apache License 2.0
@Test
public void testCreateTableTxnAndAppend() {
  Assert.assertFalse("Table should not exist", catalog.tableExists(TABLE_IDENTIFIER));

  Transaction txn = catalog.newCreateTableTransaction(
      TABLE_IDENTIFIER, SCHEMA, SPEC, tableLocation, Maps.newHashMap());

  AppendFiles append = txn.newAppend();

  DataFile dataFile = DataFiles.builder(SPEC)
      .withPath("/path/to/data-a.parquet")
      .withFileSizeInBytes(0)
      .withRecordCount(1)
      .build();

  append.appendFile(dataFile);
  append.commit();
  txn.commitTransaction();

  Table table = catalog.loadTable(TABLE_IDENTIFIER);
  Snapshot snapshot = table.currentSnapshot();
  Assert.assertTrue("Table should have one manifest file", snapshot.allManifests().size() == 1);
}
Example #5
Source File: BaseWriter.java From iceberg with Apache License 2.0
protected void closeCurrent() throws IOException {
  if (currentAppender != null) {
    currentAppender.close();
    // metrics are only valid after the appender is closed
    Metrics metrics = currentAppender.metrics();
    long fileSizeInBytes = currentAppender.length();
    List<Long> splitOffsets = currentAppender.splitOffsets();
    this.currentAppender = null;

    if (metrics.recordCount() == 0L) {
      io.deleteFile(currentFile.encryptingOutputFile());
    } else {
      DataFile dataFile = DataFiles.builder(spec)
          .withEncryptionKeyMetadata(currentFile.keyMetadata())
          .withPath(currentFile.encryptingOutputFile().location())
          .withFileSizeInBytes(fileSizeInBytes)
          .withPartition(spec.fields().size() == 0 ? null : currentKey) // set null if unpartitioned
          .withMetrics(metrics)
          .withSplitOffsets(splitOffsets)
          .build();
      completedFiles.add(dataFile);
    }

    this.currentFile = null;
  }
}
Example #6
Source File: RowDataReader.java From iceberg with Apache License 2.0
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema or rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
Example #7
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
Example #8
Source File: RewriteDataFilesAction.java From iceberg with Apache License 2.0
private void replaceDataFiles(Iterable<DataFile> deletedDataFiles, Iterable<DataFile> addedDataFiles) {
  try {
    RewriteFiles rewriteFiles = table.newRewrite();
    rewriteFiles.rewriteFiles(Sets.newHashSet(deletedDataFiles), Sets.newHashSet(addedDataFiles));
    commit(rewriteFiles);
  } catch (Exception e) {
    Tasks.foreach(Iterables.transform(addedDataFiles, f -> f.path().toString()))
        .noRetry()
        .suppressFailureWhenFinished()
        .onFailure((location, exc) -> LOG.warn("Failed to delete: {}", location, exc))
        .run(fileIO::deleteFile);
    throw e;
  }
}
Example #9
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testCustomCatalog() throws Exception {
  conf = new Configuration();
  conf.set("warehouse.location", temp.newFolder("hadoop_catalog").getAbsolutePath());

  Catalog catalog = new HadoopCatalogFunc().apply(conf);
  TableIdentifier tableIdentifier = TableIdentifier.of("db", "t");
  Table table = catalog.createTable(tableIdentifier, SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()));
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();

  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder
      .catalogFunc(HadoopCatalogFunc.class)
      .readFrom(tableIdentifier.toString());
  validate(job, expectedRecords);
}
Example #10
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }
  return iterable;
}
Example #11
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, PartitionSpec.unpartitioned(),
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  DataFile dataFile = writeFile(table, null, format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
Example #12
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testPartitionedTable() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 1, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  DataFile dataFile = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  table.newAppend()
      .appendFile(dataFile)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString());
  validate(job, expectedRecords);
}
Example #13
Source File: TestIcebergInputFormat.java From iceberg with Apache License 2.0
@Test
public void testFilterExp() throws Exception {
  File location = temp.newFolder(format.name());
  Assert.assertTrue(location.delete());
  Table table = tables.create(SCHEMA, SPEC,
      ImmutableMap.of(TableProperties.DEFAULT_FILE_FORMAT, format.name()),
      location.toString());
  List<Record> expectedRecords = RandomGenericData.generate(table.schema(), 2, 0L);
  expectedRecords.get(0).set(2, "2020-03-20");
  expectedRecords.get(1).set(2, "2020-03-20");
  DataFile dataFile1 = writeFile(table, Row.of("2020-03-20", 0), format, expectedRecords);
  DataFile dataFile2 = writeFile(table, Row.of("2020-03-21", 0), format,
      RandomGenericData.generate(table.schema(), 2, 0L));
  table.newAppend()
      .appendFile(dataFile1)
      .appendFile(dataFile2)
      .commit();
  Job job = Job.getInstance(conf);
  IcebergInputFormat.ConfigBuilder configBuilder = IcebergInputFormat.configure(job);
  configBuilder.readFrom(location.toString())
      .filter(Expressions.equal("date", "2020-03-20"));
  validate(job, expectedRecords);
}
Example #14
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
Example #15
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example #16
Source File: TestIcebergTableDrop.java From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
      "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
      .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
      .withRecordCount(25)
      .withFormat(FileFormat.PARQUET)
      .build();
}
Example #17
Source File: TestIcebergManifests.java From dremio-oss with Apache License 2.0
List<DataFile> getDataFiles(PartitionSpec partitionSpec, int partitionValueSize,
                            int dataFilesCount, String columnName) {
  List<DataFile> dataFiles = new ArrayList<>();
  for (int i = 0; i < dataFilesCount; ++i) {
    String partitionValue = RandomStringUtils.randomAlphanumeric(partitionValueSize);
    String datafileName = RandomStringUtils.randomAlphanumeric(64);
    dataFiles.add(DataFiles.builder(partitionSpec)
        .withInputFile(Files.localInput(datafileName + ".parquet"))
        .withRecordCount(50)
        .withFormat(FileFormat.PARQUET)
        .withPartitionPath(columnName + "=" + partitionValue)
        .build());
  }
  return dataFiles;
}
Example #18
Source File: RowDataRewriter.java From iceberg with Apache License 2.0
public List<DataFile> rewriteDataForTasks(JavaRDD<CombinedScanTask> taskRDD) {
  JavaRDD<TaskResult> taskCommitRDD = taskRDD.map(this::rewriteDataForTask);

  return taskCommitRDD.collect().stream()
      .flatMap(taskCommit -> Arrays.stream(taskCommit.files()))
      .collect(Collectors.toList());
}
Example #19
Source File: TestSparkDataFile.java From iceberg with Apache License 2.0
private void checkSparkDataFile(Table table) throws IOException {
  Iterable<InternalRow> rows = RandomData.generateSpark(table.schema(), 200, 0);
  JavaRDD<InternalRow> rdd = sparkContext.parallelize(Lists.newArrayList(rows));
  Dataset<Row> df = spark.internalCreateDataFrame(JavaRDD.toRDD(rdd),
      SparkSchemaUtil.convert(table.schema()), false);

  df.write().format("iceberg").mode("append").save(tableLocation);

  table.refresh();

  List<ManifestFile> manifests = table.currentSnapshot().allManifests();
  Assert.assertEquals("Should have 1 manifest", 1, manifests.size());

  List<DataFile> dataFiles = Lists.newArrayList();
  try (ManifestReader<DataFile> reader = ManifestFiles.read(manifests.get(0), table.io())) {
    reader.forEach(dataFile -> dataFiles.add(dataFile.copy()));
  }

  Dataset<Row> dataFileDF = spark.read().format("iceberg").load(tableLocation + "#files");

  // reorder columns to test arbitrary projections
  List<Column> columns = Arrays.stream(dataFileDF.columns())
      .map(ColumnName::new)
      .collect(Collectors.toList());
  Collections.shuffle(columns);

  List<Row> sparkDataFiles = dataFileDF
      .select(Iterables.toArray(columns, Column.class))
      .collectAsList();

  Assert.assertEquals("The number of files should match", dataFiles.size(), sparkDataFiles.size());

  Types.StructType dataFileType = DataFile.getType(table.spec().partitionType());
  StructType sparkDataFileType = sparkDataFiles.get(0).schema();
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkDataFileType);

  for (int i = 0; i < dataFiles.size(); i++) {
    checkDataFile(dataFiles.get(i), wrapper.wrap(sparkDataFiles.get(i)));
  }
}
Example #20
Source File: TestSparkDataFile.java From iceberg with Apache License 2.0
private void checkDataFile(DataFile expected, DataFile actual) {
  Assert.assertEquals("Path must match", expected.path(), actual.path());
  Assert.assertEquals("Format must match", expected.format(), actual.format());
  Assert.assertEquals("Record count must match", expected.recordCount(), actual.recordCount());
  Assert.assertEquals("Size must match", expected.fileSizeInBytes(), actual.fileSizeInBytes());
  Assert.assertEquals("Record value counts must match", expected.valueCounts(), actual.valueCounts());
  Assert.assertEquals("Record null value counts must match", expected.nullValueCounts(), actual.nullValueCounts());
  Assert.assertEquals("Lower bounds must match", expected.lowerBounds(), actual.lowerBounds());
  Assert.assertEquals("Upper bounds must match", expected.upperBounds(), actual.upperBounds());
  Assert.assertEquals("Key metadata must match", expected.keyMetadata(), actual.keyMetadata());
  Assert.assertEquals("Split offsets must match", expected.splitOffsets(), actual.splitOffsets());

  checkStructLike(expected.partition(), actual.partition());
}
Example #21
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(2, file.lowerBounds().size());
    Assert.assertEquals(2, file.upperBounds().size());
  }
}
Example #22
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example #23
Source File: IcebergCatalog.java From dremio-oss with Apache License 2.0
public void consumeData(List<DataFile> filesList) {
  Preconditions.checkState(transaction != null, "Transaction was not started");
  Preconditions.checkState(appendFiles != null, "Transaction was not started");
  filesList
      .stream()
      .forEach(x -> appendFiles.appendFile(x));
  // adds the current update to the transaction. It will be marked as
  // pending commit inside transaction. Final commit on transaction in end method
  // makes these files become part of the table
}
Example #24
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0
private void replacePartitions(WriterCommitMessage[] messages) {
  ReplacePartitions dynamicOverwrite = table.newReplacePartitions();

  int numFiles = 0;
  for (DataFile file : files(messages)) {
    numFiles += 1;
    dynamicOverwrite.addFile(file);
  }

  commitOperation(dynamicOverwrite, numFiles, "dynamic partition overwrite");
}
Example #25
Source File: TestRefresh.java From dremio-oss with Apache License 2.0
private DataFile createDataFile(File dir, String fileName) throws Exception {
  File dataFile = new File(dir, fileName);
  URI resource = Resources.getResource(
      "iceberg/nation/data/00000-1-a9e8d979-a183-40c5-af3d-a338ab62be8b-00000.parquet").toURI();
  Files.copy(Paths.get(resource), dataFile.toPath());

  return DataFiles.builder(PartitionSpec.builderFor(schema).build())
      .withInputFile(org.apache.iceberg.Files.localInput(dataFile))
      .withRecordCount(25)
      .withFormat(FileFormat.PARQUET)
      .build();
}
Example #26
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0
private static ManifestFile writeManifest(
    List<Row> rows, int startIndex, int endIndex, Broadcast<FileIO> io,
    String location, int format, PartitionSpec spec, StructType sparkType) throws IOException {

  String manifestName = "optimized-m-" + UUID.randomUUID();
  Path manifestPath = new Path(location, manifestName);
  OutputFile outputFile = io.value().newOutputFile(FileFormat.AVRO.addExtension(manifestPath.toString()));

  Types.StructType dataFileType = DataFile.getType(spec.partitionType());
  SparkDataFile wrapper = new SparkDataFile(dataFileType, sparkType);

  ManifestWriter writer = ManifestFiles.write(format, spec, outputFile, null);

  try {
    for (int index = startIndex; index < endIndex; index++) {
      Row row = rows.get(index);
      long snapshotId = row.getLong(0);
      long sequenceNumber = row.getLong(1);
      Row file = row.getStruct(2);
      writer.existing(wrapper.wrap(file), snapshotId, sequenceNumber);
    }
  } finally {
    writer.close();
  }

  return writer.toManifestFile();
}
Example #27
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0
private void overwrite(WriterCommitMessage[] messages) {
  OverwriteFiles overwriteFiles = table.newOverwrite();
  overwriteFiles.overwriteByRowFilter(overwriteExpr);

  int numFiles = 0;
  for (DataFile file : files(messages)) {
    numFiles += 1;
    overwriteFiles.addFile(file);
  }

  commitOperation(overwriteFiles, numFiles, "overwrite by filter");
}
Example #28
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0
protected Iterable<DataFile> files(WriterCommitMessage[] messages) {
  if (messages.length > 0) {
    return Iterables.concat(Iterables.transform(Arrays.asList(messages), message -> message != null
        ? ImmutableList.copyOf(((TaskCommit) message).files())
        : ImmutableList.of()));
  }
  return ImmutableList.of();
}
Example #29
Source File: TestIcebergSerDe.java From dremio-oss with Apache License 2.0
@Test
public void testDataFileSerDe() throws Exception {
  File dataFile = new File(folder.getRoot(), "a.parquet");
  dataFile.createNewFile();

  PartitionSpec partitionSpec = PartitionSpec
      .builderFor(schema)
      .identity("i")
      .identity("data")
      .build();

  IcebergPartitionData icebergPartitionData = new IcebergPartitionData(partitionSpec.partitionType());
  icebergPartitionData.set(0, Integer.valueOf(10));
  icebergPartitionData.set(1, "def");

  DataFile d1 = DataFiles.builder(partitionSpec)
      .withInputFile(Files.localInput(dataFile))
      .withRecordCount(50)
      .withFormat(FileFormat.PARQUET)
      .withPartition(icebergPartitionData)
      .build();

  long d1RecordCount = d1.recordCount();
  byte[] dataFileBytes = IcebergSerDe.serializeDataFile(d1);
  DataFile d2 = IcebergSerDe.deserializeDataFile(dataFileBytes);
  long d2RecordCount = d2.recordCount();

  Assert.assertEquals(d1RecordCount, d2RecordCount);
  Assert.assertEquals((Integer) (d2.partition().get(0, Integer.class)), Integer.valueOf(10));
  Assert.assertEquals((String) (d2.partition().get(1, String.class)), "def");
}
Example #30
Source File: SparkTableUtil.java From iceberg with Apache License 2.0
private static List<DataFile> listAvroPartition(
    Map<String, String> partitionPath, String partitionUri, PartitionSpec spec, Configuration conf) {
  try {
    Path partition = new Path(partitionUri);
    FileSystem fs = partition.getFileSystem(conf);

    return Arrays.stream(fs.listStatus(partition, HIDDEN_PATH_FILTER))
        .filter(FileStatus::isFile)
        .map(stat -> {
          Metrics metrics = new Metrics(-1L, null, null, null);
          String partitionKey = spec.fields().stream()
              .map(PartitionField::name)
              .map(name -> String.format("%s=%s", name, partitionPath.get(name)))
              .collect(Collectors.joining("/"));

          return DataFiles.builder(spec)
              .withPath(stat.getPath().toString())
              .withFormat("avro")
              .withFileSizeInBytes(stat.getLen())
              .withMetrics(metrics)
              .withPartitionPath(partitionKey)
              .build();
        }).collect(Collectors.toList());
  } catch (IOException e) {
    throw SparkExceptionUtil.toUncheckedException(e, "Unable to list files in partition: %s", partitionUri);
  }
}