org.apache.iceberg.io.FileIO Java Examples
The following examples show how to use
org.apache.iceberg.io.FileIO.
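As a quick orientation before the examples, the sketch below shows the core of the FileIO contract: handing out InputFile and OutputFile handles and deleting paths, independent of the underlying storage. This is a minimal, illustrative sketch, assuming a HadoopFileIO backed by a default Hadoop Configuration; the FileIOSketch class name and the /tmp/iceberg-example path are hypothetical.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.hadoop.HadoopFileIO;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.io.PositionOutputStream;

public class FileIOSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical setup: a Hadoop-backed FileIO; tables normally expose theirs via table.io().
    FileIO io = new HadoopFileIO(new Configuration());
    String path = "/tmp/iceberg-example/data.txt";  // hypothetical path for illustration

    // Write through the abstraction rather than a concrete filesystem API.
    OutputFile out = io.newOutputFile(path);
    try (PositionOutputStream stream = out.createOrOverwrite()) {
      stream.write("hello".getBytes(StandardCharsets.UTF_8));
    }

    // Read the file back and inspect it.
    InputFile in = io.newInputFile(path);
    System.out.println(in.location() + " has length " + in.getLength());

    // Clean up.
    io.deleteFile(path);
  }
}

Most of the examples below obtain a FileIO the same way, either directly from a table via table.io() or wrapped in a Spark Broadcast so that executors can reuse it.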
Example #1
Source File: SparkTableUtil.java From iceberg with Apache License 2.0

private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
Example #2
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
Example #3
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0

SparkBatchWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                CaseInsensitiveStringMap options, boolean overwriteDynamic, boolean overwriteByFilter,
                Expression overwriteExpr, String applicationId, String wapId,
                Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.overwriteDynamic = overwriteDynamic;
  this.overwriteByFilter = overwriteByFilter;
  this.overwriteExpr = overwriteExpr;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.genieId = options.get("genie-id");
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
Example #4
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

private List<ManifestFile> writeManifestsForPartitionedTable(
    Dataset<Row> manifestEntryDF, int numManifests, int targetNumManifestEntries) {

  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough
  long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries);

  return withReusableDS(manifestEntryDF, df -> {
    Column partitionColumn = df.col("data_file.partition");
    return df.repartitionByRange(numManifests, partitionColumn)
        .sortWithinPartitions(partitionColumn)
        .mapPartitions(
            toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
            manifestEncoder
        )
        .collectAsList();
  });
}
Example #5
Source File: BaseSnapshot.java From iceberg with Apache License 2.0

BaseSnapshot(FileIO io,
             long sequenceNumber,
             long snapshotId,
             Long parentId,
             long timestampMillis,
             String operation,
             Map<String, String> summary,
             String manifestList) {
  this.io = io;
  this.sequenceNumber = sequenceNumber;
  this.snapshotId = snapshotId;
  this.parentId = parentId;
  this.timestampMillis = timestampMillis;
  this.operation = operation;
  this.summary = summary;
  this.manifestListLocation = manifestList;
}
Example #6
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);

  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());
  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
Example #7
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
Example #8
Source File: SparkBatchScan.java From iceberg with Apache License 2.0

ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
         String nameMappingString, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
         boolean caseSensitive, boolean localityPreferred) {
  this.task = task;
  this.tableSchemaString = tableSchemaString;
  this.expectedSchemaString = expectedSchemaString;
  this.nameMappingString = nameMappingString;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;
  if (localityPreferred) {
    this.preferredLocations = Util.blockLocations(io.value(), task);
  } else {
    this.preferredLocations = HadoopInputFile.NO_LOCATION_PREFERENCE;
  }
}
Example #9
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct, OutputMode mode,
                                       DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);

  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());
  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
Example #10
Source File: SparkWriteBuilder.java From iceberg with Apache License 2.0

@Override
public BatchWrite buildForBatch() {
  // Validate
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema,
      checkNullability(spark, options), checkOrdering(spark, options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Get application id
  String appId = spark.sparkContext().applicationId();

  // Get write-audit-publish id
  String wapId = spark.conf().get("spark.wap.id", null);

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new SparkBatchWrite(
      table, io, encryptionManager, options, overwriteDynamic, overwriteByFilter, overwriteExpr,
      appId, wapId, writeSchema, dsSchema);
}
Example #11
Source File: SparkWriteBuilder.java From iceberg with Apache License 2.0

@Override
public StreamingWrite buildForStreaming() {
  // Validate
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema,
      checkNullability(spark, options), checkOrdering(spark, options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Change to streaming write if it is just append
  Preconditions.checkState(!overwriteDynamic,
      "Unsupported streaming operation: dynamic partition overwrite");
  Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(),
      "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr);

  // Get application id
  String appId = spark.sparkContext().applicationId();

  // Get write-audit-publish id
  String wapId = spark.conf().get("spark.wap.id", null);

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new SparkStreamingWrite(
      table, io, encryptionManager, options, overwriteByFilter, writeQueryId, appId, wapId, writeSchema, dsSchema);
}
Example #12
Source File: Writer.java From iceberg with Apache License 2.0

Writer(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       DataSourceOptions options, boolean replacePartitions, String applicationId, String wapId,
       Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.replacePartitions = replacePartitions;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
Example #13
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

private List<ManifestFile> writeManifestsForUnpartitionedTable(Dataset<Row> manifestEntryDF, int numManifests) {
  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we rely only on the target number of manifests for unpartitioned tables
  // as we should not worry about having too much metadata per partition
  long maxNumManifestEntries = Long.MAX_VALUE;

  return manifestEntryDF
      .repartition(numManifests)
      .mapPartitions(
          toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
          manifestEncoder
      )
      .collectAsList();
}
Example #14
Source File: RowDataRewriter.java From iceberg with Apache License 2.0

public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
Example #15
Source File: DataFilesTable.java From iceberg with Apache License 2.0

ManifestReadTask(FileIO io, ManifestFile manifest, Schema schema, String schemaString,
                 String specString, ResidualEvaluator residuals) {
  super(DataFiles.fromManifest(manifest), schemaString, specString, residuals);
  this.io = io;
  this.manifest = manifest;
  this.schema = schema;
}
Example #16
Source File: SparkScanBuilder.java From iceberg with Apache License 2.0

@Override
public Scan build() {
  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryption = lazySparkContext().broadcast(table.encryption());
  return new SparkBatchScan(table, io, encryption, caseSensitive, lazySchema(), filterExpressions, options);
}
Example #17
Source File: ManifestEntriesTable.java From iceberg with Apache License 2.0

ManifestReadTask(FileIO io, ManifestFile manifest, Schema fileSchema, String schemaString,
                 String specString, ResidualEvaluator residuals) {
  super(DataFiles.fromManifest(manifest), schemaString, specString, residuals);
  this.fileSchema = fileSchema;
  this.io = io;
  this.manifest = manifest;
}
Example #18
Source File: SparkBatchScan.java From iceberg with Apache License 2.0

SparkBatchScan(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryption,
               boolean caseSensitive, Schema expectedSchema, List<Expression> filters,
               CaseInsensitiveStringMap options) {
  this.table = table;
  this.io = io;
  this.encryptionManager = encryption;
  this.caseSensitive = caseSensitive;
  this.expectedSchema = expectedSchema;
  this.filterExpressions = filters;
  this.snapshotId = Spark3Util.propertyAsLong(options, "snapshot-id", null);
  this.asOfTimestamp = Spark3Util.propertyAsLong(options, "as-of-timestamp", null);

  if (snapshotId != null && asOfTimestamp != null) {
    throw new IllegalArgumentException(
        "Cannot scan using both snapshot-id and as-of-timestamp to select the table snapshot");
  }

  this.startSnapshotId = Spark3Util.propertyAsLong(options, "start-snapshot-id", null);
  this.endSnapshotId = Spark3Util.propertyAsLong(options, "end-snapshot-id", null);
  if (snapshotId != null || asOfTimestamp != null) {
    if (startSnapshotId != null || endSnapshotId != null) {
      throw new IllegalArgumentException(
          "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either snapshot-id or " +
              "as-of-timestamp is specified");
    }
  } else if (startSnapshotId == null && endSnapshotId != null) {
    throw new IllegalArgumentException("Cannot only specify option end-snapshot-id to do incremental scan");
  }

  // look for split behavior overrides in options
  this.splitSize = Spark3Util.propertyAsLong(options, "split-size", null);
  this.splitLookback = Spark3Util.propertyAsInt(options, "lookback", null);
  this.splitOpenFileCost = Spark3Util.propertyAsLong(options, "file-open-cost", null);

  this.localityPreferred = Spark3Util.isLocalityEnabled(io.value(), table.location(), options);
  this.batchReadsEnabled = Spark3Util.isVectorizationEnabled(table.properties(), options);
  this.batchSize = Spark3Util.batchSize(table.properties(), options);
}
Example #19
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0

protected WriterFactory(PartitionSpec spec, FileFormat format, LocationProvider locations,
                        Map<String, String> properties, Broadcast<FileIO> io,
                        Broadcast<EncryptionManager> encryptionManager, long targetFileSize,
                        Schema writeSchema, StructType dsSchema) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.properties = properties;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.targetFileSize = targetFileSize;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;
}
Example #20
Source File: BaseSnapshot.java From iceberg with Apache License 2.0

/**
 * For testing only.
 */
BaseSnapshot(FileIO io, long snapshotId, String... manifestFiles) {
  this(io, snapshotId, null, System.currentTimeMillis(), null, null,
      Lists.transform(Arrays.asList(manifestFiles),
          path -> new GenericManifestFile(io.newInputFile(path), 0)));
}
Example #21
Source File: Spark3Util.java From iceberg with Apache License 2.0

public static boolean isLocalityEnabled(FileIO io, String location, CaseInsensitiveStringMap readOptions) {
  InputFile in = io.newInputFile(location);
  if (in instanceof HadoopInputFile) {
    String scheme = ((HadoopInputFile) in).getFileSystem().getScheme();
    return readOptions.getBoolean("locality", LOCALITY_WHITELIST_FS.contains(scheme));
  }
  return false;
}
Example #22
Source File: BaseDataReader.java From iceberg with Apache License 2.0

BaseDataReader(CombinedScanTask task, FileIO fileIo, EncryptionManager encryptionManager) {
  this.fileIo = fileIo;
  this.tasks = task.files().iterator();
  Iterable<InputFile> decryptedFiles = encryptionManager.decrypt(Iterables.transform(
      task.files(),
      fileScanTask ->
          EncryptedFiles.encryptedInput(
              this.fileIo.newInputFile(fileScanTask.file().path().toString()),
              fileScanTask.file().keyMetadata())));
  ImmutableMap.Builder<String, InputFile> inputFileBuilder = ImmutableMap.builder();
  decryptedFiles.forEach(decrypted -> inputFileBuilder.put(decrypted.location(), decrypted));
  this.inputFiles = inputFileBuilder.build();
  this.currentIterator = CloseableIterator.empty();
}
Example #23
Source File: Util.java From iceberg with Apache License 2.0

public static String[] blockLocations(FileIO io, CombinedScanTask task) {
  Set<String> locations = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    InputFile in = io.newInputFile(f.file().path().toString());
    if (in instanceof HadoopInputFile) {
      Collections.addAll(locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length()));
    }
  }

  return locations.toArray(HadoopInputFile.NO_LOCATION_PREFERENCE);
}
Example #24
Source File: HadoopTableOperations.java From iceberg with Apache License 2.0

@Override
public FileIO io() {
  if (defaultFileIo == null) {
    defaultFileIo = new HadoopFileIO(conf);
  }
  return defaultFileIo;
}
Example #25
Source File: SparkStreamingWrite.java From iceberg with Apache License 2.0

SparkStreamingWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                    CaseInsensitiveStringMap options, boolean truncateBatches, String queryId,
                    String applicationId, String wapId, Schema writeSchema, StructType dsSchema) {
  super(
      table, io, encryptionManager, options, false, truncateBatches, Expressions.alwaysTrue(),
      applicationId, wapId, writeSchema, dsSchema);
  this.truncateBatches = truncateBatches;
  this.queryId = queryId;
}
Example #26
Source File: BaseSnapshot.java From iceberg with Apache License 2.0

BaseSnapshot(FileIO io,
             long snapshotId,
             Long parentId,
             long timestampMillis,
             String operation,
             Map<String, String> summary,
             List<ManifestFile> dataManifests) {
  this(io, INITIAL_SEQUENCE_NUMBER, snapshotId, parentId, timestampMillis, operation, summary, null);
  this.allManifests = dataManifests;
}
Example #27
Source File: SparkUtil.java From iceberg with Apache License 2.0

public static FileIO serializableFileIO(Table table) {
  if (table.io() instanceof HadoopFileIO) {
    // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization
    SerializableConfiguration conf = new SerializableConfiguration(((HadoopFileIO) table.io()).conf());
    return new HadoopFileIO(conf::value);
  } else {
    return table.io();
  }
}
Example #28
Source File: BaseWriter.java From iceberg with Apache License 2.0

BaseWriter(PartitionSpec spec, FileFormat format, SparkAppenderFactory appenderFactory,
           OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
  this.spec = spec;
  this.format = format;
  this.appenderFactory = appenderFactory;
  this.fileFactory = fileFactory;
  this.io = io;
  this.targetFileSize = targetFileSize;
}
Example #29
Source File: RowDataReader.java From iceberg with Apache License 2.0

RowDataReader(
    CombinedScanTask task, Schema tableSchema, Schema expectedSchema, String nameMapping,
    FileIO fileIo, EncryptionManager encryptionManager, boolean caseSensitive) {
  super(task, fileIo, encryptionManager);
  this.tableSchema = tableSchema;
  this.expectedSchema = expectedSchema;
  this.nameMapping = nameMapping;
  this.caseSensitive = caseSensitive;
}
Example #30
Source File: HiveTableOperations.java From presto with Apache License 2.0

private HiveTableOperations(FileIO fileIo, HiveMetastore metastore, HiveIdentity identity,
                            String database, String table, Optional<String> owner, Optional<String> location) {
  this.fileIo = requireNonNull(fileIo, "fileIo is null");
  this.metastore = requireNonNull(metastore, "metastore is null");
  this.identity = requireNonNull(identity, "identity is null");
  this.database = requireNonNull(database, "database is null");
  this.tableName = requireNonNull(table, "table is null");
  this.owner = requireNonNull(owner, "owner is null");
  this.location = requireNonNull(location, "location is null");
}