org.apache.iceberg.io.FileIO Java Examples
The following examples show how to use
org.apache.iceberg.io.FileIO.
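As a quick orientation before the examples, the sketch below shows the core of the FileIO contract: handing out InputFile and OutputFile handles and deleting paths, independent of the underlying storage. This is a minimal, illustrative sketch, assuming a HadoopFileIO backed by a default Hadoop Configuration; the FileIOSketch class name and the /tmp/iceberg-example path are hypothetical.

import java.nio.charset.StandardCharsets;
import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.hadoop.HadoopFileIO;
import org.apache.iceberg.io.FileIO;
import org.apache.iceberg.io.InputFile;
import org.apache.iceberg.io.OutputFile;
import org.apache.iceberg.io.PositionOutputStream;

public class FileIOSketch {
  public static void main(String[] args) throws Exception {
    // Hypothetical setup: a Hadoop-backed FileIO; tables normally expose theirs via table.io().
    FileIO io = new HadoopFileIO(new Configuration());
    String path = "/tmp/iceberg-example/data.txt";  // hypothetical path for illustration

    // Write through the abstraction rather than a concrete filesystem API.
    OutputFile out = io.newOutputFile(path);
    try (PositionOutputStream stream = out.createOrOverwrite()) {
      stream.write("hello".getBytes(StandardCharsets.UTF_8));
    }

    // Read the file back and inspect it.
    InputFile in = io.newInputFile(path);
    System.out.println(in.location() + " has length " + in.getLength());

    // Clean up.
    io.deleteFile(path);
  }
}

Most of the examples below obtain a FileIO the same way, either directly from a table via table.io() or wrapped in a Spark Broadcast so that executors can reuse it.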
Example #1
Source File: SparkTableUtil.java From iceberg with Apache License 2.0

private static Iterator<ManifestFile> buildManifest(SerializableConfiguration conf, PartitionSpec spec,
                                                    String basePath, Iterator<Tuple2<String, DataFile>> fileTuples) {
  if (fileTuples.hasNext()) {
    FileIO io = new HadoopFileIO(conf.get());
    TaskContext ctx = TaskContext.get();
    String suffix = String.format("stage-%d-task-%d-manifest", ctx.stageId(), ctx.taskAttemptId());
    Path location = new Path(basePath, suffix);
    String outputPath = FileFormat.AVRO.addExtension(location.toString());
    OutputFile outputFile = io.newOutputFile(outputPath);
    ManifestWriter<DataFile> writer = ManifestFiles.write(spec, outputFile);

    try (ManifestWriter<DataFile> writerRef = writer) {
      fileTuples.forEachRemaining(fileTuple -> writerRef.add(fileTuple._2));
    } catch (IOException e) {
      throw SparkExceptionUtil.toUncheckedException(e, "Unable to close the manifest writer: %s", outputPath);
    }

    ManifestFile manifestFile = writer.toManifestFile();
    return ImmutableList.of(manifestFile).iterator();
  } else {
    return Collections.emptyIterator();
  }
}
Example #2
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);
  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
Example #3
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0

SparkBatchWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                CaseInsensitiveStringMap options, boolean overwriteDynamic, boolean overwriteByFilter,
                Expression overwriteExpr, String applicationId, String wapId,
                Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.overwriteDynamic = overwriteDynamic;
  this.overwriteByFilter = overwriteByFilter;
  this.overwriteExpr = overwriteExpr;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.genieId = options.get("genie-id");
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
Example #4
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

private List<ManifestFile> writeManifestsForPartitionedTable(
    Dataset<Row> manifestEntryDF, int numManifests, int targetNumManifestEntries) {

  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we allow the actual size of manifests to be 10% higher if the estimation is not precise enough
  long maxNumManifestEntries = (long) (1.1 * targetNumManifestEntries);

  return withReusableDS(manifestEntryDF, df -> {
    Column partitionColumn = df.col("data_file.partition");
    return df.repartitionByRange(numManifests, partitionColumn)
        .sortWithinPartitions(partitionColumn)
        .mapPartitions(
            toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
            manifestEncoder
        )
        .collectAsList();
  });
}
Example #5
Source File: BaseSnapshot.java From iceberg with Apache License 2.0

BaseSnapshot(FileIO io,
             long sequenceNumber,
             long snapshotId,
             Long parentId,
             long timestampMillis,
             String operation,
             Map<String, String> summary,
             String manifestList) {
  this.io = io;
  this.sequenceNumber = sequenceNumber;
  this.snapshotId = snapshotId;
  this.parentId = parentId;
  this.timestampMillis = timestampMillis;
  this.operation = operation;
  this.summary = summary;
  this.manifestListLocation = manifestList;
}
Example #6
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public Optional<DataSourceWriter> createWriter(String jobId, StructType dsStruct, SaveMode mode,
                                               DataSourceOptions options) {
  Preconditions.checkArgument(mode == SaveMode.Append || mode == SaveMode.Overwrite,
      "Save mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);

  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  String appId = lazySparkSession().sparkContext().applicationId();
  String wapId = lazySparkSession().conf().get("spark.wap.id", null);
  boolean replacePartitions = mode == SaveMode.Overwrite;

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());
  return Optional.of(new Writer(
      table, io, encryptionManager, options, replacePartitions, appId, wapId, writeSchema, dsStruct));
}
Example #7
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

private static MapPartitionsFunction<Row, ManifestFile> toManifests(
    Broadcast<FileIO> io, long maxNumManifestEntries, String location,
    int format, PartitionSpec spec, StructType sparkType) {

  return (MapPartitionsFunction<Row, ManifestFile>) rows -> {
    List<Row> rowsAsList = Lists.newArrayList(rows);

    if (rowsAsList.isEmpty()) {
      return Collections.emptyIterator();
    }

    List<ManifestFile> manifests = Lists.newArrayList();
    if (rowsAsList.size() <= maxNumManifestEntries) {
      manifests.add(writeManifest(rowsAsList, 0, rowsAsList.size(), io, location, format, spec, sparkType));
    } else {
      int midIndex = rowsAsList.size() / 2;
      manifests.add(writeManifest(rowsAsList, 0, midIndex, io, location, format, spec, sparkType));
      manifests.add(writeManifest(rowsAsList, midIndex, rowsAsList.size(), io, location, format, spec, sparkType));
    }

    return manifests.iterator();
  };
}
Example #8
Source File: SparkBatchScan.java From iceberg with Apache License 2.0

ReadTask(CombinedScanTask task, String tableSchemaString, String expectedSchemaString,
         String nameMappingString, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
         boolean caseSensitive, boolean localityPreferred) {
  this.task = task;
  this.tableSchemaString = tableSchemaString;
  this.expectedSchemaString = expectedSchemaString;
  this.nameMappingString = nameMappingString;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;
  if (localityPreferred) {
    this.preferredLocations = Util.blockLocations(io.value(), task);
  } else {
    this.preferredLocations = HadoopInputFile.NO_LOCATION_PREFERENCE;
  }
}
Example #9
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public StreamWriter createStreamWriter(String runId, StructType dsStruct, OutputMode mode,
                                       DataSourceOptions options) {
  Preconditions.checkArgument(
      mode == OutputMode.Append() || mode == OutputMode.Complete(),
      "Output mode %s is not supported", mode);
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);

  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsStruct);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema, checkNullability(options), checkOrdering(options));
  SparkUtil.validatePartitionTransforms(table.spec());
  // Spark 2.4.x passes runId to createStreamWriter instead of real queryId,
  // so we fetch it directly from sparkContext to make writes idempotent
  String queryId = lazySparkSession().sparkContext().getLocalProperty(StreamExecution.QUERY_ID_KEY());
  String appId = lazySparkSession().sparkContext().applicationId();

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());
  return new StreamingWriter(table, io, encryptionManager, options, queryId, mode, appId, writeSchema, dsStruct);
}
Example #10
Source File: SparkWriteBuilder.java From iceberg with Apache License 2.0

@Override
public BatchWrite buildForBatch() {
  // Validate
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema,
      checkNullability(spark, options), checkOrdering(spark, options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Get application id
  String appId = spark.sparkContext().applicationId();

  // Get write-audit-publish id
  String wapId = spark.conf().get("spark.wap.id", null);

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new SparkBatchWrite(
      table, io, encryptionManager, options, overwriteDynamic, overwriteByFilter, overwriteExpr,
      appId, wapId, writeSchema, dsSchema);
}
Example #11
Source File: SparkWriteBuilder.java From iceberg with Apache License 2.0

@Override
public StreamingWrite buildForStreaming() {
  // Validate
  Schema writeSchema = SparkSchemaUtil.convert(table.schema(), dsSchema);
  TypeUtil.validateWriteSchema(table.schema(), writeSchema,
      checkNullability(spark, options), checkOrdering(spark, options));
  SparkUtil.validatePartitionTransforms(table.spec());

  // Change to streaming write if it is just append
  Preconditions.checkState(!overwriteDynamic,
      "Unsupported streaming operation: dynamic partition overwrite");
  Preconditions.checkState(!overwriteByFilter || overwriteExpr == Expressions.alwaysTrue(),
      "Unsupported streaming operation: overwrite by filter: %s", overwriteExpr);

  // Get application id
  String appId = spark.sparkContext().applicationId();

  // Get write-audit-publish id
  String wapId = spark.conf().get("spark.wap.id", null);

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  return new SparkStreamingWrite(
      table, io, encryptionManager, options, overwriteByFilter, writeQueryId, appId, wapId, writeSchema, dsSchema);
}
Example #12
Source File: Writer.java From iceberg with Apache License 2.0

Writer(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
       DataSourceOptions options, boolean replacePartitions, String applicationId, String wapId,
       Schema writeSchema, StructType dsSchema) {
  this.table = table;
  this.format = getFileFormat(table.properties(), options);
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.replacePartitions = replacePartitions;
  this.applicationId = applicationId;
  this.wapId = wapId;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;

  long tableTargetFileSize = PropertyUtil.propertyAsLong(
      table.properties(), WRITE_TARGET_FILE_SIZE_BYTES, WRITE_TARGET_FILE_SIZE_BYTES_DEFAULT);
  this.targetFileSize = options.getLong("target-file-size-bytes", tableTargetFileSize);
}
Example #13
Source File: RewriteManifestsAction.java From iceberg with Apache License 2.0

private List<ManifestFile> writeManifestsForUnpartitionedTable(Dataset<Row> manifestEntryDF, int numManifests) {
  Broadcast<FileIO> io = sparkContext.broadcast(fileIO);
  StructType sparkType = (StructType) manifestEntryDF.schema().apply("data_file").dataType();

  // we rely only on the target number of manifests for unpartitioned tables
  // as we should not worry about having too much metadata per partition
  long maxNumManifestEntries = Long.MAX_VALUE;

  return manifestEntryDF
      .repartition(numManifests)
      .mapPartitions(
          toManifests(io, maxNumManifestEntries, stagingLocation, formatVersion, spec, sparkType),
          manifestEncoder
      )
      .collectAsList();
}
Example #14
Source File: RowDataRewriter.java From iceberg with Apache License 2.0

public RowDataRewriter(Table table, PartitionSpec spec, boolean caseSensitive,
                       Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager) {
  this.schema = table.schema();
  this.spec = spec;
  this.locations = table.locationProvider();
  this.properties = table.properties();
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.caseSensitive = caseSensitive;
  this.nameMapping = table.properties().get(DEFAULT_NAME_MAPPING);

  String formatString = table.properties().getOrDefault(
      TableProperties.DEFAULT_FILE_FORMAT, TableProperties.DEFAULT_FILE_FORMAT_DEFAULT);
  this.format = FileFormat.valueOf(formatString.toUpperCase(Locale.ENGLISH));
}
Example #15
Source File: DataFilesTable.java From iceberg with Apache License 2.0

ManifestReadTask(FileIO io, ManifestFile manifest, Schema schema, String schemaString,
                 String specString, ResidualEvaluator residuals) {
  super(DataFiles.fromManifest(manifest), schemaString, specString, residuals);
  this.io = io;
  this.manifest = manifest;
  this.schema = schema;
}
Example #16
Source File: SparkScanBuilder.java From iceberg with Apache License 2.0

@Override
public Scan build() {
  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryption = lazySparkContext().broadcast(table.encryption());
  return new SparkBatchScan(table, io, encryption, caseSensitive, lazySchema(), filterExpressions, options);
}
Example #17
Source File: ManifestEntriesTable.java From iceberg with Apache License 2.0

ManifestReadTask(FileIO io, ManifestFile manifest, Schema fileSchema, String schemaString,
                 String specString, ResidualEvaluator residuals) {
  super(DataFiles.fromManifest(manifest), schemaString, specString, residuals);
  this.fileSchema = fileSchema;
  this.io = io;
  this.manifest = manifest;
}
Example #18
Source File: SparkBatchScan.java From iceberg with Apache License 2.0

SparkBatchScan(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryption,
               boolean caseSensitive, Schema expectedSchema, List<Expression> filters,
               CaseInsensitiveStringMap options) {
  this.table = table;
  this.io = io;
  this.encryptionManager = encryption;
  this.caseSensitive = caseSensitive;
  this.expectedSchema = expectedSchema;
  this.filterExpressions = filters;
  this.snapshotId = Spark3Util.propertyAsLong(options, "snapshot-id", null);
  this.asOfTimestamp = Spark3Util.propertyAsLong(options, "as-of-timestamp", null);

  if (snapshotId != null && asOfTimestamp != null) {
    throw new IllegalArgumentException(
        "Cannot scan using both snapshot-id and as-of-timestamp to select the table snapshot");
  }

  this.startSnapshotId = Spark3Util.propertyAsLong(options, "start-snapshot-id", null);
  this.endSnapshotId = Spark3Util.propertyAsLong(options, "end-snapshot-id", null);
  if (snapshotId != null || asOfTimestamp != null) {
    if (startSnapshotId != null || endSnapshotId != null) {
      throw new IllegalArgumentException(
          "Cannot specify start-snapshot-id and end-snapshot-id to do incremental scan when either snapshot-id or " +
              "as-of-timestamp is specified");
    }
  } else if (startSnapshotId == null && endSnapshotId != null) {
    throw new IllegalArgumentException("Cannot only specify option end-snapshot-id to do incremental scan");
  }

  // look for split behavior overrides in options
  this.splitSize = Spark3Util.propertyAsLong(options, "split-size", null);
  this.splitLookback = Spark3Util.propertyAsInt(options, "lookback", null);
  this.splitOpenFileCost = Spark3Util.propertyAsLong(options, "file-open-cost", null);

  this.localityPreferred = Spark3Util.isLocalityEnabled(io.value(), table.location(), options);
  this.batchReadsEnabled = Spark3Util.isVectorizationEnabled(table.properties(), options);
  this.batchSize = Spark3Util.batchSize(table.properties(), options);
}
Example #19
Source File: SparkBatchWrite.java From iceberg with Apache License 2.0

protected WriterFactory(PartitionSpec spec, FileFormat format, LocationProvider locations,
                        Map<String, String> properties, Broadcast<FileIO> io,
                        Broadcast<EncryptionManager> encryptionManager, long targetFileSize,
                        Schema writeSchema, StructType dsSchema) {
  this.spec = spec;
  this.format = format;
  this.locations = locations;
  this.properties = properties;
  this.io = io;
  this.encryptionManager = encryptionManager;
  this.targetFileSize = targetFileSize;
  this.writeSchema = writeSchema;
  this.dsSchema = dsSchema;
}
Example #20
Source File: BaseSnapshot.java From iceberg with Apache License 2.0

/**
 * For testing only.
 */
BaseSnapshot(FileIO io, long snapshotId, String... manifestFiles) {
  this(io, snapshotId, null, System.currentTimeMillis(), null, null,
      Lists.transform(Arrays.asList(manifestFiles),
          path -> new GenericManifestFile(io.newInputFile(path), 0)));
}
Example #21
Source File: Spark3Util.java From iceberg with Apache License 2.0

public static boolean isLocalityEnabled(FileIO io, String location, CaseInsensitiveStringMap readOptions) {
  InputFile in = io.newInputFile(location);
  if (in instanceof HadoopInputFile) {
    String scheme = ((HadoopInputFile) in).getFileSystem().getScheme();
    return readOptions.getBoolean("locality", LOCALITY_WHITELIST_FS.contains(scheme));
  }
  return false;
}
Example #22
Source File: BaseDataReader.java From iceberg with Apache License 2.0

BaseDataReader(CombinedScanTask task, FileIO fileIo, EncryptionManager encryptionManager) {
  this.fileIo = fileIo;
  this.tasks = task.files().iterator();
  Iterable<InputFile> decryptedFiles = encryptionManager.decrypt(Iterables.transform(
      task.files(),
      fileScanTask ->
          EncryptedFiles.encryptedInput(
              this.fileIo.newInputFile(fileScanTask.file().path().toString()),
              fileScanTask.file().keyMetadata())));
  ImmutableMap.Builder<String, InputFile> inputFileBuilder = ImmutableMap.builder();
  decryptedFiles.forEach(decrypted -> inputFileBuilder.put(decrypted.location(), decrypted));
  this.inputFiles = inputFileBuilder.build();
  this.currentIterator = CloseableIterator.empty();
}
Example #23
Source File: Util.java From iceberg with Apache License 2.0

public static String[] blockLocations(FileIO io, CombinedScanTask task) {
  Set<String> locations = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    InputFile in = io.newInputFile(f.file().path().toString());
    if (in instanceof HadoopInputFile) {
      Collections.addAll(locations, ((HadoopInputFile) in).getBlockLocations(f.start(), f.length()));
    }
  }

  return locations.toArray(HadoopInputFile.NO_LOCATION_PREFERENCE);
}
Example #24
Source File: HadoopTableOperations.java From iceberg with Apache License 2.0

@Override
public FileIO io() {
  if (defaultFileIo == null) {
    defaultFileIo = new HadoopFileIO(conf);
  }
  return defaultFileIo;
}
Example #25
Source File: SparkStreamingWrite.java From iceberg with Apache License 2.0

SparkStreamingWrite(Table table, Broadcast<FileIO> io, Broadcast<EncryptionManager> encryptionManager,
                    CaseInsensitiveStringMap options, boolean truncateBatches, String queryId,
                    String applicationId, String wapId, Schema writeSchema, StructType dsSchema) {
  super(
      table, io, encryptionManager, options, false, truncateBatches, Expressions.alwaysTrue(),
      applicationId, wapId, writeSchema, dsSchema);
  this.truncateBatches = truncateBatches;
  this.queryId = queryId;
}
Example #26
Source File: BaseSnapshot.java From iceberg with Apache License 2.0

BaseSnapshot(FileIO io,
             long snapshotId,
             Long parentId,
             long timestampMillis,
             String operation,
             Map<String, String> summary,
             List<ManifestFile> dataManifests) {
  this(io, INITIAL_SEQUENCE_NUMBER, snapshotId, parentId, timestampMillis, operation, summary, null);
  this.allManifests = dataManifests;
}
Example #27
Source File: SparkUtil.java From iceberg with Apache License 2.0

public static FileIO serializableFileIO(Table table) {
  if (table.io() instanceof HadoopFileIO) {
    // we need to use Spark's SerializableConfiguration to avoid issues with Kryo serialization
    SerializableConfiguration conf = new SerializableConfiguration(((HadoopFileIO) table.io()).conf());
    return new HadoopFileIO(conf::value);
  } else {
    return table.io();
  }
}
Example #28
Source File: BaseWriter.java From iceberg with Apache License 2.0

BaseWriter(PartitionSpec spec, FileFormat format, SparkAppenderFactory appenderFactory,
           OutputFileFactory fileFactory, FileIO io, long targetFileSize) {
  this.spec = spec;
  this.format = format;
  this.appenderFactory = appenderFactory;
  this.fileFactory = fileFactory;
  this.io = io;
  this.targetFileSize = targetFileSize;
}
Example #29
Source File: RowDataReader.java From iceberg with Apache License 2.0

RowDataReader(
    CombinedScanTask task, Schema tableSchema, Schema expectedSchema, String nameMapping,
    FileIO fileIo, EncryptionManager encryptionManager, boolean caseSensitive) {
  super(task, fileIo, encryptionManager);
  this.tableSchema = tableSchema;
  this.expectedSchema = expectedSchema;
  this.nameMapping = nameMapping;
  this.caseSensitive = caseSensitive;
}
Example #30
Source File: HiveTableOperations.java From presto with Apache License 2.0

private HiveTableOperations(FileIO fileIo, HiveMetastore metastore, HiveIdentity identity,
                            String database, String table, Optional<String> owner, Optional<String> location) {
  this.fileIo = requireNonNull(fileIo, "fileIo is null");
  this.metastore = requireNonNull(metastore, "metastore is null");
  this.identity = requireNonNull(identity, "identity is null");
  this.database = requireNonNull(database, "database is null");
  this.tableName = requireNonNull(table, "table is null");
  this.owner = requireNonNull(owner, "owner is null");
  this.location = requireNonNull(location, "location is null");
}