org.apache.iceberg.io.InputFile Java Examples
The following examples show how to use org.apache.iceberg.io.InputFile.
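Before the examples, a minimal sketch of the InputFile contract itself may help: an InputFile names a location, reports its length, and opens a seekable stream over its contents. The FileIO instance and location string below are hypothetical placeholders, not taken from any example on this page.

static void inspect(FileIO io, String location) throws IOException {
  InputFile file = io.newInputFile(location);          // FileIO is the usual factory for InputFile
  System.out.println("location: " + file.location());  // the path this InputFile reads from
  System.out.println("length: " + file.getLength());   // total size in bytes
  try (SeekableInputStream stream = file.newStream()) {
    stream.seek(0);                                    // streams from newStream() support random access
    int firstByte = stream.read();
  }
}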
Example #1
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
Example #2
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
private org.apache.parquet.io.InputFile parquetInputFile(InputFile inFile) {
  return new org.apache.parquet.io.InputFile() {
    @Override
    public long getLength() throws IOException {
      return inFile.getLength();
    }

    @Override
    public org.apache.parquet.io.SeekableInputStream newStream() throws IOException {
      SeekableInputStream stream = inFile.newStream();
      return new DelegatingSeekableInputStream(stream) {
        @Override
        public long getPos() throws IOException {
          return stream.getPos();
        }

        @Override
        public void seek(long newPos) throws IOException {
          stream.seek(newPos);
        }
      };
    }
  };
}
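This adapter bridges Iceberg's InputFile to Parquet's own org.apache.parquet.io.InputFile so a ParquetFileReader can consume it directly. A short usage sketch, assuming the parquetInputFile helper above and a hypothetical local file (Example #16 shows the same pattern in context):

InputFile inFile = Files.localInput(new File("/tmp/data.parquet")); // hypothetical path
try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
  System.out.println("row groups: " + reader.getRowGroups().size());
}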
Example #3
Source File: TableMetadata.java From iceberg with Apache License 2.0
private List<MetadataLogEntry> addPreviousFile(
    InputFile previousFile, long timestampMillis, Map<String, String> updatedProperties) {
  if (previousFile == null) {
    return previousFiles;
  }

  int maxSize = Math.max(1, PropertyUtil.propertyAsInt(updatedProperties,
      TableProperties.METADATA_PREVIOUS_VERSIONS_MAX,
      TableProperties.METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT));

  List<MetadataLogEntry> newMetadataLog;
  if (previousFiles.size() >= maxSize) {
    int removeIndex = previousFiles.size() - maxSize + 1;
    newMetadataLog = Lists.newArrayList(previousFiles.subList(removeIndex, previousFiles.size()));
  } else {
    newMetadataLog = Lists.newArrayList(previousFiles);
  }
  newMetadataLog.add(new MetadataLogEntry(timestampMillis, previousFile.location()));

  return newMetadataLog;
}
Example #4
Source File: GenericManifestFile.java From iceberg with Apache License 2.0
GenericManifestFile(InputFile file, int specId) {
  this.avroSchema = AVRO_SCHEMA;
  this.file = file;
  this.manifestPath = file.location();
  this.length = null; // lazily loaded from file
  this.specId = specId;
  this.sequenceNumber = 0;
  this.minSequenceNumber = 0;
  this.snapshotId = null;
  this.addedFilesCount = null;
  this.addedRowsCount = null;
  this.existingFilesCount = null;
  this.existingRowsCount = null;
  this.deletedFilesCount = null;
  this.deletedRowsCount = null;
  this.partitions = null;
  this.fromProjectionPos = null;
}
Example #5
Source File: OrcIterable.java From iceberg with Apache License 2.0
private static VectorizedRowBatchIterator newOrcIterator(InputFile file,
                                                         TypeDescription readerSchema,
                                                         Long start,
                                                         Long length,
                                                         Reader orcFileReader,
                                                         SearchArgument sarg) {
  final Reader.Options options = orcFileReader.options();
  if (start != null) {
    options.range(start, length);
  }
  options.schema(readerSchema);
  options.searchArgument(sarg, new String[]{});

  try {
    return new VectorizedRowBatchIterator(file.location(), readerSchema,
        orcFileReader.rows(options));
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
  }
}
Example #6
Source File: TestMetrics.java From iceberg with Apache License 2.0
@Test
public void testMetricsForNullColumns() throws IOException {
  Schema schema = new Schema(
      optional(1, "intCol", IntegerType.get())
  );
  Record firstRecord = GenericRecord.create(schema);
  firstRecord.setField("intCol", null);
  Record secondRecord = GenericRecord.create(schema);
  secondRecord.setField("intCol", null);

  InputFile recordsFile = writeRecords(schema, firstRecord, secondRecord);

  Metrics metrics = getMetrics(recordsFile);
  Assert.assertEquals(2L, (long) metrics.recordCount());
  assertCounts(1, 2L, 2L, metrics);
  assertBounds(1, IntegerType.get(), null, null, metrics);
}
Example #7
Source File: TestManifestListVersions.java From iceberg with Apache License 2.0
@Test
public void testV1ForwardCompatibility() throws IOException {
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 1);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
Example #8
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newParquetIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .split(task.start(), task.length())
      .project(readSchema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive);

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
Example #9
Source File: TestManifestListVersions.java From iceberg with Apache License 2.0
@Test
public void testV2ForwardCompatibility() throws IOException {
  // v2 manifest list files can be read by v1 readers, but the sequence numbers and content will be ignored.
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 2);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
Example #10
Source File: ManifestLists.java From iceberg with Apache License 2.0
static List<ManifestFile> read(InputFile manifestList) {
  try (CloseableIterable<ManifestFile> files = Avro.read(manifestList)
      .rename("manifest_file", GenericManifestFile.class.getName())
      .rename("partitions", GenericPartitionFieldSummary.class.getName())
      .rename("r508", GenericPartitionFieldSummary.class.getName())
      .classLoader(GenericManifestFile.class.getClassLoader())
      .project(ManifestFile.schema())
      .reuseContainers(false)
      .build()) {
    return Lists.newLinkedList(files);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Cannot read manifest list file: %s", manifestList.location());
  }
}
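A usage sketch for the reader above. Note that ManifestLists.read is package-private, so this assumes caller code in the org.apache.iceberg package; the io and snapshot variables are hypothetical stand-ins for a FileIO instance and a snapshot whose manifest list location is known.

InputFile manifestList = io.newInputFile(snapshot.manifestListLocation()); // hypothetical FileIO and snapshot
List<ManifestFile> manifests = ManifestLists.read(manifestList);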
Example #11
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  // ORC does not support reuse containers yet
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      // TODO: implement value readers for Pig and Hive
      throw new UnsupportedOperationException("ORC support not yet supported for Pig and Hive");
    case GENERIC:
      orcReadBuilder.createReaderFunc(
          fileSchema -> GenericOrcReader.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(orcReadBuilder.build(), task.residual(), readSchema);
}
Example #12
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  if (reuseContainers) {
    parquetReadBuilder.reuseContainers();
  }
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      // TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Parquet support not yet supported for Pig and Hive");
    case GENERIC:
      parquetReadBuilder.createReaderFunc(
          fileSchema -> GenericParquetReaders.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(parquetReadBuilder.build(), task.residual(), readSchema);
}
Example #13
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> newAvroIterable(
    InputFile inputFile, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile)
      .project(readSchema)
      .split(task.start(), task.length());
  if (reuseContainers) {
    avroReadBuilder.reuseContainers();
  }
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      // TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive");
    case GENERIC:
      avroReadBuilder.createReaderFunc(
          (expIcebergSchema, expAvroSchema) ->
              DataReader.create(expIcebergSchema, expAvroSchema,
                  constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema);
}
Example #14
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }
  return iterable;
}
Example #15
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
public void createOrcInputFile(List<Record> records) throws IOException {
  if (ORC_FILE.exists()) {
    Assert.assertTrue(ORC_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(ORC_FILE);
  try (FileAppender<Record> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(ORC_FILE);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  ORC_FILE.deleteOnExit();
}
Example #16
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
public void createParquetInputFile(List<Record> records) throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
Example #17
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> open(FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
  CloseableIterable<InternalRow> iter;
  if (task.isDataTask()) {
    iter = newDataIterable(task.asDataTask(), readSchema);
  } else {
    InputFile location = getInputFile(task);
    Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
    switch (task.file().format()) {
      case PARQUET:
        iter = newParquetIterable(location, task, readSchema, idToConstant);
        break;
      case AVRO:
        iter = newAvroIterable(location, task, readSchema, idToConstant);
        break;
      case ORC:
        iter = newOrcIterable(location, task, readSchema, idToConstant);
        break;
      default:
        throw new UnsupportedOperationException(
            "Cannot read unknown format: " + task.file().format());
    }
  }
  return iter;
}
Example #18
Source File: Spark3Util.java From iceberg with Apache License 2.0
public static boolean isLocalityEnabled(FileIO io, String location, CaseInsensitiveStringMap readOptions) {
  InputFile in = io.newInputFile(location);
  if (in instanceof HadoopInputFile) {
    String scheme = ((HadoopInputFile) in).getFileSystem().getScheme();
    return readOptions.getBoolean("locality", LOCALITY_WHITELIST_FS.contains(scheme));
  }
  return false;
}
Example #19
Source File: DataFiles.java From iceberg with Apache License 2.0
public static DataFile fromInputFile(InputFile file, PartitionData partition, long rowCount) {
  if (file instanceof HadoopInputFile) {
    return fromStat(((HadoopInputFile) file).getStat(), partition, rowCount);
  }

  String location = file.location();
  FileFormat format = FileFormat.fromFileName(location);
  return new GenericDataFile(
      location, format, partition, rowCount, file.getLength());
}
Example #20
Source File: ReadConf.java From iceberg with Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
Example #21
Source File: ParquetReader.java From iceberg with Apache License 2.0
public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options,
                     Function<MessageType, ParquetValueReader<?>> readerFunc, NameMapping nameMapping,
                     Expression filter, boolean reuseContainers, boolean caseSensitive) {
  this.input = input;
  this.expectedSchema = expectedSchema;
  this.options = options;
  this.readerFunc = readerFunc;
  // replace alwaysTrue with null to avoid extra work evaluating a trivial filter
  this.filter = filter == Expressions.alwaysTrue() ? null : filter;
  this.reuseContainers = reuseContainers;
  this.caseSensitive = caseSensitive;
  this.nameMapping = nameMapping;
}
Example #22
Source File: BaseRewriteManifests.java From iceberg with Apache License 2.0
private ManifestFile copyManifest(ManifestFile manifest) {
  TableMetadata current = ops.current();
  InputFile toCopy = ops.io().newInputFile(manifest.path());
  OutputFile newFile = newManifestOutput();
  return ManifestFiles.copyRewriteManifest(
      current.formatVersion(), toCopy, specsById, newFile, snapshotId(), summaryBuilder);
}
Example #23
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}
Example #24
Source File: DataFiles.java From iceberg with Apache License 2.0
public static DataFile fromInputFile(InputFile file, long rowCount) {
  if (file instanceof HadoopInputFile) {
    return fromStat(((HadoopInputFile) file).getStat(), rowCount);
  }

  String location = file.location();
  FileFormat format = FileFormat.fromFileName(location);
  return new GenericDataFile(location, format, rowCount, file.getLength());
}
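A usage sketch for the overload above, assuming an existing FileIO named io, a known record count, and a placeholder location. The file format is inferred from the file name by FileFormat.fromFileName, so the extension matters.

InputFile written = io.newInputFile("/warehouse/db/table/data/00000-0.parquet"); // hypothetical location
DataFile dataFile = DataFiles.fromInputFile(written, recordCount);               // recordCount assumed known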
Example #25
Source File: TestManifestListVersions.java From iceberg with Apache License 2.0
private InputFile writeManifestList(ManifestFile manifest, int formatVersion) throws IOException {
  OutputFile manifestList = Files.localOutput(temp.newFile());
  try (FileAppender<ManifestFile> writer = ManifestLists.write(
      formatVersion, manifestList, SNAPSHOT_ID, SNAPSHOT_ID - 1, formatVersion > 1 ? SEQ_NUM : 0)) {
    writer.add(manifest);
  }
  return manifestList.toInputFile();
}
Example #26
Source File: ManifestFiles.java From iceberg with Apache License 2.0
static ManifestFile copyRewriteManifest(int formatVersion, InputFile toCopy, Map<Integer, PartitionSpec> specsById,
                                        OutputFile outputFile, long snapshotId,
                                        SnapshotSummary.Builder summaryBuilder) {
  // for a rewritten manifest all snapshot ids should be set. use empty metadata to throw an exception if it is not
  InheritableMetadata inheritableMetadata = InheritableMetadataFactory.empty();
  try (ManifestReader<DataFile> reader =
           new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) {
    return copyManifestInternal(
        formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.EXISTING);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location());
  }
}
Example #27
Source File: AvroIterable.java From iceberg with Apache License 2.0
AvroIterable(InputFile file, DatumReader<D> reader,
             Long start, Long length, boolean reuseContainers) {
  this.file = file;
  this.reader = reader;
  this.start = start;
  this.end = start != null ? start + length : null;
  this.reuseContainers = reuseContainers;
}
Example #28
Source File: TableMetadataParser.java From iceberg with Apache License 2.0
public static TableMetadata read(FileIO io, InputFile file) {
  Codec codec = Codec.fromFileName(file.location());
  try (InputStream is = codec == Codec.GZIP ? new GZIPInputStream(file.newStream()) : file.newStream()) {
    return fromJson(io, file, JsonUtil.mapper().readValue(is, JsonNode.class));
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read file: %s", file);
  }
}
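A usage sketch for reading table metadata, assuming an existing FileIO named io and a placeholder metadata path. Because the codec is derived from the file name, the same call handles both plain and gzip-compressed metadata files.

String metadataLocation = "/warehouse/db/table/metadata/v3.metadata.json"; // hypothetical path
TableMetadata metadata = TableMetadataParser.read(io, io.newInputFile(metadataLocation));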
Example #29
Source File: ManifestFiles.java From iceberg with Apache License 2.0
static ManifestFile copyAppendManifest(int formatVersion, InputFile toCopy, Map<Integer, PartitionSpec> specsById,
                                       OutputFile outputFile, long snapshotId,
                                       SnapshotSummary.Builder summaryBuilder) {
  // use metadata that will add the current snapshot's ID for the rewrite
  InheritableMetadata inheritableMetadata = InheritableMetadataFactory.forCopy(snapshotId);
  try (ManifestReader<DataFile> reader =
           new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) {
    return copyManifestInternal(
        formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.ADDED);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location());
  }
}
Example #30
Source File: StaticDataTask.java From iceberg with Apache License 2.0
private StaticDataTask(InputFile metadata, StructLike[] rows) {
  this.metadataFile = DataFiles.builder()
      .withInputFile(metadata)
      .withRecordCount(rows.length)
      .withFormat(FileFormat.METADATA)
      .build();
  this.rows = rows;
}