org.apache.iceberg.io.InputFile Java Examples
The following examples show how to use org.apache.iceberg.io.InputFile.
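Before the examples, a minimal sketch of the InputFile contract itself may help: an InputFile names a location, reports its length, and opens a seekable stream over its contents. The FileIO instance and location string below are hypothetical placeholders, not taken from any example on this page.

static void inspect(FileIO io, String location) throws IOException {
  InputFile file = io.newInputFile(location);          // FileIO is the usual factory for InputFile
  System.out.println("location: " + file.location());  // the path this InputFile reads from
  System.out.println("length: " + file.getLength());   // total size in bytes
  try (SeekableInputStream stream = file.newStream()) {
    stream.seek(0);                                    // streams from newStream() support random access
    int firstByte = stream.read();
  }
}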
Example #1
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
Example #2
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
private org.apache.parquet.io.InputFile parquetInputFile(InputFile inFile) {
  return new org.apache.parquet.io.InputFile() {
    @Override
    public long getLength() throws IOException {
      return inFile.getLength();
    }

    @Override
    public org.apache.parquet.io.SeekableInputStream newStream() throws IOException {
      SeekableInputStream stream = inFile.newStream();
      return new DelegatingSeekableInputStream(stream) {
        @Override
        public long getPos() throws IOException {
          return stream.getPos();
        }

        @Override
        public void seek(long newPos) throws IOException {
          stream.seek(newPos);
        }
      };
    }
  };
}
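This adapter bridges Iceberg's InputFile to Parquet's own org.apache.parquet.io.InputFile so a ParquetFileReader can consume it directly. A short usage sketch, assuming the parquetInputFile helper above and a hypothetical local file (Example #16 shows the same pattern in context):

InputFile inFile = Files.localInput(new File("/tmp/data.parquet")); // hypothetical path
try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
  System.out.println("row groups: " + reader.getRowGroups().size());
}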
Example #3
Source File: TableMetadata.java From iceberg with Apache License 2.0
private List<MetadataLogEntry> addPreviousFile(
    InputFile previousFile, long timestampMillis, Map<String, String> updatedProperties) {
  if (previousFile == null) {
    return previousFiles;
  }

  int maxSize = Math.max(1, PropertyUtil.propertyAsInt(updatedProperties,
      TableProperties.METADATA_PREVIOUS_VERSIONS_MAX,
      TableProperties.METADATA_PREVIOUS_VERSIONS_MAX_DEFAULT));

  List<MetadataLogEntry> newMetadataLog;
  if (previousFiles.size() >= maxSize) {
    int removeIndex = previousFiles.size() - maxSize + 1;
    newMetadataLog = Lists.newArrayList(previousFiles.subList(removeIndex, previousFiles.size()));
  } else {
    newMetadataLog = Lists.newArrayList(previousFiles);
  }
  newMetadataLog.add(new MetadataLogEntry(timestampMillis, previousFile.location()));

  return newMetadataLog;
}
Example #4
Source File: GenericManifestFile.java From iceberg with Apache License 2.0
GenericManifestFile(InputFile file, int specId) {
  this.avroSchema = AVRO_SCHEMA;
  this.file = file;
  this.manifestPath = file.location();
  this.length = null; // lazily loaded from file
  this.specId = specId;
  this.sequenceNumber = 0;
  this.minSequenceNumber = 0;
  this.snapshotId = null;
  this.addedFilesCount = null;
  this.addedRowsCount = null;
  this.existingFilesCount = null;
  this.existingRowsCount = null;
  this.deletedFilesCount = null;
  this.deletedRowsCount = null;
  this.partitions = null;
  this.fromProjectionPos = null;
}
Example #5
Source File: OrcIterable.java From iceberg with Apache License 2.0
private static VectorizedRowBatchIterator newOrcIterator(InputFile file,
                                                         TypeDescription readerSchema,
                                                         Long start,
                                                         Long length,
                                                         Reader orcFileReader,
                                                         SearchArgument sarg) {
  final Reader.Options options = orcFileReader.options();
  if (start != null) {
    options.range(start, length);
  }
  options.schema(readerSchema);
  options.searchArgument(sarg, new String[]{});

  try {
    return new VectorizedRowBatchIterator(file.location(), readerSchema,
        orcFileReader.rows(options));
  } catch (IOException ioe) {
    throw new RuntimeIOException(ioe, "Failed to get ORC rows for file: %s", file);
  }
}
Example #6
Source File: TestMetrics.java From iceberg with Apache License 2.0
@Test
public void testMetricsForNullColumns() throws IOException {
  Schema schema = new Schema(
      optional(1, "intCol", IntegerType.get())
  );
  Record firstRecord = GenericRecord.create(schema);
  firstRecord.setField("intCol", null);
  Record secondRecord = GenericRecord.create(schema);
  secondRecord.setField("intCol", null);

  InputFile recordsFile = writeRecords(schema, firstRecord, secondRecord);

  Metrics metrics = getMetrics(recordsFile);
  Assert.assertEquals(2L, (long) metrics.recordCount());
  assertCounts(1, 2L, 2L, metrics);
  assertBounds(1, IntegerType.get(), null, null, metrics);
}
Example #7
Source File: TestManifestListVersions.java From iceberg with Apache License 2.0
@Test
public void testV1ForwardCompatibility() throws IOException {
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 1);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
Example #8
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newParquetIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .split(task.start(), task.length())
      .project(readSchema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive);

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
Example #9
Source File: TestManifestListVersions.java From iceberg with Apache License 2.0
@Test
public void testV2ForwardCompatibility() throws IOException {
  // v2 manifest list files can be read by v1 readers, but the sequence numbers and content will be ignored.
  InputFile manifestList = writeManifestList(TEST_MANIFEST, 2);
  GenericData.Record generic = readGeneric(manifestList, V1Metadata.MANIFEST_LIST_SCHEMA);

  // v1 metadata should match even though order changed
  Assert.assertEquals("Path", PATH, generic.get("manifest_path").toString());
  Assert.assertEquals("Length", LENGTH, generic.get("manifest_length"));
  Assert.assertEquals("Spec id", SPEC_ID, generic.get("partition_spec_id"));
  Assert.assertEquals("Snapshot id", SNAPSHOT_ID, (long) generic.get("added_snapshot_id"));
  Assert.assertEquals("Added files count", ADDED_FILES, (int) generic.get("added_data_files_count"));
  Assert.assertEquals("Existing files count", EXISTING_FILES, (int) generic.get("existing_data_files_count"));
  Assert.assertEquals("Deleted files count", DELETED_FILES, (int) generic.get("deleted_data_files_count"));
  Assert.assertEquals("Added rows count", ADDED_ROWS, (long) generic.get("added_rows_count"));
  Assert.assertEquals("Existing rows count", EXISTING_ROWS, (long) generic.get("existing_rows_count"));
  Assert.assertEquals("Deleted rows count", DELETED_ROWS, (long) generic.get("deleted_rows_count"));
  Assert.assertNull("Content", generic.get(ManifestFile.MANIFEST_CONTENT.name()));
  Assert.assertNull("Sequence number", generic.get(ManifestFile.SEQUENCE_NUMBER.name()));
  Assert.assertNull("Min sequence number", generic.get(ManifestFile.MIN_SEQUENCE_NUMBER.name()));
}
Example #10
Source File: ManifestLists.java From iceberg with Apache License 2.0
static List<ManifestFile> read(InputFile manifestList) {
  try (CloseableIterable<ManifestFile> files = Avro.read(manifestList)
      .rename("manifest_file", GenericManifestFile.class.getName())
      .rename("partitions", GenericPartitionFieldSummary.class.getName())
      .rename("r508", GenericPartitionFieldSummary.class.getName())
      .classLoader(GenericManifestFile.class.getClassLoader())
      .project(ManifestFile.schema())
      .reuseContainers(false)
      .build()) {
    return Lists.newLinkedList(files);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Cannot read manifest list file: %s", manifestList.location());
  }
}
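A usage sketch for the reader above. Note that ManifestLists.read is package-private, so this assumes caller code in the org.apache.iceberg package; the io and snapshot variables are hypothetical stand-ins for a FileIO instance and a snapshot whose manifest list location is known.

InputFile manifestList = io.newInputFile(snapshot.manifestListLocation()); // hypothetical FileIO and snapshot
List<ManifestFile> manifests = ManifestLists.read(manifestList);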
Example #11
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  // ORC does not support reuse containers yet
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      // TODO: implement value readers for Pig and Hive
      throw new UnsupportedOperationException("ORC support not yet supported for Pig and Hive");
    case GENERIC:
      orcReadBuilder.createReaderFunc(
          fileSchema -> GenericOrcReader.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(orcReadBuilder.build(), task.residual(), readSchema);
}
Example #12
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  if (reuseContainers) {
    parquetReadBuilder.reuseContainers();
  }
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      // TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Parquet support not yet supported for Pig and Hive");
    case GENERIC:
      parquetReadBuilder.createReaderFunc(
          fileSchema -> GenericParquetReaders.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(parquetReadBuilder.build(), task.residual(), readSchema);
}
Example #13
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> newAvroIterable(
    InputFile inputFile, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile)
      .project(readSchema)
      .split(task.start(), task.length());
  if (reuseContainers) {
    avroReadBuilder.reuseContainers();
  }
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      // TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive");
    case GENERIC:
      avroReadBuilder.createReaderFunc(
          (expIcebergSchema, expAvroSchema) ->
              DataReader.create(expIcebergSchema, expAvroSchema,
                  constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema);
}
Example #14
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }
  return iterable;
}
Example #15
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
public void createOrcInputFile(List<Record> records) throws IOException {
  if (ORC_FILE.exists()) {
    Assert.assertTrue(ORC_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(ORC_FILE);
  try (FileAppender<Record> appender = ORC.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericOrcWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(ORC_FILE);
  try (Reader reader = OrcFile.createReader(new Path(inFile.location()),
      OrcFile.readerOptions(new Configuration()))) {
    Assert.assertEquals("Should create only one stripe", 1, reader.getStripes().size());
  }

  ORC_FILE.deleteOnExit();
}
Example #16
Source File: TestMetricsRowGroupFilterTypes.java From iceberg with Apache License 2.0
public void createParquetInputFile(List<Record> records) throws IOException {
  if (PARQUET_FILE.exists()) {
    Assert.assertTrue(PARQUET_FILE.delete());
  }

  OutputFile outFile = Files.localOutput(PARQUET_FILE);
  try (FileAppender<Record> appender = Parquet.write(outFile)
      .schema(FILE_SCHEMA)
      .createWriterFunc(GenericParquetWriter::buildWriter)
      .build()) {
    appender.addAll(records);
  }

  InputFile inFile = Files.localInput(PARQUET_FILE);
  try (ParquetFileReader reader = ParquetFileReader.open(parquetInputFile(inFile))) {
    Assert.assertEquals("Should create only one row group", 1, reader.getRowGroups().size());
    rowGroupMetadata = reader.getRowGroups().get(0);
    parquetSchema = reader.getFileMetaData().getSchema();
  }

  PARQUET_FILE.deleteOnExit();
}
Example #17
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> open(FileScanTask task, Schema readSchema, Map<Integer, ?> idToConstant) {
  CloseableIterable<InternalRow> iter;
  if (task.isDataTask()) {
    iter = newDataIterable(task.asDataTask(), readSchema);
  } else {
    InputFile location = getInputFile(task);
    Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
    switch (task.file().format()) {
      case PARQUET:
        iter = newParquetIterable(location, task, readSchema, idToConstant);
        break;
      case AVRO:
        iter = newAvroIterable(location, task, readSchema, idToConstant);
        break;
      case ORC:
        iter = newOrcIterable(location, task, readSchema, idToConstant);
        break;
      default:
        throw new UnsupportedOperationException(
            "Cannot read unknown format: " + task.file().format());
    }
  }
  return iter;
}
Example #18
Source File: Spark3Util.java From iceberg with Apache License 2.0
public static boolean isLocalityEnabled(FileIO io, String location, CaseInsensitiveStringMap readOptions) {
  InputFile in = io.newInputFile(location);
  if (in instanceof HadoopInputFile) {
    String scheme = ((HadoopInputFile) in).getFileSystem().getScheme();
    return readOptions.getBoolean("locality", LOCALITY_WHITELIST_FS.contains(scheme));
  }
  return false;
}
Example #19
Source File: DataFiles.java From iceberg with Apache License 2.0
public static DataFile fromInputFile(InputFile file, PartitionData partition, long rowCount) {
  if (file instanceof HadoopInputFile) {
    return fromStat(((HadoopInputFile) file).getStat(), partition, rowCount);
  }

  String location = file.location();
  FileFormat format = FileFormat.fromFileName(location);
  return new GenericDataFile(
      location, format, partition, rowCount, file.getLength());
}
Example #20
Source File: ReadConf.java From iceberg with Apache License 2.0
private static ParquetFileReader newReader(InputFile file, ParquetReadOptions options) {
  try {
    return ParquetFileReader.open(ParquetIO.file(file), options);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to open Parquet file: %s", file.location());
  }
}
Example #21
Source File: ParquetReader.java From iceberg with Apache License 2.0
public ParquetReader(InputFile input, Schema expectedSchema, ParquetReadOptions options,
                     Function<MessageType, ParquetValueReader<?>> readerFunc, NameMapping nameMapping,
                     Expression filter, boolean reuseContainers, boolean caseSensitive) {
  this.input = input;
  this.expectedSchema = expectedSchema;
  this.options = options;
  this.readerFunc = readerFunc;
  // replace alwaysTrue with null to avoid extra work evaluating a trivial filter
  this.filter = filter == Expressions.alwaysTrue() ? null : filter;
  this.reuseContainers = reuseContainers;
  this.caseSensitive = caseSensitive;
  this.nameMapping = nameMapping;
}
Example #22
Source File: BaseRewriteManifests.java From iceberg with Apache License 2.0
private ManifestFile copyManifest(ManifestFile manifest) {
  TableMetadata current = ops.current();
  InputFile toCopy = ops.io().newInputFile(manifest.path());
  OutputFile newFile = newManifestOutput();
  return ManifestFiles.copyRewriteManifest(
      current.formatVersion(), toCopy, specsById, newFile, snapshotId(), summaryBuilder);
}
Example #23
Source File: RowDataReader.java From iceberg with Apache License 2.0
private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}
Example #24
Source File: DataFiles.java From iceberg with Apache License 2.0
public static DataFile fromInputFile(InputFile file, long rowCount) {
  if (file instanceof HadoopInputFile) {
    return fromStat(((HadoopInputFile) file).getStat(), rowCount);
  }

  String location = file.location();
  FileFormat format = FileFormat.fromFileName(location);
  return new GenericDataFile(location, format, rowCount, file.getLength());
}
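A usage sketch for the overload above, assuming an existing FileIO named io, a known record count, and a placeholder location. The file format is inferred from the file name by FileFormat.fromFileName, so the extension matters.

InputFile written = io.newInputFile("/warehouse/db/table/data/00000-0.parquet"); // hypothetical location
DataFile dataFile = DataFiles.fromInputFile(written, recordCount);               // recordCount assumed known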
Example #25
Source File: TestManifestListVersions.java From iceberg with Apache License 2.0
private InputFile writeManifestList(ManifestFile manifest, int formatVersion) throws IOException {
  OutputFile manifestList = Files.localOutput(temp.newFile());
  try (FileAppender<ManifestFile> writer = ManifestLists.write(
      formatVersion, manifestList, SNAPSHOT_ID, SNAPSHOT_ID - 1, formatVersion > 1 ? SEQ_NUM : 0)) {
    writer.add(manifest);
  }
  return manifestList.toInputFile();
}
Example #26
Source File: ManifestFiles.java From iceberg with Apache License 2.0
static ManifestFile copyRewriteManifest(int formatVersion, InputFile toCopy, Map<Integer, PartitionSpec> specsById,
                                        OutputFile outputFile, long snapshotId,
                                        SnapshotSummary.Builder summaryBuilder) {
  // for a rewritten manifest all snapshot ids should be set. use empty metadata to throw an exception if it is not
  InheritableMetadata inheritableMetadata = InheritableMetadataFactory.empty();
  try (ManifestReader<DataFile> reader =
           new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) {
    return copyManifestInternal(
        formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.EXISTING);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location());
  }
}
Example #27
Source File: AvroIterable.java From iceberg with Apache License 2.0
AvroIterable(InputFile file, DatumReader<D> reader,
             Long start, Long length, boolean reuseContainers) {
  this.file = file;
  this.reader = reader;
  this.start = start;
  this.end = start != null ? start + length : null;
  this.reuseContainers = reuseContainers;
}
Example #28
Source File: TableMetadataParser.java From iceberg with Apache License 2.0
public static TableMetadata read(FileIO io, InputFile file) {
  Codec codec = Codec.fromFileName(file.location());
  try (InputStream is = codec == Codec.GZIP ? new GZIPInputStream(file.newStream()) : file.newStream()) {
    return fromJson(io, file, JsonUtil.mapper().readValue(is, JsonNode.class));
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to read file: %s", file);
  }
}
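A usage sketch for reading table metadata, assuming an existing FileIO named io and a placeholder metadata path. Because the codec is derived from the file name, the same call handles both plain and gzip-compressed metadata files.

String metadataLocation = "/warehouse/db/table/metadata/v3.metadata.json"; // hypothetical path
TableMetadata metadata = TableMetadataParser.read(io, io.newInputFile(metadataLocation));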
Example #29
Source File: ManifestFiles.java From iceberg with Apache License 2.0
static ManifestFile copyAppendManifest(int formatVersion, InputFile toCopy, Map<Integer, PartitionSpec> specsById,
                                       OutputFile outputFile, long snapshotId,
                                       SnapshotSummary.Builder summaryBuilder) {
  // use metadata that will add the current snapshot's ID for the rewrite
  InheritableMetadata inheritableMetadata = InheritableMetadataFactory.forCopy(snapshotId);
  try (ManifestReader<DataFile> reader =
           new ManifestReader<>(toCopy, specsById, inheritableMetadata, FileType.DATA_FILES)) {
    return copyManifestInternal(
        formatVersion, reader, outputFile, snapshotId, summaryBuilder, ManifestEntry.Status.ADDED);
  } catch (IOException e) {
    throw new RuntimeIOException(e, "Failed to close manifest: %s", toCopy.location());
  }
}
Example #30
Source File: StaticDataTask.java From iceberg with Apache License 2.0
private StaticDataTask(InputFile metadata, StructLike[] rows) {
  this.metadataFile = DataFiles.builder()
      .withInputFile(metadata)
      .withRecordCount(rows.length)
      .withFormat(FileFormat.METADATA)
      .build();
  this.rows = rows;
}