org.apache.iceberg.FileScanTask Java Examples
The following examples show how to use org.apache.iceberg.FileScanTask.
The source file, project, and license are noted above each example.
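Before the examples, a minimal usage sketch may help: it plans a table scan and inspects the resulting FileScanTask objects, mirroring the calls used throughout the snippets below. The table location ("/tmp/iceberg-table") and the filter column ("id") are illustrative placeholders, not taken from any particular example.

import org.apache.hadoop.conf.Configuration;
import org.apache.iceberg.FileScanTask;
import org.apache.iceberg.Table;
import org.apache.iceberg.expressions.Expressions;
import org.apache.iceberg.hadoop.HadoopTables;
import org.apache.iceberg.io.CloseableIterable;

public class FileScanTaskSketch {
  public static void main(String[] args) throws Exception {
    // Load a table by location; the path is a placeholder.
    Table table = new HadoopTables(new Configuration()).load("/tmp/iceberg-table");

    // planFiles() yields one FileScanTask per matching data file (or file slice).
    try (CloseableIterable<FileScanTask> tasks = table.newScan()
        .filter(Expressions.greaterThan("id", 1))
        .planFiles()) {
      for (FileScanTask task : tasks) {
        // Each task exposes the data file, the byte range to read, and the residual filter.
        System.out.printf("file=%s start=%d length=%d records=%d residual=%s%n",
            task.file().path(), task.start(), task.length(),
            task.file().recordCount(), task.residual());
      }
    }
  }
}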
Example #1
Source File: RowDataReader.java From iceberg with Apache License 2.0 | 7 votes |
private CloseableIterable<InternalRow> newAvroIterable(
    InputFile location,
    FileScanTask task,
    Schema projection,
    Map<Integer, ?> idToConstant) {
  Avro.ReadBuilder builder = Avro.read(location)
      .reuseContainers()
      .project(projection)
      .split(task.start(), task.length())
      .createReaderFunc(readSchema -> new SparkAvroReader(projection, readSchema, idToConstant));

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
Example #2
Source File: Util.java From iceberg with Apache License 2.0 | 6 votes |
public static String[] blockLocations(CombinedScanTask task, Configuration conf) {
  Set<String> locationSets = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    Path path = new Path(f.file().path().toString());
    try {
      FileSystem fs = path.getFileSystem(conf);
      for (BlockLocation b : fs.getFileBlockLocations(path, f.start(), f.length())) {
        locationSets.addAll(Arrays.asList(b.getHosts()));
      }
    } catch (IOException ioe) {
      LOG.warn("Failed to get block locations for path {}", path, ioe);
    }
  }

  return locationSets.toArray(new String[0]);
}
Example #3
Source File: RewriteDataFilesAction.java From iceberg with Apache License 2.0 | 6 votes |
private Map<StructLikeWrapper, Collection<FileScanTask>> groupTasksByPartition(
    CloseableIterator<FileScanTask> tasksIter) {
  ListMultimap<StructLikeWrapper, FileScanTask> tasksGroupedByPartition = Multimaps.newListMultimap(
      Maps.newHashMap(), Lists::newArrayList);

  try {
    tasksIter.forEachRemaining(task -> {
      StructLikeWrapper structLike = StructLikeWrapper.wrap(task.file().partition());
      tasksGroupedByPartition.put(structLike, task);
    });
  } finally {
    try {
      tasksIter.close();
    } catch (IOException ioe) {
      LOG.warn("Failed to close task iterator", ioe);
    }
  }

  return tasksGroupedByPartition.asMap();
}
Example #4
Source File: RowDataReader.java From iceberg with Apache License 2.0 | 6 votes |
private CloseableIterable<InternalRow> newParquetIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  Parquet.ReadBuilder builder = Parquet.read(location)
      .split(task.start(), task.length())
      .project(readSchema)
      .createReaderFunc(fileSchema -> SparkParquetReaders.buildReader(readSchema, fileSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive);

  if (nameMapping != null) {
    builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
  }

  return builder.build();
}
Example #5
Source File: IcebergTableWrapper.java From dremio-oss with Apache License 2.0 | 6 votes |
private void buildPartitionsAndSplits() throws IOException {
  PartitionConverter partitionConverter = new PartitionConverter(schema);
  SplitConverter splitConverter = new SplitConverter(context, fs, schema, datasetColumnValueCounts);

  // map of distinct partition values.
  // iterate over all data files to get the partition values and add them to the map.
  // TODO ravindra: this iteration requires reading all of the manifest files. This should go via
  // the dremio wrappers.
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    List<PartitionValue> partition = partitionConverter.from(task);
    DatasetSplit split = splitConverter.from(task);
    partitionChunkListing.put(partition, split);
    recordCount += task.file().recordCount();
  }
}
Example #6
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testCanReadOldCompressedManifestFiles() throws Exception {
  assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());

  // do a file append
  table.newAppend()
      .appendFile(FILE_A)
      .commit();

  // since we don't generate old file extensions anymore, let's convert existing metadata to old .metadata.json.gz
  // to test backwards compatibility
  rewriteMetadataAsGzipWithOldExtension();

  List<File> metadataFiles = listMetadataJsonFiles();

  assertEquals("Should have two versions", 2, metadataFiles.size());
  assertTrue("Metadata should be compressed with old format.",
      metadataFiles.stream().allMatch(f -> f.getName().endsWith(".metadata.json.gz")));

  Table reloaded = TABLES.load(tableLocation);

  List<FileScanTask> tasks = Lists.newArrayList(reloaded.newScan().planFiles());
  Assert.assertEquals("Should scan 1 files", 1, tasks.size());
}
Example #7
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testMergeAppend() throws Exception {
  testFastAppend(); // create 2 compatible manifest files that will be merged

  // merge all manifests for this test
  table.updateProperties().set("commit.manifest.min-count-to-merge", "1").commit();

  // third append
  table.newAppend()
      .appendFile(FILE_C)
      .commit();

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should scan 3 files", 3, tasks.size());

  Assert.assertEquals("Should contain 3 Avro manifest files",
      3, listManifestFiles().size());

  TableMetadata metadata = readMetadataVersion(5);
  Assert.assertEquals("Current snapshot should contain 1 merged manifest",
      1, metadata.currentSnapshot().allManifests().size());
}
Example #8
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0 | 6 votes |
@Test
public void testSchemaUpdate() throws Exception {
  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());

  table.updateSchema()
      .addColumn("n", Types.IntegerType.get())
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  Assert.assertEquals("Table schema should match schema with reassigned ids",
      UPDATED_SCHEMA.asStruct(), table.schema().asStruct());

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should not create any scan tasks", 0, tasks.size());

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
Example #9
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0 | 6 votes |
private CloseableIterable<T> open(FileScanTask currentTask, Schema readSchema) {
  DataFile file = currentTask.file();
  // TODO we should make use of FileIO to create inputFile
  InputFile inputFile = HadoopInputFile.fromLocation(file.path(), context.getConfiguration());
  CloseableIterable<T> iterable;
  switch (file.format()) {
    case AVRO:
      iterable = newAvroIterable(inputFile, currentTask, readSchema);
      break;
    case ORC:
      iterable = newOrcIterable(inputFile, currentTask, readSchema);
      break;
    case PARQUET:
      iterable = newParquetIterable(inputFile, currentTask, readSchema);
      break;
    default:
      throw new UnsupportedOperationException(
          String.format("Cannot read %s file: %s", file.format().name(), file.path()));
  }

  return iterable;
}
Example #10
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0 | 6 votes |
private CloseableIterable<T> newAvroIterable(
    InputFile inputFile, FileScanTask task, Schema readSchema) {
  Avro.ReadBuilder avroReadBuilder = Avro.read(inputFile)
      .project(readSchema)
      .split(task.start(), task.length());
  if (reuseContainers) {
    avroReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Avro support not yet supported for Pig and Hive");
    case GENERIC:
      avroReadBuilder.createReaderFunc(
          (expIcebergSchema, expAvroSchema) ->
              DataReader.create(expIcebergSchema, expAvroSchema,
                  constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(avroReadBuilder.build(), task.residual(), readSchema);
}
Example #11
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0 | 6 votes |
private CloseableIterable<T> newParquetIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  Parquet.ReadBuilder parquetReadBuilder = Parquet.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  if (reuseContainers) {
    parquetReadBuilder.reuseContainers();
  }

  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO implement value readers for Pig and Hive
      throw new UnsupportedOperationException("Parquet support not yet supported for Pig and Hive");
    case GENERIC:
      parquetReadBuilder.createReaderFunc(
          fileSchema -> GenericParquetReaders.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }
  return applyResidualFiltering(parquetReadBuilder.build(), task.residual(), readSchema);
}
Example #12
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0 | 6 votes |
private CloseableIterable<T> newOrcIterable(InputFile inputFile, FileScanTask task, Schema readSchema) {
  ORC.ReadBuilder orcReadBuilder = ORC.read(inputFile)
      .project(readSchema)
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .split(task.start(), task.length());
  // ORC does not support reuse containers yet
  switch (inMemoryDataModel) {
    case PIG:
    case HIVE:
      //TODO: implement value readers for Pig and Hive
      throw new UnsupportedOperationException("ORC support not yet supported for Pig and Hive");
    case GENERIC:
      orcReadBuilder.createReaderFunc(
          fileSchema -> GenericOrcReader.buildReader(
              readSchema, fileSchema, constantsMap(task, IdentityPartitionConverters::convertConstant)));
  }

  return applyResidualFiltering(orcReadBuilder.build(), task.residual(), readSchema);
}
Example #13
Source File: RowDataReader.java From iceberg with Apache License 2.0 | 6 votes |
@Override
CloseableIterator<InternalRow> open(FileScanTask task) {
  DataFile file = task.file();

  // update the current file for Spark's filename() function
  InputFileBlockHolder.set(file.path().toString(), task.start(), task.length());

  // schema or rows returned by readers
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();

  if (projectsIdentityPartitionColumns) {
    return open(task, expectedSchema, PartitionUtil.constantsMap(task, RowDataReader::convertConstant))
        .iterator();
  }
  // return the base iterator
  return open(task, expectedSchema, ImmutableMap.of()).iterator();
}
Example #14
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testNoMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "none");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertTrue(file.nullValueCounts().isEmpty());
    Assert.assertTrue(file.valueCounts().isEmpty());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example #15
Source File: SparkBatchScan.java From iceberg with Apache License 2.0 | 5 votes |
@Override
public Statistics estimateStatistics() {
  long sizeInBytes = 0L;
  long numRows = 0L;

  for (CombinedScanTask task : tasks()) {
    for (FileScanTask file : task.files()) {
      sizeInBytes += file.length();
      numRows += file.file().recordCount();
    }
  }

  return new Stats(sizeInBytes, numRows);
}
Example #16
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testCustomMetricCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  properties.put("write.metadata.metrics.column.id", "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  Schema schema = table.schema();
  Types.NestedField id = schema.findField("id");
  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(1, file.lowerBounds().size());
    Assert.assertTrue(file.lowerBounds().containsKey(id.fieldId()));
    Assert.assertEquals(1, file.upperBounds().size());
    Assert.assertTrue(file.upperBounds().containsKey(id.fieldId()));
  }
}
Example #17
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testFullMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "full");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertEquals(2, file.lowerBounds().size());
    Assert.assertEquals(2, file.upperBounds().size());
  }
}
Example #18
Source File: TestWriteMetricsConfig.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testCountMetricsCollectionForParquet() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> properties = Maps.newHashMap();
  properties.put(TableProperties.DEFAULT_WRITE_METRICS_MODE, "counts");
  Table table = tables.create(SIMPLE_SCHEMA, spec, properties, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data")
      .coalesce(1)
      .write()
      .format("iceberg")
      .option("write-format", "parquet")
      .mode("append")
      .save(tableLocation);

  for (FileScanTask task : table.newScan().includeColumnStats().planFiles()) {
    DataFile file = task.file();
    Assert.assertEquals(2, file.nullValueCounts().size());
    Assert.assertEquals(2, file.valueCounts().size());
    Assert.assertTrue(file.lowerBounds().isEmpty());
    Assert.assertTrue(file.upperBounds().isEmpty());
  }
}
Example #19
Source File: TestDataSourceOptions.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testNoWriteFormatOption() throws IOException {
  String tableLocation = temp.newFolder("iceberg-table").toString();

  HadoopTables tables = new HadoopTables(CONF);
  PartitionSpec spec = PartitionSpec.unpartitioned();
  Map<String, String> options = Maps.newHashMap();
  options.put(TableProperties.DEFAULT_FILE_FORMAT, "avro");
  Table table = tables.create(SCHEMA, spec, options, tableLocation);

  List<SimpleRecord> expectedRecords = Lists.newArrayList(
      new SimpleRecord(1, "a"),
      new SimpleRecord(2, "b"),
      new SimpleRecord(3, "c")
  );
  Dataset<Row> df = spark.createDataFrame(expectedRecords, SimpleRecord.class);
  df.select("id", "data").write()
      .format("iceberg")
      .mode("append")
      .save(tableLocation);

  try (CloseableIterable<FileScanTask> tasks = table.newScan().planFiles()) {
    tasks.forEach(task -> {
      FileFormat fileFormat = FileFormat.fromFileName(task.file().path());
      Assert.assertEquals(FileFormat.AVRO, fileFormat);
    });
  }
}
Example #20
Source File: Util.java From iceberg with Apache License 2.0 | 5 votes |
public static String[] blockLocations(FileIO io, CombinedScanTask task) {
  Set<String> locations = Sets.newHashSet();
  for (FileScanTask f : task.files()) {
    InputFile in = io.newInputFile(f.file().path().toString());
    if (in instanceof HadoopInputFile) {
      Collections.addAll(locations,
          ((HadoopInputFile) in).getBlockLocations(f.start(), f.length()));
    }
  }

  return locations.toArray(HadoopInputFile.NO_LOCATION_PREFERENCE);
}
Example #21
Source File: TableScanUtil.java From iceberg with Apache License 2.0 | 5 votes |
public static CloseableIterable<FileScanTask> splitFiles(CloseableIterable<FileScanTask> tasks, long splitSize) {
  Iterable<FileScanTask> splitTasks = FluentIterable
      .from(tasks)
      .transformAndConcat(input -> input.split(splitSize));
  // Capture manifests which can be closed after scan planning
  return CloseableIterable.combine(splitTasks, tasks);
}
Example #22
Source File: TableScanUtil.java From iceberg with Apache License 2.0 | 5 votes |
public static CloseableIterable<CombinedScanTask> planTasks(CloseableIterable<FileScanTask> splitFiles,
                                                            long splitSize, int lookback, long openFileCost) {
  Function<FileScanTask, Long> weightFunc = file -> Math.max(file.length(), openFileCost);

  return CloseableIterable.transform(
      CloseableIterable.combine(
          new BinPacking.PackingIterable<>(splitFiles, splitSize, lookback, weightFunc, true),
          splitFiles),
      BaseCombinedScanTask::new);
}
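The two TableScanUtil methods above are typically chained: splitFiles slices each FileScanTask at the target split size, and planTasks bin-packs the slices into CombinedScanTask groups weighted by file length. A minimal sketch of that composition follows; it assumes an already-loaded Iceberg Table named table, and the split size, lookback, and open-file cost values are illustrative placeholders.

// Placeholder planning parameters.
long splitSize = 128L * 1024 * 1024;   // target split size in bytes
int lookback = 10;                     // bin-packing lookback
long openFileCost = 4L * 1024 * 1024;  // minimum weight charged per file

// Plan file tasks, split them, then combine the splits into scan task groups.
CloseableIterable<FileScanTask> fileTasks = table.newScan().planFiles();
CloseableIterable<FileScanTask> splitTasks = TableScanUtil.splitFiles(fileTasks, splitSize);

try (CloseableIterable<CombinedScanTask> combinedTasks =
         TableScanUtil.planTasks(splitTasks, splitSize, lookback, openFileCost)) {
  for (CombinedScanTask combined : combinedTasks) {
    System.out.println("Combined task with " + combined.files().size() + " file scan tasks");
  }
}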
Example #23
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testCreateTable() throws Exception {
  PartitionSpec expectedSpec = PartitionSpec.builderFor(TABLE_SCHEMA)
      .bucket("data", 16)
      .build();

  Assert.assertEquals("Table schema should match schema with reassigned ids",
      TABLE_SCHEMA.asStruct(), table.schema().asStruct());
  Assert.assertEquals("Table partition spec should match with reassigned ids",
      expectedSpec, table.spec());

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should not create any scan tasks", 0, tasks.size());

  Assert.assertTrue("Table location should exist",
      tableDir.exists());
  Assert.assertTrue("Should create metadata folder",
      metadataDir.exists() && metadataDir.isDirectory());
  Assert.assertTrue("Should create v1 metadata",
      version(1).exists() && version(1).isFile());
  Assert.assertFalse("Should not create v2 or newer versions",
      version(2).exists());
  Assert.assertTrue("Should create version hint file",
      versionHintFile.exists());
  Assert.assertEquals("Should write the current version to the hint file",
      1, readVersionHint());

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain 0 Avro manifest files", 0, manifests.size());
}
Example #24
Source File: TestHadoopCommits.java From iceberg with Apache License 2.0 | 5 votes |
@Test
public void testFastAppend() throws Exception {
  // first append
  table.newFastAppend()
      .appendFile(FILE_A)
      .commit();

  Assert.assertTrue("Should create v2 for the update",
      version(2).exists() && version(2).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      2, readVersionHint());

  List<FileScanTask> tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should scan 1 file", 1, tasks.size());

  List<File> manifests = listManifestFiles();
  Assert.assertEquals("Should contain only one Avro manifest file", 1, manifests.size());

  // second append
  table.newFastAppend()
      .appendFile(FILE_B)
      .commit();

  Assert.assertTrue("Should create v3 for the update",
      version(3).exists() && version(3).isFile());
  Assert.assertEquals("Should write the current version to the hint file",
      3, readVersionHint());

  tasks = Lists.newArrayList(table.newScan().planFiles());
  Assert.assertEquals("Should scan 2 files", 2, tasks.size());

  Assert.assertEquals("Should contain 2 Avro manifest files",
      2, listManifestFiles().size());

  TableMetadata metadata = readMetadataVersion(3);
  Assert.assertEquals("Current snapshot should contain 2 manifests",
      2, metadata.currentSnapshot().allManifests().size());
}
Example #25
Source File: TestInsertIntoTable.java From dremio-oss with Apache License 2.0 | 5 votes |
private void checkSinglePartitionValue(File tableFolder, Class expectedClass, Object expectedValue) {
  Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
  for (FileScanTask fileScanTask : table.newScan().planFiles()) {
    StructLike structLike = fileScanTask.file().partition();
    Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
  }
}
Example #26
Source File: TestIcebergCTASWithPartition.java From dremio-oss with Apache License 2.0 | 5 votes |
private void verifyPartitionValue(String tableFolder, Class expectedClass, Object expectedValue) {
  Table table = new HadoopTables(new Configuration()).load(tableFolder);
  for (FileScanTask fileScanTask : table.newScan().planFiles()) {
    StructLike structLike = fileScanTask.file().partition();
    Assert.assertEquals(structLike.get(0, expectedClass), expectedValue);
  }
}
Example #27
Source File: TestIcebergPartitionData.java From dremio-oss with Apache License 2.0 | 5 votes |
private void verifyPartitionValue(PartitionSpec partitionSpec, IcebergPartitionData partitionData,
                                  String columnName, Class expectedClass, Object expectedValue) throws Exception {
  File tableFolder = new File(folder.getRoot(), "icebergPartitionTest");
  try {
    tableFolder.mkdir();
    File dataFile = new File(folder.getRoot(), "a.parquet");
    dataFile.createNewFile();
    DataFile d1 = DataFiles.builder(partitionSpec)
        .withInputFile(Files.localInput(dataFile))
        .withRecordCount(50)
        .withFormat(FileFormat.PARQUET)
        .withPartition(partitionData)
        .build();

    IcebergOpCommitter committer = IcebergOperation.getCreateTableCommitter(
        Path.of(tableFolder.toPath().toString()),
        (new SchemaConverter()).fromIceberg(schema),
        Lists.newArrayList(columnName),
        new Configuration());
    committer.consumeData(Lists.newArrayList(d1));
    committer.commit();

    Table table = new HadoopTables(new Configuration()).load(tableFolder.getPath());
    for (FileScanTask fileScanTask : table.newScan().planFiles()) {
      StructLike structLike = fileScanTask.file().partition();
      if (expectedClass == ByteBuffer.class) {
        Assert.assertEquals(structLike.get(0, expectedClass).hashCode(),
            ByteBuffer.wrap((byte[]) expectedValue).hashCode());
      } else {
        Assert.assertTrue(structLike.get(0, expectedClass).equals(expectedValue));
      }
    }
  } finally {
    tableFolder.delete();
  }
}
Example #28
Source File: IcebergInputFormat.java From iceberg with Apache License 2.0 | 5 votes |
private Map<Integer, ?> constantsMap(FileScanTask task, BiFunction<Type, Object, Object> converter) {
  PartitionSpec spec = task.spec();
  Set<Integer> idColumns = spec.identitySourceIds();
  Schema partitionSchema = TypeUtil.select(expectedSchema, idColumns);
  boolean projectsIdentityPartitionColumns = !partitionSchema.columns().isEmpty();
  if (projectsIdentityPartitionColumns) {
    return PartitionUtil.constantsMap(task, converter);
  } else {
    return Collections.emptyMap();
  }
}
Example #29
Source File: IcebergSplitSource.java From presto with Apache License 2.0 | 5 votes |
private ConnectorSplit toIcebergSplit(FileScanTask task) {
  // TODO: We should leverage residual expression and convert that to TupleDomain.
  // The predicate here is used by readers for predicate push down at reader level,
  // so when we do not use residual expression, we are just wasting CPU cycles
  // on reader side evaluating a condition that we know will always be true.

  return new IcebergSplit(
      task.file().path().toString(),
      task.start(),
      task.length(),
      task.file().format(),
      ImmutableList.of(),
      getPartitionKeys(task));
}
Example #30
Source File: IcebergSplitSource.java From presto with Apache License 2.0 | 5 votes |
private static Map<Integer, String> getPartitionKeys(FileScanTask scanTask) {
  StructLike partition = scanTask.file().partition();
  PartitionSpec spec = scanTask.spec();
  Map<PartitionField, Integer> fieldToIndex = getIdentityPartitions(spec);
  Map<Integer, String> partitionKeys = new HashMap<>();

  fieldToIndex.forEach((field, index) -> {
    int id = field.sourceId();
    Type type = spec.schema().findType(id);
    Class<?> javaClass = type.typeId().javaClass();
    Object value = partition.get(index, javaClass);

    if (value == null) {
      partitionKeys.put(id, null);
    } else {
      String partitionValue;
      if (type.typeId() == FIXED || type.typeId() == BINARY) {
        // this is safe because Iceberg PartitionData directly wraps the byte array
        partitionValue = new String(((ByteBuffer) value).array(), UTF_8);
      } else {
        partitionValue = value.toString();
      }
      partitionKeys.put(id, partitionValue);
    }
  });

  return Collections.unmodifiableMap(partitionKeys);
}