org.apache.spark.sql.sources.v2.reader.InputPartition Java Examples
The following examples show how to use
org.apache.spark.sql.sources.v2.reader.InputPartition.
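For context before the project examples, here is a minimal sketch of the contract these classes implement: an InputPartition is a Serializable handle produced on the driver during planning, and its createPartitionReader() is called on an executor to build the reader that actually emits rows. The sketch assumes the Spark 2.x DataSourceV2 interfaces (InputPartition, InputPartitionReader); the class and field names (RangeInputPartition, start, end) are hypothetical and only illustrate the interface shape.
import java.io.IOException;
import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;

// Hypothetical partition serving a half-open range of long values [start, end).
// The partition itself is serialized to executors, so its fields must be serializable.
class RangeInputPartition implements InputPartition<InternalRow> {
  private final long start;
  private final long end;

  RangeInputPartition(long start, long end) {
    this.start = start;
    this.end = end;
  }

  @Override
  public InputPartitionReader<InternalRow> createPartitionReader() {
    // Called on an executor after the partition has been deserialized there.
    return new InputPartitionReader<InternalRow>() {
      private long current = start - 1;

      @Override
      public boolean next() {
        current += 1;
        return current < end;
      }

      @Override
      public InternalRow get() {
        // Single bigint column holding the current value.
        return new GenericInternalRow(new Object[] {current});
      }

      @Override
      public void close() throws IOException {
        // Nothing to release in this sketch.
      }
    };
  }
}
A DataSourceReader's planInputPartitions() returns a list of such partitions, one per split, which is what the Iceberg and Flight examples below do with their ReadTask and FlightDataReaderFactory classes.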
Example #1
Source File: Reader.java From iceberg with Apache License 2.0
/**
 * This is called in the Spark Driver when data is to be materialized into {@link ColumnarBatch}
 */
@Override
public List<InputPartition<ColumnarBatch>> planBatchInputPartitions() {
  Preconditions.checkState(enableBatchRead(), "Batched reads not enabled");
  Preconditions.checkState(batchSize > 0, "Invalid batch size");

  String tableSchemaString = SchemaParser.toJson(table.schema());
  String expectedSchemaString = SchemaParser.toJson(lazySchema());
  String nameMappingString = table.properties().get(DEFAULT_NAME_MAPPING);

  List<InputPartition<ColumnarBatch>> readTasks = Lists.newArrayList();
  for (CombinedScanTask task : tasks()) {
    readTasks.add(new ReadTask<>(
        task, tableSchemaString, expectedSchemaString, nameMappingString, io, encryptionManager,
        caseSensitive, localityPreferred, new BatchReaderFactory(batchSize)));
  }
  LOG.info("Batching input partitions with {} tasks.", readTasks.size());

  return readTasks;
}
Example #2
Source File: Reader.java From iceberg with Apache License 2.0
/**
 * This is called in the Spark Driver when data is to be materialized into {@link InternalRow}
 */
@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
  String tableSchemaString = SchemaParser.toJson(table.schema());
  String expectedSchemaString = SchemaParser.toJson(lazySchema());
  String nameMappingString = table.properties().get(DEFAULT_NAME_MAPPING);

  List<InputPartition<InternalRow>> readTasks = Lists.newArrayList();
  for (CombinedScanTask task : tasks()) {
    readTasks.add(new ReadTask<>(
        task, tableSchemaString, expectedSchemaString, nameMappingString, io, encryptionManager,
        caseSensitive, localityPreferred, InternalRowReaderFactory.INSTANCE));
  }

  return readTasks;
}
Example #3
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
Example #4
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));

  List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Example #5
Source File: FlightDataSourceReader.java From flight-spark-source with Apache License 2.0
private List<InputPartition<ColumnarBatch>> planBatchInputPartitionsParallel() {
  try (FlightClient client = clientFactory.apply()) {
    FlightInfo info = client.getInfo(FlightDescriptor.command(sql.getBytes()));
    return planBatchInputPartitionsSerial(info);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
}
Example #6
Source File: FlightDataSourceReader.java From flight-spark-source with Apache License 2.0
private List<InputPartition<ColumnarBatch>> planBatchInputPartitionsSerial(FlightInfo info) {
  LOGGER.warn("planning partitions for endpoints {}",
      Joiner.on(", ").join(info.getEndpoints().stream()
          .map(e -> e.getLocations().get(0).getUri().toString())
          .collect(Collectors.toList())));
  List<InputPartition<ColumnarBatch>> batches = info.getEndpoints().stream().map(endpoint -> {
    Location location = (endpoint.getLocations().isEmpty()) ?
        Location.forGrpcInsecure(defaultLocation.getUri().getHost(), defaultLocation.getUri().getPort()) :
        endpoint.getLocations().get(0);
    FactoryOptions options = dataSourceOptions.value().copy(location, endpoint.getTicket().getBytes());
    LOGGER.warn("X1 {}", dataSourceOptions.value());
    return new FlightDataReaderFactory(lazySparkContext().broadcast(options));
  }).collect(Collectors.toList());
  LOGGER.info("Created {} batches from arrow endpoints", batches.size());
  return batches;
}
Example #7
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
Example #8
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();

  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, unfiltered.planInputPartitions().size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(location.toString(), "id = " + i));
  }
}
Example #9
Source File: FlightDataSourceReader.java From flight-spark-source with Apache License 2.0
@Override
public List<InputPartition<ColumnarBatch>> planBatchInputPartitions() {
  return planBatchInputPartitionsParallel();
}