org.apache.spark.sql.sources.v2.reader.DataSourceReader Java Examples
The following examples show how to use
org.apache.spark.sql.sources.v2.reader.DataSourceReader.
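Before the individual examples, it may help to see where DataSourceReader sits in the Spark 2.4 DataSourceV2 read path: a source class implements ReadSupport, its createReader(DataSourceOptions) returns a DataSourceReader, and Spark asks that reader for a schema and a list of InputPartitions, each of which creates an InputPartitionReader on an executor. The sketch below is a minimal illustration under those assumptions; the names SingleValueSource, IntRangePartition, and the com.example.sources package are made up for this page and do not come from any of the projects shown in the examples.

package com.example.sources;

import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Illustrative single-column source: createReader() hands Spark a DataSourceReader
// that plans one in-memory partition of integer rows.
public class SingleValueSource implements DataSourceV2, ReadSupport {

  @Override
  public DataSourceReader createReader(DataSourceOptions options) {
    // Options from spark.read().option(...) arrive here; this sketch ignores them.
    return new Reader();
  }

  static class Reader implements DataSourceReader {

    @Override
    public StructType readSchema() {
      // Single integer column named "id".
      return new StructType().add("id", DataTypes.IntegerType);
    }

    @Override
    public List<InputPartition<InternalRow>> planInputPartitions() {
      // One partition producing the values 0..4; real sources return one entry per split.
      return Collections.singletonList(new IntRangePartition(0, 5));
    }
  }

  // Serializable description of a split; Spark sends it to an executor, which
  // calls createPartitionReader() to obtain the row iterator.
  static class IntRangePartition implements InputPartition<InternalRow> {
    private final int start;
    private final int end;

    IntRangePartition(int start, int end) {
      this.start = start;
      this.end = end;
    }

    @Override
    public InputPartitionReader<InternalRow> createPartitionReader() {
      return new InputPartitionReader<InternalRow>() {
        private int current = start - 1;

        @Override
        public boolean next() {
          current += 1;
          return current < end;
        }

        @Override
        public InternalRow get() {
          return new GenericInternalRow(new Object[] {current});
        }

        @Override
        public void close() {
          // Nothing to release for an in-memory range.
        }
      };
    }
  }
}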
Example #1
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Example #2
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
Example #3
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testInFilterForTimestamp() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("ts", new Timestamp[]{
      new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000),
      new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000),
      new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000),
      new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000),
      null
  }));

  Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1,
      reader.planInputPartitions().size());
}
Example #4
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);

  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
Example #5
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
Example #6
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));

  List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Example #7
Source File: BigQueryDataSourceV2.java From spark-bigquery-connector with Apache License 2.0

@Override
public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
  SparkSession spark = getDefaultSparkSessionOrCreate();

  Injector injector = Guice.createInjector(
      new BigQueryClientModule(),
      new SparkBigQueryConnectorModule(spark, options, Optional.ofNullable(schema)));

  BigQueryDataSourceReader reader = injector.getInstance(BigQueryDataSourceReader.class);
  return reader;
}
Example #8
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, planTasks(unfiltered).size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(location.toString(), "id = " + i));
  }
}
Example #9
Source File: HiveWarehouseConnector.java From spark-llap with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  try {
    return getDataSourceReader(getOptions(options));
  } catch (IOException e) {
    LOG.error("Error creating {}", getClass().getName());
    LOG.error(ExceptionUtils.getStackTrace(e));
    throw new RuntimeException(e);
  }
}
Example #10
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testPartitionedByIdStartsWith() {
  File location = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
Example #11
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testPartitionedByDataStartsWithFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
Example #12
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testInFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, new In("data", new String[]{"foo", "junction", "brush", null}));

  Assert.assertEquals(2, reader.planInputPartitions().size());
}
Example #13
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, unfiltered.planInputPartitions().size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(location.toString(), "id = " + i));
  }
}
Example #14
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
Example #15
Source File: PartitioningRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
  return new Reader(host, port, table, partitions);
}
Example #16
Source File: ParallelRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
  return new Reader(host, port, table, partitions);
}
Example #17
Source File: FlexibleRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  return new Reader(host, port, table);
}
Example #18
Source File: ParallelRowReadWriteDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
  return new Reader(host, port, table, partitions);
}
Example #19
Source File: SimpleRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  return new Reader(host, port);
}
Example #20
Source File: DefaultSource.java From flight-spark-source with Apache License 2.0

public DataSourceReader createReader(DataSourceOptions dataSourceOptions) {
  Location defaultLocation = Location.forGrpcInsecure(
      dataSourceOptions.get("host").orElse("localhost"),
      dataSourceOptions.getInt("port", 47470)
  );
  String sql = dataSourceOptions.get("path").orElse("");
  FlightDataSourceReader.FactoryOptions options = new FlightDataSourceReader.FactoryOptions(
      defaultLocation,
      sql,
      dataSourceOptions.get("username").orElse("anonymous"),
      dataSourceOptions.get("password").orElse(null),
      dataSourceOptions.getBoolean("parallel", false),
      null);
  Broadcast<FlightDataSourceReader.FactoryOptions> bOptions = lazySparkContext().broadcast(options);
  return new FlightDataSourceReader(bOptions);
}
Example #21
Source File: SimpleMockConnector.java From spark-llap with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return new SimpleMockDataSourceReader();
}
Example #22
Source File: HiveWarehouseConnector.java From spark-llap with Apache License 2.0

protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new HiveWarehouseDataSourceReader(params);
}
Example #23
Source File: MockHiveWarehouseConnector.java From spark-llap with Apache License 2.0

@Override
protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new MockHiveWarehouseDataSourceReader(params);
}
Example #24
Source File: DatasetSourceBatch.java From beam with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return new DatasetReader<>(options);
}
Example #25
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
  Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
  SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
  return unsafeReader.createUnsafeRowReaderFactories();
}
Example #26
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private void pushFilters(DataSourceReader reader, Expression... expressions) {
  Assert.assertTrue(reader instanceof SupportsPushDownCatalystFilters);
  SupportsPushDownCatalystFilters filterable = (SupportsPushDownCatalystFilters) reader;
  filterable.pushCatalystFilters(expressions);
}
Example #27
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  Table table = findTable(options);
  return new Reader(table, lazyConf());
}
Example #28
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private void pushFilters(DataSourceReader reader, Filter... filters) {
  Assert.assertTrue(reader instanceof SupportsPushDownFilters);
  SupportsPushDownFilters filterable = (SupportsPushDownFilters) reader;
  filterable.pushFilters(filters);
}
Example #29
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return createReader(null, options);
}
Example #30
Source File: BigQueryDataSourceV2.java From spark-bigquery-connector with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return createReader(null, options);
}
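For completeness, here is how a DataSourceV2 source like the ones above is typically invoked from the DataFrame reader. This is only a usage sketch reusing the hypothetical SingleValueSource from the top of this page; the fully qualified class name passed to format() is an assumption of that sketch, not part of any project listed here.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadWithDataSourceV2 {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("DataSourceReader usage sketch")
        .getOrCreate();

    // Spark resolves the format string to the DataSourceV2 class, calls
    // ReadSupport.createReader(DataSourceOptions), and plans the scan through
    // the returned DataSourceReader. Any option(...) values are delivered to
    // createReader inside DataSourceOptions (the sketch above ignores them).
    Dataset<Row> df = spark.read()
        .format("com.example.sources.SingleValueSource")
        .option("host", "localhost")
        .load();

    df.show();
    spark.stop();
  }
}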