org.apache.spark.sql.sources.v2.reader.DataSourceReader Java Examples
The following examples show how to use
org.apache.spark.sql.sources.v2.reader.DataSourceReader.
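Before the individual examples, it may help to see where DataSourceReader sits in the Spark 2.4 DataSourceV2 read path: a source class implements ReadSupport, its createReader(DataSourceOptions) returns a DataSourceReader, and Spark asks that reader for a schema and a list of InputPartitions, each of which creates an InputPartitionReader on an executor. The sketch below is a minimal illustration under those assumptions; the names SingleValueSource, IntRangePartition, and the com.example.sources package are made up for this page and do not come from any of the projects shown in the examples.

package com.example.sources;

import java.util.Collections;
import java.util.List;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.sources.v2.reader.InputPartition;
import org.apache.spark.sql.sources.v2.reader.InputPartitionReader;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Illustrative single-column source: createReader() hands Spark a DataSourceReader
// that plans one in-memory partition of integer rows.
public class SingleValueSource implements DataSourceV2, ReadSupport {

  @Override
  public DataSourceReader createReader(DataSourceOptions options) {
    // Options from spark.read().option(...) arrive here; this sketch ignores them.
    return new Reader();
  }

  static class Reader implements DataSourceReader {

    @Override
    public StructType readSchema() {
      // Single integer column named "id".
      return new StructType().add("id", DataTypes.IntegerType);
    }

    @Override
    public List<InputPartition<InternalRow>> planInputPartitions() {
      // One partition producing the values 0..4; real sources return one entry per split.
      return Collections.singletonList(new IntRangePartition(0, 5));
    }
  }

  // Serializable description of a split; Spark sends it to an executor, which
  // calls createPartitionReader() to obtain the row iterator.
  static class IntRangePartition implements InputPartition<InternalRow> {
    private final int start;
    private final int end;

    IntRangePartition(int start, int end) {
      this.start = start;
      this.end = end;
    }

    @Override
    public InputPartitionReader<InternalRow> createPartitionReader() {
      return new InputPartitionReader<InternalRow>() {
        private int current = start - 1;

        @Override
        public boolean next() {
          current += 1;
          return current < end;
        }

        @Override
        public InternalRow get() {
          return new GenericInternalRow(new Object[] {current});
        }

        @Override
        public void close() {
          // Nothing to release for an in-memory range.
        }
      };
    }
  }
}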
Example #1
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));

  List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Example #2
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
Example #3
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testInFilterForTimestamp() {
  File location = buildPartitionedTable("partitioned_by_hour", PARTITION_BY_HOUR, "ts_hour", "ts");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);
  pushFilters(reader, new In("ts", new Timestamp[]{
      new Timestamp(instant("2017-12-22T00:00:00.123+00:00") / 1000),
      new Timestamp(instant("2017-12-22T09:20:44.294+00:00") / 1000),
      new Timestamp(instant("2017-12-22T00:34:00.184+00:00") / 1000),
      new Timestamp(instant("2017-12-21T15:15:16.230+00:00") / 1000),
      null
  }));

  Assert.assertEquals("Should create 1 task for 2017-12-21: 15", 1,
      reader.planInputPartitions().size());
}
Example #4
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(StructType readSchema, DataSourceOptions options) {
  Configuration conf = new Configuration(lazyBaseConf());
  Table table = getTableAndResolveHadoopConfiguration(options, conf);
  String caseSensitive = lazySparkSession().conf().get("spark.sql.caseSensitive");

  Broadcast<FileIO> io = lazySparkContext().broadcast(SparkUtil.serializableFileIO(table));
  Broadcast<EncryptionManager> encryptionManager = lazySparkContext().broadcast(table.encryption());

  Reader reader = new Reader(table, io, encryptionManager, Boolean.parseBoolean(caseSensitive), options);

  if (readSchema != null) {
    // convert() will fail if readSchema contains fields not in table.schema()
    SparkSchemaUtil.convert(table.schema(), readSchema);
    reader.pruneColumns(readSchema);
  }

  return reader;
}
Example #5
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(unpartitioned.toString(), "id = " + i));
  }
}
Example #6
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedTimestampFilter() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, LessThan.apply("ts", "2017-12-22T00:00:00+00:00"));

  List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
  Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

  assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
      read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Example #7
Source File: BigQueryDataSourceV2.java From spark-bigquery-connector with Apache License 2.0

@Override
public DataSourceReader createReader(StructType schema, DataSourceOptions options) {
  SparkSession spark = getDefaultSparkSessionOrCreate();

  Injector injector = Guice.createInjector(
      new BigQueryClientModule(),
      new SparkBigQueryConnectorModule(spark, options, Optional.ofNullable(schema)));

  BigQueryDataSourceReader reader = injector.getInstance(BigQueryDataSourceReader.class);
  return reader;
}
Example #8
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, planTasks(unfiltered).size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, Expressions.equal("id", i));

    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(location.toString(), "id = " + i));
  }
}
Example #9
Source File: HiveWarehouseConnector.java From spark-llap with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  try {
    return getDataSourceReader(getOptions(options));
  } catch (IOException e) {
    LOG.error("Error creating {}", getClass().getName());
    LOG.error(ExceptionUtils.getStackTrace(e));
    throw new RuntimeException(e);
  }
}
Example #10
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testPartitionedByIdStartsWith() {
  File location = buildPartitionedTable("partitioned_by_id", PARTITION_BY_ID, "id_ident", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
Example #11
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testPartitionedByDataStartsWithFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, new StringStartsWith("data", "junc"));

  Assert.assertEquals(1, reader.planInputPartitions().size());
}
Example #12
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testInFilter() {
  File location = buildPartitionedTable("partitioned_by_data", PARTITION_BY_DATA, "data_ident", "data");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader reader = source.createReader(options);

  pushFilters(reader, new In("data", new String[]{"foo", "junction", "brush", null}));

  Assert.assertEquals(2, reader.planInputPartitions().size());
}
Example #13
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testBucketPartitionedIDFilters() {
  File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");

  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", location.toString())
  );

  IcebergSource source = new IcebergSource();
  DataSourceReader unfiltered = source.createReader(options);
  Assert.assertEquals("Unfiltered table should created 4 read tasks",
      4, unfiltered.planInputPartitions().size());

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();

    // validate predicate push-down
    Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i),
        read(location.toString(), "id = " + i));
  }
}
Example #14
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i),
          read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}
Example #15
Source File: PartitioningRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
  return new Reader(host, port, table, partitions);
}
Example #16
Source File: ParallelRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
  return new Reader(host, port, table, partitions);
}
Example #17
Source File: FlexibleRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  return new Reader(host, port, table);
}
Example #18
Source File: ParallelRowReadWriteDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening, as well as a table name, from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  String table = options.get("table").orElse("unknownTable"); // TODO: throw
  int partitions = Integer.parseInt(options.get("partitions").orElse("0"));
  return new Reader(host, port, table, partitions);
}
Example #19
Source File: SimpleRowDataSource.java From spark-data-sources with MIT License

/**
 * Spark calls this to create the reader. Notice how it pulls the host and port
 * on which ExampleDB is listening from the supplied options.
 * @param options
 * @return
 */
@Override
public DataSourceReader createReader(DataSourceOptions options) {
  String host = options.get("host").orElse("localhost");
  int port = options.getInt("port", -1);
  return new Reader(host, port);
}
Example #20
Source File: DefaultSource.java From flight-spark-source with Apache License 2.0

public DataSourceReader createReader(DataSourceOptions dataSourceOptions) {
  Location defaultLocation = Location.forGrpcInsecure(
      dataSourceOptions.get("host").orElse("localhost"),
      dataSourceOptions.getInt("port", 47470)
  );
  String sql = dataSourceOptions.get("path").orElse("");
  FlightDataSourceReader.FactoryOptions options = new FlightDataSourceReader.FactoryOptions(
      defaultLocation,
      sql,
      dataSourceOptions.get("username").orElse("anonymous"),
      dataSourceOptions.get("password").orElse(null),
      dataSourceOptions.getBoolean("parallel", false),
      null);
  Broadcast<FlightDataSourceReader.FactoryOptions> bOptions = lazySparkContext().broadcast(options);
  return new FlightDataSourceReader(bOptions);
}
Example #21
Source File: SimpleMockConnector.java From spark-llap with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return new SimpleMockDataSourceReader();
}
Example #22
Source File: HiveWarehouseConnector.java From spark-llap with Apache License 2.0

protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new HiveWarehouseDataSourceReader(params);
}
Example #23
Source File: MockHiveWarehouseConnector.java From spark-llap with Apache License 2.0

@Override
protected DataSourceReader getDataSourceReader(Map<String, String> params) throws IOException {
  return new MockHiveWarehouseDataSourceReader(params);
}
Example #24
Source File: DatasetSourceBatch.java From beam with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return new DatasetReader<>(options);
}
Example #25
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
  Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
  SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
  return unsafeReader.createUnsafeRowReaderFactories();
}
Example #26
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private void pushFilters(DataSourceReader reader, Expression... expressions) {
  Assert.assertTrue(reader instanceof SupportsPushDownCatalystFilters);
  SupportsPushDownCatalystFilters filterable = (SupportsPushDownCatalystFilters) reader;
  filterable.pushCatalystFilters(expressions);
}
Example #27
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  Table table = findTable(options);
  return new Reader(table, lazyConf());
}
Example #28
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

private void pushFilters(DataSourceReader reader, Filter... filters) {
  Assert.assertTrue(reader instanceof SupportsPushDownFilters);
  SupportsPushDownFilters filterable = (SupportsPushDownFilters) reader;
  filterable.pushFilters(filters);
}
Example #29
Source File: IcebergSource.java From iceberg with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return createReader(null, options);
}
Example #30
Source File: BigQueryDataSourceV2.java From spark-bigquery-connector with Apache License 2.0

@Override
public DataSourceReader createReader(DataSourceOptions options) {
  return createReader(null, options);
}
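For completeness, here is how a DataSourceV2 source like the ones above is typically invoked from the DataFrame reader. This is only a usage sketch reusing the hypothetical SingleValueSource from the top of this page; the fully qualified class name passed to format() is an assumption of that sketch, not part of any project listed here.

import org.apache.spark.sql.Dataset;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.SparkSession;

public class ReadWithDataSourceV2 {
  public static void main(String[] args) {
    SparkSession spark = SparkSession.builder()
        .master("local[*]")
        .appName("DataSourceReader usage sketch")
        .getOrCreate();

    // Spark resolves the format string to the DataSourceV2 class, calls
    // ReadSupport.createReader(DataSourceOptions), and plans the scan through
    // the returned DataSourceReader. Any option(...) values are delivered to
    // createReader inside DataSourceOptions (the sketch above ignores them).
    Dataset<Row> df = spark.read()
        .format("com.example.sources.SingleValueSource")
        .option("host", "localhost")
        .load();

    df.show();
    spark.stop();
  }
}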