org.apache.spark.sql.sources.v2.reader.DataReaderFactory Java Examples
The following examples show how to use
org.apache.spark.sql.sources.v2.reader.DataReaderFactory.
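DataReaderFactory belongs to the DataSourceV2 read API as it existed in Spark 2.3: a DataSourceReader plans the scan on the driver and returns one DataReaderFactory per partition, each factory is serialized to an executor, and its createDataReader() method builds the DataReader that actually produces the values for that partition. (In Spark 2.4 the interface was renamed InputPartition and DataReader became InputPartitionReader; Spark 3.x replaced the API altogether.) As a minimal, self-contained sketch of that contract — the RangeReaderFactory class and its integer-range behavior are invented for illustration and do not come from any of the projects below:

import java.io.IOException;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;

// Illustrative sketch only: one factory per partition. The factory is built on
// the driver, shipped to an executor, and createDataReader() constructs the
// reader that emits this partition's rows.
public class RangeReaderFactory implements DataReaderFactory<Row> {
    private final int start;   // inclusive
    private final int end;     // exclusive

    public RangeReaderFactory(int start, int end) {
        this.start = start;
        this.end = end;
    }

    @Override
    public DataReader<Row> createDataReader() {
        return new DataReader<Row>() {
            private int current = start - 1;

            @Override
            public boolean next() throws IOException {
                current++;              // advance the cursor
                return current < end;   // false ends this partition
            }

            @Override
            public Row get() {
                return RowFactory.create(current);  // one-column row
            }

            @Override
            public void close() throws IOException {
                // release any connections or buffers held by this reader
            }
        };
    }
}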
Example #1
Source File: ParallelRowReadWriteDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    List<Split> splits = null;
    DBClientWrapper db = new DBClientWrapper(_host, _port);
    db.connect();
    try {
        if (_partitions == 0)
            splits = db.getSplits(_table);
        else
            splits = db.getSplits(_table, _partitions);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    } finally {
        db.disconnect();
    }
    // one DataReaderFactory per database split
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : splits) {
        DataReaderFactory<Row> factory =
            new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    log.info("created " + factories.size() + " factories");
    return factories;
}
Example #2
Source File: ParallelRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    List<Split> splits = null;
    DBClientWrapper db = new DBClientWrapper(_host, _port);
    db.connect();
    try {
        if (_partitions == 0)
            splits = db.getSplits(_table);
        else
            splits = db.getSplits(_table, _partitions);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    } finally {
        db.disconnect();
    }
    // one DataReaderFactory per database split
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : splits) {
        DataReaderFactory<Row> factory =
            new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    log.info("created " + factories.size() + " factories");
    return factories;
}
Example #3
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
    List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
    try {
        JobConf jobConf = JobUtil.createJobConf(options, query);
        LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
        // numSplits arg not currently supported, use 1 as dummy arg
        InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
        for (InputSplit split : splits) {
            tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
        }
    } catch (IOException e) {
        LOG.error("Unable to submit query to HS2");
        throw new RuntimeException(e);
    }
    return tasks;
}
Example #4
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
@Override
public List<DataReaderFactory<ColumnarBatch>> createBatchDataReaderFactories() {
    try {
        boolean countStar = this.schema.length() == 0;
        String queryString = getQueryString(SchemaUtil.columnNames(schema), pushedFilters);
        List<DataReaderFactory<ColumnarBatch>> factories = new ArrayList<>();
        if (countStar) {
            LOG.info("Executing count with query: {}", queryString);
            factories.addAll(getCountStarFactories(queryString));
        } else {
            factories.addAll(getSplitsFactories(queryString));
        }
        return factories;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example #5
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", unpartitioned.toString()));
    IcebergSource source = new IcebergSource();
    for (int i = 0; i < 10; i += 1) {
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, Expressions.equal("id", i));
        List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
        Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());
        // validate row filtering
        assertEqualsSafe(SCHEMA.asStruct(), expected(i),
            read(unpartitioned.toString(), "id = " + i));
    }
}
Example #6
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", unpartitioned.toString()));
    IcebergSource source = new IcebergSource();
    DataSourceReader reader = source.createReader(options);
    pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));
    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());
    assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
        read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Example #7
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
    File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");
    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", location.toString()));
    IcebergSource source = new IcebergSource();
    DataSourceReader unfiltered = source.createReader(options);
    Assert.assertEquals("Unfiltered table should created 4 read tasks",
        4, planTasks(unfiltered).size());
    for (int i = 0; i < 10; i += 1) {
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, Expressions.equal("id", i));
        List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
        // validate predicate push-down
        Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());
        // validate row filtering
        assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
    }
}
Example #8
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
private List<DataReaderFactory<ColumnarBatch>> getCountStarFactories(String query) {
    List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>(100);
    long count = getCount(query);
    String numTasksString = HWConf.COUNT_TASKS.getFromOptionsMap(options);
    int numTasks = Integer.parseInt(numTasksString);
    long numPerTask = count / (numTasks - 1);
    long numLastTask = count % (numTasks - 1);
    for (int i = 0; i < (numTasks - 1); i++) {
        tasks.add(new CountDataReaderFactory(numPerTask));
    }
    tasks.add(new CountDataReaderFactory(numLastTask));
    return tasks;
}
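The arithmetic above spreads a single aggregate count across numTasks factories: the first numTasks - 1 factories each receive count / (numTasks - 1) and the last receives the remainder count % (numTasks - 1), so the per-factory values always sum back to count. For example, with count = 10 and numTasks = 4, three CountDataReaderFactory instances are created with 3 each and the final one with 1.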
Example #9
Source File: Reader.java From iceberg with Apache License 2.0
@Override
public List<DataReaderFactory<UnsafeRow>> createUnsafeRowReaderFactories() {
    String tableSchemaString = SchemaParser.toJson(table.schema());
    String expectedSchemaString = SchemaParser.toJson(lazySchema());
    // one serializable ReadTask per combined Iceberg scan task
    List<DataReaderFactory<UnsafeRow>> readTasks = Lists.newArrayList();
    for (CombinedScanTask task : tasks()) {
        readTasks.add(new ReadTask(task, tableSchemaString, expectedSchemaString, conf));
    }
    return readTasks;
}
Example #10
Source File: PartitioningRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("reader factories requested for table [" + _table + "]");
    initialize();
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : _splits) {
        DataReaderFactory<Row> factory =
            new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    return factories;
}
Example #11
Source File: SimpleRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Arrays.asList(new SimpleDataReaderFactory(_host, _port));
}
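Example #11 above is the reader-planning half of a complete Spark 2.3 DataSourceV2 source. For orientation, the sketch below shows roughly where createDataReaderFactories() sits in the full read path; the SingleValueSource class, its one-column schema, and the reuse of the hypothetical RangeReaderFactory from the sketch at the top of this page are illustrative assumptions, not the SimpleRowDataSource code.

import java.util.Collections;
import java.util.List;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical wiring: Spark calls createReader(), asks the reader for its
// schema, then asks for one DataReaderFactory per partition of the scan.
public class SingleValueSource implements DataSourceV2, ReadSupport {

    @Override
    public DataSourceReader createReader(DataSourceOptions options) {
        return new DataSourceReader() {
            @Override
            public StructType readSchema() {
                return new StructType().add("value", DataTypes.IntegerType);
            }

            @Override
            public List<DataReaderFactory<Row>> createDataReaderFactories() {
                // a single partition covering the values 0..9
                return Collections.singletonList(new RangeReaderFactory(0, 10));
            }
        };
    }
}

Such a source would be loaded by class name, e.g. spark.read().format(SingleValueSource.class.getName()).load().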
Example #12
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
    Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
    SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
    return unsafeReader.createUnsafeRowReaderFactories();
}
Example #13
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split,
                                                                JobConf jobConf,
                                                                long arrowAllocatorMax) {
    return new HiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
Example #14
Source File: FlexibleRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Collections.singletonList(
        new SimpleDataReaderFactory(_host, _port, _table, readSchema()));
}
Example #15
Source File: SimpleMockConnector.java From spark-llap with Apache License 2.0
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    return Arrays.asList(new SimpleMockDataReaderFactory());
}
Example #16
Source File: MockHiveWarehouseConnector.java From spark-llap with Apache License 2.0
@Override
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split,
                                                                JobConf jobConf,
                                                                long arrowAllocatorMax) {
    return new MockHiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
Example #17
Source File: MockHiveWarehouseConnector.java From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
    return Lists.newArrayList(new MockHiveWarehouseDataReaderFactory(null, null, 0));
}