org.apache.spark.sql.sources.v2.reader.DataReaderFactory Java Examples
The following examples show how to use
org.apache.spark.sql.sources.v2.reader.DataReaderFactory.
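DataReaderFactory belongs to the DataSourceV2 read API as it existed in Spark 2.3: a DataSourceReader plans the scan on the driver and returns one DataReaderFactory per partition, each factory is serialized to an executor, and its createDataReader() method builds the DataReader that actually produces the values for that partition. (In Spark 2.4 the interface was renamed InputPartition and DataReader became InputPartitionReader; Spark 3.x replaced the API altogether.) As a minimal, self-contained sketch of that contract — the RangeReaderFactory class and its integer-range behavior are invented for illustration and do not come from any of the projects below:

import java.io.IOException;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.RowFactory;
import org.apache.spark.sql.sources.v2.reader.DataReader;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;

// Illustrative sketch only: one factory per partition. The factory is built on
// the driver, shipped to an executor, and createDataReader() constructs the
// reader that emits this partition's rows.
public class RangeReaderFactory implements DataReaderFactory<Row> {
    private final int start;   // inclusive
    private final int end;     // exclusive

    public RangeReaderFactory(int start, int end) {
        this.start = start;
        this.end = end;
    }

    @Override
    public DataReader<Row> createDataReader() {
        return new DataReader<Row>() {
            private int current = start - 1;

            @Override
            public boolean next() throws IOException {
                current++;              // advance the cursor
                return current < end;   // false ends this partition
            }

            @Override
            public Row get() {
                return RowFactory.create(current);  // one-column row
            }

            @Override
            public void close() throws IOException {
                // release any connections or buffers held by this reader
            }
        };
    }
}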
Example #1
Source File: ParallelRowReadWriteDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    List<Split> splits = null;
    DBClientWrapper db = new DBClientWrapper(_host, _port);
    db.connect();
    try {
        if (_partitions == 0)
            splits = db.getSplits(_table);
        else
            splits = db.getSplits(_table, _partitions);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    } finally {
        db.disconnect();
    }
    // one DataReaderFactory per database split
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : splits) {
        DataReaderFactory<Row> factory =
            new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    log.info("created " + factories.size() + " factories");
    return factories;
}
Example #2
Source File: ParallelRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    List<Split> splits = null;
    DBClientWrapper db = new DBClientWrapper(_host, _port);
    db.connect();
    try {
        if (_partitions == 0)
            splits = db.getSplits(_table);
        else
            splits = db.getSplits(_table, _partitions);
    } catch (UnknownTableException ute) {
        throw new RuntimeException(ute);
    } finally {
        db.disconnect();
    }
    // one DataReaderFactory per database split
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : splits) {
        DataReaderFactory<Row> factory =
            new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    log.info("created " + factories.size() + " factories");
    return factories;
}
Example #3
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
    List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
    try {
        JobConf jobConf = JobUtil.createJobConf(options, query);
        LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
        // numSplits arg not currently supported, use 1 as dummy arg
        InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
        for (InputSplit split : splits) {
            tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
        }
    } catch (IOException e) {
        LOG.error("Unable to submit query to HS2");
        throw new RuntimeException(e);
    }
    return tasks;
}
Example #4
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
@Override
public List<DataReaderFactory<ColumnarBatch>> createBatchDataReaderFactories() {
    try {
        boolean countStar = this.schema.length() == 0;
        String queryString = getQueryString(SchemaUtil.columnNames(schema), pushedFilters);
        List<DataReaderFactory<ColumnarBatch>> factories = new ArrayList<>();
        if (countStar) {
            LOG.info("Executing count with query: {}", queryString);
            factories.addAll(getCountStarFactories(queryString));
        } else {
            factories.addAll(getSplitsFactories(queryString));
        }
        return factories;
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
Example #5
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedIDFilters() {
    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", unpartitioned.toString()));
    IcebergSource source = new IcebergSource();
    for (int i = 0; i < 10; i += 1) {
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, Expressions.equal("id", i));
        List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
        Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());
        // validate row filtering
        assertEqualsSafe(SCHEMA.asStruct(), expected(i),
            read(unpartitioned.toString(), "id = " + i));
    }
}
Example #6
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testUnpartitionedTimestampFilter() {
    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", unpartitioned.toString()));
    IcebergSource source = new IcebergSource();
    DataSourceReader reader = source.createReader(options);
    pushFilters(reader, Expressions.lessThan("ts", "2017-12-22T00:00:00+00:00"));
    List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());
    assertEqualsSafe(SCHEMA.asStruct(), expected(5, 6, 7, 8, 9),
        read(unpartitioned.toString(), "ts < cast('2017-12-22 00:00:00+00:00' as timestamp)"));
}
Example #7
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
@Test
public void testBucketPartitionedIDFilters() {
    File location = buildPartitionedTable("bucketed_by_id", BUCKET_BY_ID, "bucket4", "id");
    DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
        "path", location.toString()));
    IcebergSource source = new IcebergSource();
    DataSourceReader unfiltered = source.createReader(options);
    Assert.assertEquals("Unfiltered table should created 4 read tasks",
        4, planTasks(unfiltered).size());
    for (int i = 0; i < 10; i += 1) {
        DataSourceReader reader = source.createReader(options);
        pushFilters(reader, Expressions.equal("id", i));
        List<DataReaderFactory<UnsafeRow>> tasks = planTasks(reader);
        // validate predicate push-down
        Assert.assertEquals("Should create one task for a single bucket", 1, tasks.size());
        // validate row filtering
        assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(location.toString(), "id = " + i));
    }
}
Example #8
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
private List<DataReaderFactory<ColumnarBatch>> getCountStarFactories(String query) {
    List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>(100);
    long count = getCount(query);
    String numTasksString = HWConf.COUNT_TASKS.getFromOptionsMap(options);
    int numTasks = Integer.parseInt(numTasksString);
    long numPerTask = count / (numTasks - 1);
    long numLastTask = count % (numTasks - 1);
    for (int i = 0; i < (numTasks - 1); i++) {
        tasks.add(new CountDataReaderFactory(numPerTask));
    }
    tasks.add(new CountDataReaderFactory(numLastTask));
    return tasks;
}
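The arithmetic above spreads a single aggregate count across numTasks factories: the first numTasks - 1 factories each receive count / (numTasks - 1) and the last receives the remainder count % (numTasks - 1), so the per-factory values always sum back to count. For example, with count = 10 and numTasks = 4, three CountDataReaderFactory instances are created with 3 each and the final one with 1.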
Example #9
Source File: Reader.java From iceberg with Apache License 2.0
@Override
public List<DataReaderFactory<UnsafeRow>> createUnsafeRowReaderFactories() {
    String tableSchemaString = SchemaParser.toJson(table.schema());
    String expectedSchemaString = SchemaParser.toJson(lazySchema());
    // one serializable ReadTask per combined Iceberg scan task
    List<DataReaderFactory<UnsafeRow>> readTasks = Lists.newArrayList();
    for (CombinedScanTask task : tasks()) {
        readTasks.add(new ReadTask(task, tableSchemaString, expectedSchemaString, conf));
    }
    return readTasks;
}
Example #10
Source File: PartitioningRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("reader factories requested for table [" + _table + "]");
    initialize();
    List<DataReaderFactory<Row>> factories = new ArrayList<>();
    for (Split split : _splits) {
        DataReaderFactory<Row> factory =
            new SplitDataReaderFactory(_host, _port, _table, readSchema(), split);
        factories.add(factory);
    }
    return factories;
}
Example #11
Source File: SimpleRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Arrays.asList(new SimpleDataReaderFactory(_host, _port));
}
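Example #11 above is the reader-planning half of a complete Spark 2.3 DataSourceV2 source. For orientation, the sketch below shows roughly where createDataReaderFactories() sits in the full read path; the SingleValueSource class, its one-column schema, and the reuse of the hypothetical RangeReaderFactory from the sketch at the top of this page are illustrative assumptions, not the SimpleRowDataSource code.

import java.util.Collections;
import java.util.List;
import org.apache.spark.sql.Row;
import org.apache.spark.sql.sources.v2.DataSourceOptions;
import org.apache.spark.sql.sources.v2.DataSourceV2;
import org.apache.spark.sql.sources.v2.ReadSupport;
import org.apache.spark.sql.sources.v2.reader.DataReaderFactory;
import org.apache.spark.sql.sources.v2.reader.DataSourceReader;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;

// Hypothetical wiring: Spark calls createReader(), asks the reader for its
// schema, then asks for one DataReaderFactory per partition of the scan.
public class SingleValueSource implements DataSourceV2, ReadSupport {

    @Override
    public DataSourceReader createReader(DataSourceOptions options) {
        return new DataSourceReader() {
            @Override
            public StructType readSchema() {
                return new StructType().add("value", DataTypes.IntegerType);
            }

            @Override
            public List<DataReaderFactory<Row>> createDataReaderFactories() {
                // a single partition covering the values 0..9
                return Collections.singletonList(new RangeReaderFactory(0, 10));
            }
        };
    }
}

Such a source would be loaded by class name, e.g. spark.read().format(SingleValueSource.class.getName()).load().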
Example #12
Source File: TestFilteredScan.java From iceberg with Apache License 2.0
private List<DataReaderFactory<UnsafeRow>> planTasks(DataSourceReader reader) {
    Assert.assertTrue(reader instanceof SupportsScanUnsafeRow);
    SupportsScanUnsafeRow unsafeReader = (SupportsScanUnsafeRow) reader;
    return unsafeReader.createUnsafeRowReaderFactories();
}
Example #13
Source File: HiveWarehouseDataSourceReader.java From spark-llap with Apache License 2.0
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split,
                                                                JobConf jobConf,
                                                                long arrowAllocatorMax) {
    return new HiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
Example #14
Source File: FlexibleRowDataSource.java From spark-data-sources with MIT License
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    log.info("creating a single factory");
    return java.util.Collections.singletonList(
        new SimpleDataReaderFactory(_host, _port, _table, readSchema()));
}
Example #15
Source File: SimpleMockConnector.java From spark-llap with Apache License 2.0
@Override
public List<DataReaderFactory<Row>> createDataReaderFactories() {
    return Arrays.asList(new SimpleMockDataReaderFactory());
}
Example #16
Source File: MockHiveWarehouseConnector.java From spark-llap with Apache License 2.0
@Override
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split,
                                                                JobConf jobConf,
                                                                long arrowAllocatorMax) {
    return new MockHiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
Example #17
Source File: MockHiveWarehouseConnector.java From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
    return Lists.newArrayList(new MockHiveWarehouseDataReaderFactory(null, null, 0));
}