org.apache.spark.sql.vectorized.ColumnarBatch Java Examples

The following examples show how to use org.apache.spark.sql.vectorized.ColumnarBatch. Each example comes from an open source project; the source file, originating project, and license are noted above each snippet.
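
All of the examples below are variations on one core pattern: wrap one or more ColumnVector instances in a ColumnarBatch, set the row count with setNumRows, and hand the batch to Spark or walk it with rowIterator. Here is a minimal, self-contained sketch of that pattern; the class name and values are illustrative only and are not taken from any of the projects below.

import java.util.Iterator;

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.vectorized.ColumnVector;
import org.apache.spark.sql.vectorized.ColumnarBatch;

public class ColumnarBatchSketch {
  public static void main(String[] args) {
    int numRows = 3;
    // Allocate a single long column and fill it with sample values
    OnHeapColumnVector col = new OnHeapColumnVector(numRows, DataTypes.LongType);
    for (int i = 0; i < numRows; i++) {
      col.putLong(i, i * 10L);
    }

    // A ColumnarBatch is an array of column vectors plus a row count
    ColumnarBatch batch = new ColumnarBatch(new ColumnVector[] {col});
    batch.setNumRows(numRows);

    // Row-oriented access is a view over the columns, not a copy
    Iterator<InternalRow> rows = batch.rowIterator();
    while (rows.hasNext()) {
      System.out.println(rows.next().getLong(0));
    }

    batch.close(); // frees the memory held by the underlying column vectors
  }
}
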
Example #1
Source File: TestHelpers.java    From iceberg with Apache License 2.0
public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch,
                                     boolean checkArrowValidityVector) {
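  // Walk the batch row by row, checking every field of every row against the expected records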
  for (int rowId = 0; rowId < batch.numRows(); rowId++) {
    List<Types.NestedField> fields = struct.fields();
    InternalRow row = batch.getRow(rowId);
    Record rec = expected.next();
    for (int i = 0; i < fields.size(); i += 1) {
      Type fieldType = fields.get(i).type();
      Object expectedValue = rec.get(i);
      Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
      assertEqualsUnsafe(fieldType, expectedValue, actualValue);

      if (checkArrowValidityVector) {
        ColumnVector columnVector = batch.column(i);
        ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
        Assert.assertEquals("Nullability doesn't match", expectedValue == null, arrowVector.isNull(rowId));
      }
    }
  }
}
 
Example #2
Source File: HiveWarehouseDataReaderFactory.java    From spark-llap with Apache License 2.0
@Override
public DataReader<ColumnarBatch> createDataReader() {
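    // Rebuild the LlapInputSplit and JobConf from their serialized byte-array forms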
    LlapInputSplit llapInputSplit = new LlapInputSplit();
    ByteArrayInputStream splitByteArrayStream = new ByteArrayInputStream(splitBytes);
    ByteArrayInputStream confByteArrayStream = new ByteArrayInputStream(confBytes);
    JobConf conf = new JobConf();

    try (DataInputStream splitByteData = new DataInputStream(splitByteArrayStream);
        DataInputStream confByteData = new DataInputStream(confByteArrayStream)) {
        llapInputSplit.readFields(splitByteData);
        conf.readFields(confByteData);
        return getDataReader(llapInputSplit, conf, arrowAllocatorMax);
    } catch (Exception e) {
        throw new RuntimeException(e);
    }
}
 
Example #3
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>();
  try {
    JobConf jobConf = JobUtil.createJobConf(options, query);
    LlapBaseInputFormat llapInputFormat = new LlapBaseInputFormat(false, Long.MAX_VALUE);
    // numSplits arg is not currently supported; 1 is passed as a dummy value
    InputSplit[] splits = llapInputFormat.getSplits(jobConf, 1);
    for (InputSplit split : splits) {
      tasks.add(getDataReaderFactory(split, jobConf, getArrowAllocatorMax()));
    }
  } catch (IOException e) {
    LOG.error("Unable to submit query to HS2");
    throw new RuntimeException(e);
  }
  return tasks;
}
 
Example #4
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
@Override public List<DataReaderFactory<ColumnarBatch>> createBatchDataReaderFactories() {
  try {
    boolean countStar = this.schema.length() == 0;
    String queryString = getQueryString(SchemaUtil.columnNames(schema), pushedFilters);
    List<DataReaderFactory<ColumnarBatch>> factories = new ArrayList<>();
    if (countStar) {
      LOG.info("Executing count with query: {}", queryString);
      factories.addAll(getCountStarFactories(queryString));
    } else {
      factories.addAll(getSplitsFactories(queryString));
    }
    return factories;
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
 
Example #5
Source File: HiveWarehouseDataReader.java    From spark-llap with Apache License 2.0
@Override public ColumnarBatch get() {
  // Spark asks you to convert one column at a time so that
  // different column types can be handled differently.
  // NumOfCols << NumOfRows, so this per-column work is negligible.
  List<FieldVector> fieldVectors = wrapperWritable.getVectorSchemaRoot().getFieldVectors();
  if (columnVectors == null) {
    // Lazily create the ColumnVector[] and ColumnarBatch on first use
    columnVectors = new ColumnVector[fieldVectors.size()];
    columnarBatch = new ColumnarBatch(columnVectors);
  }
  Iterator<FieldVector> iterator = fieldVectors.iterator();
  int rowCount = -1;
  for (int i = 0; i < columnVectors.length; i++) {
    FieldVector fieldVector = iterator.next();
    columnVectors[i] = new ArrowColumnVector(fieldVector);
    if (rowCount == -1) {
      // All column vectors have the same length, so any column supplies the row count
      rowCount = fieldVector.getValueCount();
    }
  }
  columnarBatch.setNumRows(rowCount);
  return columnarBatch;
}
 
Example #6
Source File: ArrowBinaryIterator.java    From spark-bigquery-connector with Apache License 2.0
private Iterator<InternalRow> toArrowRows(VectorSchemaRoot root, List<String> namesInOrder) {
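    // Wrap each named Arrow vector in an ArrowSchemaConverter so Spark can treat it as a ColumnVector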
    ColumnVector[] columns = namesInOrder.stream()
            .map(name -> root.getVector(name))
            .map(vector -> new ArrowSchemaConverter(vector))
            .collect(Collectors.toList()).toArray(new ColumnVector[0]);

    ColumnarBatch batch = new ColumnarBatch(columns);
    batch.setNumRows(root.getRowCount());
    return batch.rowIterator();
}
 
Example #7
Source File: ColumnarBatchReader.java    From iceberg with Apache License 2.0
@Override
public final ColumnarBatch read(ColumnarBatch reuse, int numRowsToRead) {
  Preconditions.checkArgument(numRowsToRead > 0, "Invalid number of rows to read: %s", numRowsToRead);
  ColumnVector[] arrowColumnVectors = new ColumnVector[readers.length];

  if (reuse == null) {
    // No batch was passed in to reuse, so release the vectors held from the previous read
    closeVectors();
  }

  for (int i = 0; i < readers.length; i += 1) {
    vectorHolders[i] = readers[i].read(vectorHolders[i], numRowsToRead);
    int numRowsInVector = vectorHolders[i].numValues();
    Preconditions.checkState(
        numRowsInVector == numRowsToRead,
        "Number of rows in the vector %s didn't match expected %s", numRowsInVector, numRowsToRead);
    arrowColumnVectors[i] =
        IcebergArrowColumnVector.forHolder(vectorHolders[i], numRowsInVector);
  }
  ColumnarBatch batch = new ColumnarBatch(arrowColumnVectors);
  batch.setNumRows(numRowsToRead);
  return batch;
}
 
Example #8
Source File: Reader.java    From iceberg with Apache License 2.0
/**
 * This is called in the Spark Driver when data is to be materialized into {@link ColumnarBatch}
 */
@Override
public List<InputPartition<ColumnarBatch>> planBatchInputPartitions() {
  Preconditions.checkState(enableBatchRead(), "Batched reads not enabled");
  Preconditions.checkState(batchSize > 0, "Invalid batch size");
  String tableSchemaString = SchemaParser.toJson(table.schema());
  String expectedSchemaString = SchemaParser.toJson(lazySchema());
  String nameMappingString = table.properties().get(DEFAULT_NAME_MAPPING);

  List<InputPartition<ColumnarBatch>> readTasks = Lists.newArrayList();
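  // One ReadTask per combined scan task; executors turn each into ColumnarBatch instances via BatchReaderFactory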
  for (CombinedScanTask task : tasks()) {
    readTasks.add(new ReadTask<>(
        task, tableSchemaString, expectedSchemaString, nameMappingString, io, encryptionManager, caseSensitive,
        localityPreferred, new BatchReaderFactory(batchSize)));
  }
  LOG.info("Batching input partitions with {} tasks.", readTasks.size());

  return readTasks;
}
 
Example #9
Source File: BatchDataReader.java    From iceberg with Apache License 2.0
@Override
CloseableIterator<ColumnarBatch> open(FileScanTask task) {
  CloseableIterable<ColumnarBatch> iter;
  InputFile location = getInputFile(task);
  Preconditions.checkNotNull(location, "Could not find InputFile associated with FileScanTask");
  if (task.file().format() == FileFormat.PARQUET) {
    Parquet.ReadBuilder builder = Parquet.read(location)
        .project(expectedSchema)
        .split(task.start(), task.length())
        .createBatchedReaderFunc(fileSchema -> VectorizedSparkParquetReaders.buildReader(expectedSchema,
            fileSchema, /* setArrowValidityVector */ NullCheckingForGet.NULL_CHECKING_ENABLED))
        .recordsPerBatch(batchSize)
        .filter(task.residual())
        .caseSensitive(caseSensitive)
        // Spark eagerly consumes the batches. So the underlying memory allocated could be reused
        // without worrying about subsequent reads clobbering over each other. This improves
        // read performance as every batch read doesn't have to pay the cost of allocating memory.
        .reuseContainers();

    if (nameMapping != null) {
      builder.withNameMapping(NameMappingParser.fromJson(nameMapping));
    }

    iter = builder.build();
  } else {
    throw new UnsupportedOperationException(
        "Format: " + task.file().format() + " not supported for batched reads");
  }
  return iter.iterator();
}
 
Example #10
Source File: TestParquetVectorizedReads.java    From iceberg with Apache License 2.0
private void assertRecordsMatch(
    Schema schema, int expectedSize, Iterable<GenericData.Record> expected, File testFile,
    boolean setAndCheckArrowValidityBuffer, boolean reuseContainers)
    throws IOException {
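  // Read the file back in batches of up to 10,000 rows and compare each batch to the expected records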
  Parquet.ReadBuilder readBuilder = Parquet.read(Files.localInput(testFile))
      .project(schema)
      .recordsPerBatch(10000)
      .createBatchedReaderFunc(type -> VectorizedSparkParquetReaders.buildReader(
          schema,
          type,
          setAndCheckArrowValidityBuffer));
  if (reuseContainers) {
    readBuilder.reuseContainers();
  }
  try (CloseableIterable<ColumnarBatch> batchReader =
      readBuilder.build()) {
    Iterator<GenericData.Record> expectedIter = expected.iterator();
    Iterator<ColumnarBatch> batches = batchReader.iterator();
    int numRowsRead = 0;
    while (batches.hasNext()) {
      ColumnarBatch batch = batches.next();
      numRowsRead += batch.numRows();
      TestHelpers.assertEqualsBatch(schema.asStruct(), expectedIter, batch, setAndCheckArrowValidityBuffer);
    }
    Assert.assertEquals(expectedSize, numRowsRead);
  }
}
 
Example #11
Source File: SparkBatchScan.java    From iceberg with Apache License 2.0
@Override
public PartitionReader<ColumnarBatch> createColumnarReader(InputPartition partition) {
  if (partition instanceof ReadTask) {
    return new BatchReader((ReadTask) partition, batchSize);
  } else {
    throw new UnsupportedOperationException("Incorrect input partition type: " + partition);
  }
}
 
Example #12
Source File: CountDataReader.java    From spark-llap with Apache License 2.0
@Override public ColumnarBatch get() {
  int size = (numRows >= 1000) ? 1000 : (int) numRows;
  OnHeapColumnVector vector = new OnHeapColumnVector(size, DataTypes.LongType);
  for (int i = 0; i < size; i++) {
    vector.putLong(i, numRows);
  }
  numRows -= size;
  ColumnarBatch batch = new ColumnarBatch(new ColumnVector[] {vector});
  batch.setNumRows(size);
  return batch;
}
 
Example #13
Source File: FlightDataReader.java    From flight-spark-source with Apache License 2.0
@Override
public ColumnarBatch get() {
  start();
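  // Each Arrow FieldVector in the stream's root becomes one Spark column in the batch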
  ColumnarBatch batch = new ColumnarBatch(
    stream.getRoot().getFieldVectors()
      .stream()
      .map(FlightArrowColumnVector::new)
      .toArray(ColumnVector[]::new)
  );
  batch.setNumRows(stream.getRoot().getRowCount());
  return batch;
}
 
Example #14
Source File: FlightDataSourceReader.java    From flight-spark-source with Apache License 2.0
private List<InputPartition<ColumnarBatch>> planBatchInputPartitionsSerial(FlightInfo info) {
  LOGGER.warn("planning partitions for endpoints {}", Joiner.on(", ").join(info.getEndpoints().stream().map(e -> e.getLocations().get(0).getUri().toString()).collect(Collectors.toList())));
  List<InputPartition<ColumnarBatch>> batches = info.getEndpoints().stream().map(endpoint -> {
    Location location = (endpoint.getLocations().isEmpty()) ?
      Location.forGrpcInsecure(defaultLocation.getUri().getHost(), defaultLocation.getUri().getPort()) :
      endpoint.getLocations().get(0);
    FactoryOptions options = dataSourceOptions.value().copy(location, endpoint.getTicket().getBytes());
    LOGGER.warn("X1 {}", dataSourceOptions.value());
    return new FlightDataReaderFactory(lazySparkContext().broadcast(options));
  }).collect(Collectors.toList());
  LOGGER.info("Created {} batches from arrow endpoints", batches.size());
  return batches;
}
 
Example #15
Source File: FlightDataSourceReader.java    From flight-spark-source with Apache License 2.0
private List<InputPartition<ColumnarBatch>> planBatchInputPartitionsParallel() {
  try (FlightClient client = clientFactory.apply()) {
    FlightInfo info = client.getInfo(FlightDescriptor.command(sql.getBytes()));
    return planBatchInputPartitionsSerial(info);
  } catch (InterruptedException e) {
    throw new RuntimeException(e);
  }
}
 
Example #16
Source File: MockHiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override
public DataReader<ColumnarBatch> createDataReader() {
  try {
    return getDataReader(null, new JobConf(), Long.MAX_VALUE);
  } catch (Exception e) {
    throw new RuntimeException(e);
  }
}
 
Example #17
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
private List<DataReaderFactory<ColumnarBatch>> getCountStarFactories(String query) {
  List<DataReaderFactory<ColumnarBatch>> tasks = new ArrayList<>(100);
  long count = getCount(query);
  String numTasksString = HWConf.COUNT_TASKS.getFromOptionsMap(options);
  int numTasks = Integer.parseInt(numTasksString);
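  // Spread the count over the readers: numTasks - 1 readers each carry an equal share
  // and the final reader carries the remainder (this assumes numTasks > 1)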
  long numPerTask = count / (numTasks - 1);
  long numLastTask = count % (numTasks - 1);
  for (int i = 0; i < (numTasks - 1); i++) {
    tasks.add(new CountDataReaderFactory(numPerTask));
  }
  tasks.add(new CountDataReaderFactory(numLastTask));
  return tasks;
}
 
Example #18
Source File: MockHiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override
protected DataReader<ColumnarBatch> getDataReader(LlapInputSplit split, JobConf jobConf, long arrowAllocatorMax)
    throws Exception {
  return new MockHiveWarehouseDataReader(split, jobConf, arrowAllocatorMax);
}
 
Example #19
Source File: CountDataReaderFactory.java    From spark-llap with Apache License 2.0
@Override
public DataReader<ColumnarBatch> createDataReader() {
  return new CountDataReader(numRows);
}
 
Example #20
Source File: MockHiveWarehouseConnector.java    From spark-llap with Apache License 2.0
@Override
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split, JobConf jobConf, long arrowAllocatorMax) {
  return new MockHiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
 
Example #21
Source File: MockHiveWarehouseConnector.java    From spark-llap with Apache License 2.0
protected List<DataReaderFactory<ColumnarBatch>> getSplitsFactories(String query) {
  return Lists.newArrayList(new MockHiveWarehouseDataReaderFactory(null, null, 0));
}
 
Example #22
Source File: HiveWarehouseDataReaderFactory.java    From spark-llap with Apache License 2.0
protected DataReader<ColumnarBatch> getDataReader(LlapInputSplit split, JobConf jobConf, long arrowAllocatorMax)
    throws Exception {
    return new HiveWarehouseDataReader(split, jobConf, arrowAllocatorMax);
}
 
Example #23
Source File: HiveWarehouseDataSourceReader.java    From spark-llap with Apache License 2.0
protected DataReaderFactory<ColumnarBatch> getDataReaderFactory(InputSplit split, JobConf jobConf, long arrowAllocatorMax) {
  return new HiveWarehouseDataReaderFactory(split, jobConf, arrowAllocatorMax);
}
 
Example #24
Source File: Reader.java    From iceberg with Apache License 2.0
@Override
public InputPartitionReader<ColumnarBatch> create(CombinedScanTask task, Schema tableSchema, Schema expectedSchema,
                                                  String nameMapping, FileIO io,
                                                  EncryptionManager encryptionManager, boolean caseSensitive) {
  return new BatchReader(task, expectedSchema, nameMapping, io, encryptionManager, caseSensitive, batchSize);
}
 
Example #25
Source File: FlightDataReaderFactory.java    From flight-spark-source with Apache License 2.0
@Override
public InputPartitionReader<ColumnarBatch> createPartitionReader() {
  return new FlightDataReader(options);
}
 
Example #26
Source File: FlightDataSourceReader.java    From flight-spark-source with Apache License 2.0
@Override
public List<InputPartition<ColumnarBatch>> planBatchInputPartitions() {
  return planBatchInputPartitionsParallel();
}