org.apache.arrow.vector.ipc.message.ArrowRecordBatch Java Examples
The following examples show how to use
org.apache.arrow.vector.ipc.message.ArrowRecordBatch.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example. You may check out the related API usage on the sidebar.
Example #1
Source File: ArrowRecordBatchSerDe.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Reads the next token from the parser as binary content and deserializes it into an
 * ArrowRecordBatch that is registered with {@code blockAllocator}.
 *
 * @param jparser parser positioned immediately before the binary value
 * @param ctxt Jackson deserialization context (not used by this method)
 * @return the batch returned by {@code blockAllocator.registerBatch}
 * @throws IOException if the binary value cannot be read or deserialized
 * @throws IllegalStateException if the next token is not {@code VALUE_EMBEDDED_OBJECT}
 */
@Override
protected ArrowRecordBatch doDeserialize(JsonParser jparser, DeserializationContext ctxt)
        throws IOException
{
    JsonToken token = jparser.nextToken();
    if (token != JsonToken.VALUE_EMBEDDED_OBJECT) {
        // Report the token this check actually requires, plus the token that was found.
        throw new IllegalStateException("Expecting " + JsonToken.VALUE_EMBEDDED_OBJECT
                + " but found " + token + " at " + jparser.getCurrentLocation());
    }
    byte[] bytes = jparser.getBinaryValue();
    // Captures the batch created inside the lambda so it can be closed if registration fails.
    AtomicReference<ArrowRecordBatch> batch = new AtomicReference<>();
    try {
        return blockAllocator.registerBatch((BufferAllocator root) -> {
            batch.set((ArrowRecordBatch) MessageSerializer.deserializeMessageBatch(
                    new ReadChannel(Channels.newChannel(new ByteArrayInputStream(bytes))), root));
            return batch.get();
        });
    }
    catch (Exception ex) {
        if (batch.get() != null) {
            batch.get().close();
        }
        throw ex;
    }
}
Example #2
Source File: ArrowRecordBatchLoader.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * Deserializes {@code recordBatch} against {@code body} and loads the resulting buffers and
 * field nodes into the field vectors exposed by {@code vectorAccessible}, one schema field
 * at a time.
 *
 * @param recordBatch flatbuffer description of the batch
 * @param vectorAccessible supplies the schema and the target vectors
 * @param body buffer holding the batch body
 * @throws IllegalArgumentException if buffers remain after every field has been loaded
 * @throws RuntimeException wrapping the IOException if the batch cannot be deserialized
 */
public static void load(RecordBatch recordBatch, VectorAccessible vectorAccessible, ArrowBuf body) {
  final List<Field> schemaFields = vectorAccessible.getSchema().getFields();
  final List<FieldVector> targetVectors = FluentIterable.from(vectorAccessible)
      .transform(wrapper -> (FieldVector) wrapper.getValueVector())
      .toList();

  try {
    final ArrowRecordBatch deserialized = deserializeRecordBatch(recordBatch, body);
    final Iterator<ArrowFieldNode> nodes = deserialized.getNodes().iterator();
    final Iterator<ArrowBuf> buffers = deserialized.getBuffers().iterator();

    // Fields and vectors are paired by position.
    int position = 0;
    for (Field field : schemaFields) {
      final FieldVector target = targetVectors.get(position);
      loadBuffers(target, field, buffers, nodes);
      position++;
    }

    if (buffers.hasNext()) {
      throw new IllegalArgumentException("not all buffers were consumed. " + buffers);
    }
  } catch (IOException e) {
    throw new RuntimeException("could not deserialize batch for " + vectorAccessible.getSchema(), e);
  }
}
Example #3
Source File: ArrowUtils.java From konduit-serving with Apache License 2.0 | 6 votes |
/**
 * Opens the given bytes with an ArrowFileReader, loads the next (first) record batch and
 * returns it together with the converted schema.
 *
 * @param input the bytes to read
 * @return a pair of the converted schema and the writable record batch
 * @throws IOException if the reader fails while loading the batch
 */
public static Pair<Schema, ArrowWritableRecordBatch> readFromBytes(byte[] input) throws IOException {
    final BufferAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
    final ArrowFileReader fileReader = new ArrowFileReader(
            new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(input)), rootAllocator);
    fileReader.loadNextBatch();

    final VectorSchemaRoot root = fileReader.getVectorSchemaRoot();
    final Schema convertedSchema = toDatavecSchema(root.getSchema());

    // Unload the batch from the reader's root and load it straight back into the same root.
    final VectorUnloader rootUnloader = new VectorUnloader(root);
    final VectorLoader rootLoader = new VectorLoader(root);
    final ArrowRecordBatch unloaded = rootUnloader.getRecordBatch();
    rootLoader.load(unloaded);

    final ArrowWritableRecordBatch result = asDataVecBatch(unloaded, convertedSchema, root);
    result.setUnloader(rootUnloader);
    return Pair.of(convertedSchema, result);
}
Example #4
Source File: ArrowUtils.java From konduit-serving with Apache License 2.0 | 6 votes |
/**
 * Opens the given file stream with an ArrowFileReader, loads the next (first) record batch
 * and returns it together with the converted schema.
 *
 * @param input the file stream to read; its channel is used for seeking
 * @return a pair of the converted schema and the writable record batch
 * @throws IOException if the reader fails while loading the batch
 */
public static Pair<Schema, ArrowWritableRecordBatch> readFromFile(FileInputStream input) throws IOException {
    final BufferAllocator rootAllocator = new RootAllocator(Long.MAX_VALUE);
    final ArrowFileReader fileReader =
            new ArrowFileReader(new SeekableReadChannel(input.getChannel()), rootAllocator);
    fileReader.loadNextBatch();

    final VectorSchemaRoot root = fileReader.getVectorSchemaRoot();
    final Schema convertedSchema = toDatavecSchema(root.getSchema());

    // Unload the batch from the reader's root and load it straight back into the same root.
    final VectorUnloader rootUnloader = new VectorUnloader(root);
    final VectorLoader rootLoader = new VectorLoader(root);
    final ArrowRecordBatch unloaded = rootUnloader.getRecordBatch();
    rootLoader.load(unloaded);

    final ArrowWritableRecordBatch result = asDataVecBatch(unloaded, convertedSchema, root);
    result.setUnloader(rootUnloader);
    return Pair.of(convertedSchema, result);
}
Example #5
Source File: Stream.java From dremio-flight-connector with Apache License 2.0 | 6 votes |
/**
 * Tries to take one batch from the exchanger, load it and notify the listener.
 *
 * @return {@code true} if the stream is already done or a batch of length -1 was taken;
 *         {@code false} if the poll timed out or a regular batch was loaded and forwarded
 * @throws InterruptedException if interrupted while waiting on the exchanger
 */
public boolean drain() throws InterruptedException {
  if (isDone.get()) {
    logger.debug("already done, returning. {}", descriptor);
    return true;
  }
  logger.debug("it is not done. try to take from exchanger for {}", descriptor);
  ArrowRecordBatch batch = exchanger.poll(1, TimeUnit.SECONDS);
  if (batch == null) {
    logger.debug("timed out waiting, {}", descriptor);
    return false;
  }
  logger.debug("got batch, send to loader. exchanger size is {} for {}", exchanger.size(), descriptor);
  // NOTE(review): a length of -1 appears to be an end-of-stream marker; confirm with the producer.
  if (batch.getLength() == -1) {
    logger.debug("{} is done, stopping it", descriptor);
    return true;
  }
  try {
    loader.load(batch);
    logger.debug("tell listener to put next for {}", descriptor);
    listener.putNext();
  } finally {
    // Close the batch even when loading or notifying the listener throws.
    batch.close();
  }
  logger.debug("got a batch and sent to listener for {}", descriptor);
  return false;
}
Example #6
Source File: FragmentWritableBatch.java From dremio-oss with Apache License 2.0 | 6 votes |
/**
 * Builds a FragmentWritableBatch for a single receiving minor fragment, converting
 * {@code batch} to an ArrowRecordBatch via {@code getArrowRecordBatch}.
 *
 * @param queryId id of the query
 * @param sendMajorFragmentId sending major fragment id
 * @param sendMinorFragmentId sending minor fragment id
 * @param receiveMajorFragmentId receiving major fragment id
 * @param batch the vectors to convert into a record batch
 * @param receiveMinorFragmentId receiving minor fragment id
 * @return the new writable batch
 */
public static FragmentWritableBatch create(
    final QueryId queryId,
    final int sendMajorFragmentId,
    final int sendMinorFragmentId,
    final int receiveMajorFragmentId,
    final VectorAccessible batch,
    final int receiveMinorFragmentId) {
  return new FragmentWritableBatch(
      queryId,
      sendMajorFragmentId,
      sendMinorFragmentId,
      receiveMajorFragmentId,
      getArrowRecordBatch(batch),
      receiveMinorFragmentId);
}
Example #7
Source File: FragmentWritableBatch.java From dremio-oss with Apache License 2.0 | 6 votes |
public FragmentWritableBatch( final QueryId queryId, final int sendMajorFragmentId, final int sendMinorFragmentId, final int receiveMajorFragmentId, ArrowRecordBatch recordBatch, final int... receiveMinorFragmentId){ this.buffers = recordBatch.getBuffers().stream().map(buf -> buf.asNettyBuffer()).collect (Collectors.toList()).toArray(new ByteBuf[0]); this.recordCount = recordBatch.getLength(); FlatBufferBuilder fbbuilder = new FlatBufferBuilder(); fbbuilder.finish(recordBatch.writeTo(fbbuilder)); ByteBuffer arrowRecordBatch = fbbuilder.dataBuffer(); final FragmentRecordBatch.Builder builder = FragmentRecordBatch.newBuilder() .setArrowRecordBatch(ByteString.copyFrom(arrowRecordBatch)) .setQueryId(queryId) .setReceivingMajorFragmentId(receiveMajorFragmentId) .setSendingMajorFragmentId(sendMajorFragmentId) .setSendingMinorFragmentId(sendMinorFragmentId); for(final int i : receiveMinorFragmentId){ builder.addReceivingMinorFragmentId(i); } this.header = builder.build(); }
Example #8
Source File: ArrowConverter.java From DataVec with Apache License 2.0 | 6 votes |
/** * Read a datavec schema and record set * from the given arrow file. * @param input the input to read * @return the associated datavec schema and record */ public static Pair<Schema,ArrowWritableRecordBatch> readFromFile(FileInputStream input) throws IOException { BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); Schema retSchema = null; ArrowWritableRecordBatch ret = null; SeekableReadChannel channel = new SeekableReadChannel(input.getChannel()); ArrowFileReader reader = new ArrowFileReader(channel, allocator); reader.loadNextBatch(); retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema()); //load the batch VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot()); VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot()); ArrowRecordBatch recordBatch = unloader.getRecordBatch(); vectorLoader.load(recordBatch); ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot()); ret.setUnloader(unloader); return Pair.of(retSchema,ret); }
Example #9
Source File: ArrowConverter.java From DataVec with Apache License 2.0 | 6 votes |
/** * Read a datavec schema and record set * from the given bytes (usually expected to be an arrow format file) * @param input the input to read * @return the associated datavec schema and record */ public static Pair<Schema,ArrowWritableRecordBatch> readFromBytes(byte[] input) throws IOException { BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); Schema retSchema = null; ArrowWritableRecordBatch ret = null; SeekableReadChannel channel = new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(input)); ArrowFileReader reader = new ArrowFileReader(channel, allocator); reader.loadNextBatch(); retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema()); //load the batch VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot()); VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot()); ArrowRecordBatch recordBatch = unloader.getRecordBatch(); vectorLoader.load(recordBatch); ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot()); ret.setUnloader(unloader); return Pair.of(retSchema,ret); }
Example #10
Source File: BlockDeserializer.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Reads a JSON tree holding an allocator id, serialized schema bytes and serialized batch
 * bytes, and rebuilds the Block from them.
 *
 * @param jsonParser parser to read the tree from
 * @param deserializationContext Jackson context (not used by this method)
 * @return a Block created for the schema, with the record batch loaded when batch bytes are present
 * @throws IOException if reading the tree or decoding the binary fields fails
 */
@Override
public Block deserialize(JsonParser jsonParser, DeserializationContext deserializationContext)
        throws IOException
{
    final JsonNode tree = jsonParser.getCodec().readTree(jsonParser);
    final String allocatorId = tree.get(BlockSerializer.ALLOCATOR_ID_FIELD_NAME).asText();
    final byte[] schemaBytes = tree.get(BlockSerializer.SCHEMA_FIELD_NAME).binaryValue();
    final byte[] batchBytes = tree.get(BlockSerializer.BATCH_FIELD_NAME).binaryValue();

    final Schema schema = schemaSerDe.deserialize(new ByteArrayInputStream(schemaBytes));
    final Block block = getOrCreateAllocator(allocatorId).createBlock(schema);

    // An empty batch payload means there are no rows to load.
    if (batchBytes.length == 0) {
        return block;
    }
    block.loadRecordBatch(deserializeBatch(allocatorId, batchBytes));
    return block;
}
Example #11
Source File: AbstractArrowSourceFunction.java From flink with Apache License 2.0 | 6 votes |
/**
 * Emits rows from the pending Arrow batches until stopped or until no indexes remain.
 *
 * <p>For each pending entry, {@code f0} selects the batch to load and {@code f1} is the first
 * row to emit. The row position is advanced and each element collected while holding the
 * checkpoint lock.
 *
 * @param ctx source context used to collect elements and obtain the checkpoint lock
 * @throws Exception if loading a batch or reading a row fails
 */
@Override
public void run(SourceContext<OUT> ctx) throws Exception {
	VectorLoader vectorLoader = new VectorLoader(root);
	while (running && !indexesToEmit.isEmpty()) {
		Tuple2<Integer, Integer> indexToEmit = indexesToEmit.peek();
		// try-with-resources closes the batch even if loading it into the root fails.
		try (ArrowRecordBatch arrowRecordBatch = loadBatch(indexToEmit.f0)) {
			vectorLoader.load(arrowRecordBatch);
		}

		ArrowReader<OUT> arrowReader = createArrowReader(root);
		int rowCount = root.getRowCount();
		int nextRowId = indexToEmit.f1;
		while (nextRowId < rowCount) {
			OUT element = arrowReader.read(nextRowId);
			synchronized (ctx.getCheckpointLock()) {
				ctx.collect(element);
				// Update the stored row position under the same lock as the emit.
				indexToEmit.setField(++nextRowId, 1);
			}
		}

		synchronized (ctx.getCheckpointLock()) {
			indexesToEmit.pop();
		}
	}
}
Example #12
Source File: BlockSerDe.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Deserializes {@code in} into an ArrowRecordBatch and registers it with {@code allocator}.
 * If registration throws, any batch that was already created is closed before rethrowing.
 *
 * @param allocator the allocator to register the batch with
 * @param in the serialized record batch
 * @return the batch returned by {@code allocator.registerBatch}
 */
private ArrowRecordBatch deserializeRecordBatch(BlockAllocator allocator, byte[] in)
{
    final AtomicReference<ArrowRecordBatch> created = new AtomicReference<>();
    try {
        return allocator.registerBatch((BufferAllocator root) -> {
            ReadChannel channel = new ReadChannel(Channels.newChannel(new ByteArrayInputStream(in)));
            ArrowRecordBatch decoded =
                    (ArrowRecordBatch) MessageSerializer.deserializeMessageBatch(channel, root);
            created.set(decoded);
            return decoded;
        });
    }
    catch (Exception ex) {
        ArrowRecordBatch partial = created.get();
        if (partial != null) {
            partial.close();
        }
        throw ex;
    }
}
Example #13
Source File: BlockSerDe.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Reads the allocator id, schema and batch bytes from the parser, in that order, and
 * rebuilds the Block from them.
 *
 * @param jparser parser positioned before the allocator id field
 * @param ctxt Jackson context, passed through to the schema deserializer
 * @return a Block created for the schema, with the record batch loaded when batch bytes are present
 * @throws IOException if reading any field fails
 */
@Override
protected Block doDeserialize(JsonParser jparser, DeserializationContext ctxt)
        throws IOException
{
    final String allocatorId = getNextStringField(jparser, ALLOCATOR_ID_FIELD_NAME);

    assertFieldName(jparser, SCHEMA_FIELD_NAME);
    final Schema schema = schemaDeserializer.deserialize(jparser, ctxt);

    final byte[] batchBytes = getNextBinaryField(jparser, BATCH_FIELD_NAME);

    final Block block = getOrCreateAllocator(allocatorId).createBlock(schema);
    // An empty batch payload means there are no rows to load.
    if (batchBytes.length == 0) {
        return block;
    }
    block.loadRecordBatch(deserializeBatch(allocatorId, batchBytes));
    return block;
}
Example #14
Source File: ArrowConverter.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * Read a datavec schema and record set * from the given arrow file. * @param input the input to read * @return the associated datavec schema and record */ public static Pair<Schema,ArrowWritableRecordBatch> readFromFile(FileInputStream input) throws IOException { BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); Schema retSchema = null; ArrowWritableRecordBatch ret = null; SeekableReadChannel channel = new SeekableReadChannel(input.getChannel()); ArrowFileReader reader = new ArrowFileReader(channel, allocator); reader.loadNextBatch(); retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema()); //load the batch VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot()); VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot()); ArrowRecordBatch recordBatch = unloader.getRecordBatch(); vectorLoader.load(recordBatch); ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot()); ret.setUnloader(unloader); return Pair.of(retSchema,ret); }
Example #15
Source File: ArrowConverter.java From deeplearning4j with Apache License 2.0 | 6 votes |
/** * Read a datavec schema and record set * from the given bytes (usually expected to be an arrow format file) * @param input the input to read * @return the associated datavec schema and record */ public static Pair<Schema,ArrowWritableRecordBatch> readFromBytes(byte[] input) throws IOException { BufferAllocator allocator = new RootAllocator(Long.MAX_VALUE); Schema retSchema = null; ArrowWritableRecordBatch ret = null; SeekableReadChannel channel = new SeekableReadChannel(new ByteArrayReadableSeekableByteChannel(input)); ArrowFileReader reader = new ArrowFileReader(channel, allocator); reader.loadNextBatch(); retSchema = toDatavecSchema(reader.getVectorSchemaRoot().getSchema()); //load the batch VectorUnloader unloader = new VectorUnloader(reader.getVectorSchemaRoot()); VectorLoader vectorLoader = new VectorLoader(reader.getVectorSchemaRoot()); ArrowRecordBatch recordBatch = unloader.getRecordBatch(); vectorLoader.load(recordBatch); ret = asDataVecBatch(recordBatch,retSchema,reader.getVectorSchemaRoot()); ret.setUnloader(unloader); return Pair.of(retSchema,ret); }
Example #16
Source File: RecordBatchSerDe.java From aws-athena-query-federation with Apache License 2.0 | 6 votes |
/**
 * Attempts to deserialize the provided byte[] into an ArrowRecordBatch and register it with
 * the allocator. If registration throws after the batch was created, the batch is closed
 * before the exception is rethrown.
 *
 * @param in The byte[] that is expected to contain a serialized ArrowRecordBatch.
 * @return The resulting ArrowRecordBatch if the byte[] contains a valid ArrowRecordBatch.
 * @throws IOException if the bytes cannot be deserialized
 */
public ArrowRecordBatch deserialize(byte[] in)
        throws IOException
{
    // Captures the batch created inside the lambda so the catch block can actually release it.
    // Fully qualified so the block does not depend on an import being present in the file.
    final java.util.concurrent.atomic.AtomicReference<ArrowRecordBatch> batch =
            new java.util.concurrent.atomic.AtomicReference<>();
    try {
        return allocator.registerBatch((BufferAllocator root) -> {
            batch.set((ArrowRecordBatch) MessageSerializer.deserializeMessageBatch(
                    new ReadChannel(Channels.newChannel(new ByteArrayInputStream(in))), root));
            return batch.get();
        });
    }
    catch (Exception ex) {
        if (batch.get() != null) {
            batch.get().close();
        }
        throw ex;
    }
}
Example #17
Source File: ArrowRecordBatchSerDe.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Serializes the record batch into an in-memory buffer, writes the bytes to the generator as
 * binary, and closes the batch whether or not serialization succeeded.
 *
 * @param arrowRecordBatch the batch to serialize; closed by this method
 * @param jgen the generator that receives the binary value
 * @param provider Jackson serializer provider (not used by this method)
 * @throws IOException if serialization or writing fails
 */
@Override
protected void doSerialize(ArrowRecordBatch arrowRecordBatch, JsonGenerator jgen, SerializerProvider provider)
        throws IOException
{
    try {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final WriteChannel channel = new WriteChannel(Channels.newChannel(sink));
        MessageSerializer.serialize(channel, arrowRecordBatch);
        final byte[] payload = sink.toByteArray();
        jgen.writeBinary(payload);
    }
    finally {
        arrowRecordBatch.close();
    }
}
Example #18
Source File: ArrowConverter.java From deeplearning4j with Apache License 2.0 | 5 votes |
private static ArrowWritableRecordBatch asDataVecBatch(ArrowRecordBatch arrowRecordBatch, Schema schema, VectorSchemaRoot vectorLoader) { //iterate column wise over the feature vectors, returning entries List<FieldVector> fieldVectors = new ArrayList<>(); for(int j = 0; j < schema.numColumns(); j++) { String name = schema.getName(j); FieldVector fieldVector = vectorLoader.getVector(name); fieldVectors.add(fieldVector); } ArrowWritableRecordBatch ret = new ArrowWritableRecordBatch(fieldVectors, schema); ret.setArrowRecordBatch(arrowRecordBatch); return ret; }
Example #19
Source File: ArrowRecordBatchLoader.java From dremio-oss with Apache License 2.0 | 5 votes |
public static ArrowRecordBatch deserializeRecordBatch(RecordBatch recordBatchFB, ArrowBuf body) throws IOException { // Now read the body int nodesLength = recordBatchFB.nodesLength(); List<ArrowFieldNode> nodes = new ArrayList<>(); for (int i = 0; i < nodesLength; ++i) { FieldNode node = recordBatchFB.nodes(i); if ((int)node.length() != node.length() || (int)node.nullCount() != node.nullCount()) { throw new IOException("Cannot currently deserialize record batches with " + "node length larger than Int.MAX_VALUE"); } nodes.add(new ArrowFieldNode((int)node.length(), (int)node.nullCount())); } List<ArrowBuf> buffers = new ArrayList<>(); for (int i = 0; i < recordBatchFB.buffersLength(); ++i) { Buffer bufferFB = recordBatchFB.buffers(i); ArrowBuf vectorBuffer = body.slice((int)bufferFB.offset(), (int)bufferFB.length()); buffers.add(vectorBuffer); } if ((int)recordBatchFB.length() != recordBatchFB.length()) { throw new IOException("Cannot currently deserialize record batches over 2GB"); } ArrowRecordBatch arrowRecordBatch = new ArrowRecordBatch((int)recordBatchFB.length(), nodes, buffers, false); for (ArrowBuf buf : buffers) { buf.release(); } return arrowRecordBatch; }
Example #20
Source File: ArrowUtils.java From konduit-serving with Apache License 2.0 | 5 votes |
/**
 * Collects the field vector for every schema column, looked up by column name in
 * {@code vectorLoader}, and wraps them with the given record batch.
 *
 * @param arrowRecordBatch the record batch to attach to the result
 * @param schema supplies the column count and column names
 * @param vectorLoader the root the vectors are looked up in
 * @return a writable record batch over the collected vectors
 */
public static ArrowWritableRecordBatch asDataVecBatch(ArrowRecordBatch arrowRecordBatch, Schema schema, VectorSchemaRoot vectorLoader) {
    // Typed (non-raw) list, presized to the number of columns.
    List<FieldVector> fieldVectors = new ArrayList<>(schema.numColumns());

    for (int j = 0; j < schema.numColumns(); ++j) {
        String name = schema.getName(j);
        FieldVector fieldVector = vectorLoader.getVector(name);
        fieldVectors.add(fieldVector);
    }

    ArrowWritableRecordBatch ret = new ArrowWritableRecordBatch(fieldVectors, schema);
    ret.setArrowRecordBatch(arrowRecordBatch);
    return ret;
}
Example #21
Source File: BlockAllocatorImpl.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Attempts to close every batch tracked in {@code recordBatches} and then clears the
 * collection. A failure to close one batch is logged and does not stop the others.
 */
@VisibleForTesting
protected synchronized void closeBatches()
{
    logger.debug("closeBatches: {}", recordBatches.size());
    for (ArrowRecordBatch tracked : recordBatches) {
        closeAndLogFailure(tracked);
    }
    recordBatches.clear();
}

/** Closes a single batch, logging a warning instead of propagating any failure. */
private void closeAndLogFailure(ArrowRecordBatch tracked)
{
    try {
        tracked.close();
    }
    catch (Exception ex) {
        logger.warn("closeBatches: Error closing batch", ex);
    }
}
Example #22
Source File: RecordBatchSerDe.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Serializes the provided ArrowRecordBatch to the provided OutputStream and closes the batch
 * afterwards, whether or not the write succeeded.
 *
 * @param batch The ArrowRecordBatch to serialize; closed by this method.
 * @param out The OutputStream to write to.
 * @throws IOException if writing the batch fails
 */
public void serialize(ArrowRecordBatch batch, OutputStream out)
        throws IOException
{
    try {
        final WriteChannel channel = new WriteChannel(Channels.newChannel(out));
        MessageSerializer.serialize(channel, batch);
    }
    finally {
        batch.close();
    }
}
Example #23
Source File: FormationRecordWriter.java From dremio-flight-connector with Apache License 2.0 | 5 votes |
/**
 * Sets the root's row count to {@code length}, unloads it into a record batch and hands the
 * batch to {@code creator}.
 *
 * @param offset not used by this implementation
 * @param length number of rows to set on the root before unloading
 * @return the batch's computed body length, narrowed to int
 * @throws IOException if the thread is interrupted while adding the batch; the interrupt flag
 *         is restored before the exception is thrown
 */
@Override
public int writeBatch(int offset, int length) throws IOException {
  root.setRowCount(length);
  try (ArrowRecordBatch arb = unloader.getRecordBatch()) {
    long size = arb.computeBodyLength();
    creator.add(arb);
    return (int) size;
  } catch (InterruptedException e) {
    // Restore the interrupt flag so callers further up can still observe the interruption.
    Thread.currentThread().interrupt();
    throw new IOException(e);
  }
}
Example #24
Source File: BlockSerDe.java From aws-athena-query-federation with Apache License 2.0 | 5 votes |
/**
 * Serializes the record batch into a byte array and closes the batch afterwards, whether or
 * not serialization succeeded.
 *
 * @param recordBatch the batch to serialize; closed by this method
 * @return the serialized bytes
 * @throws IOException if serialization fails
 */
private byte[] serializeRecordBatch(ArrowRecordBatch recordBatch)
        throws IOException
{
    try {
        final ByteArrayOutputStream sink = new ByteArrayOutputStream();
        final WriteChannel channel = new WriteChannel(Channels.newChannel(sink));
        MessageSerializer.serialize(channel, recordBatch);
        return sink.toByteArray();
    }
    finally {
        recordBatch.close();
    }
}
Example #25
Source File: ArrowConverter.java From DataVec with Apache License 2.0 | 5 votes |
private static ArrowWritableRecordBatch asDataVecBatch(ArrowRecordBatch arrowRecordBatch, Schema schema, VectorSchemaRoot vectorLoader) { //iterate column wise over the feature vectors, returning entries List<FieldVector> fieldVectors = new ArrayList<>(); for(int j = 0; j < schema.numColumns(); j++) { String name = schema.getName(j); FieldVector fieldVector = vectorLoader.getVector(name); fieldVectors.add(fieldVector); } ArrowWritableRecordBatch ret = new ArrowWritableRecordBatch(fieldVectors, schema); ret.setArrowRecordBatch(arrowRecordBatch); return ret; }
Example #26
Source File: FragmentWritableBatch.java From dremio-oss with Apache License 2.0 | 4 votes |
/**
 * Unloads the vectors of {@code batch} into an ArrowRecordBatch.
 *
 * @param batch the vectors to unload
 * @return the record batch produced by a VectorUnloader built with both flags set to false
 */
public static ArrowRecordBatch getArrowRecordBatch(final VectorAccessible batch) {
  final VectorSchemaRoot schemaRoot = getVectorSchemaRoot(batch);
  return new VectorUnloader(schemaRoot, false, false).getRecordBatch();
}
Example #27
Source File: AbstractArrowSourceFunction.java From flink with Apache License 2.0 | 4 votes |
/**
 * Deserializes the record batch stored at the given index of {@code arrowData}.
 *
 * @param nextIndexOfArrowDataToProcess index into {@code arrowData} of the batch to load
 * @return the deserialized record batch
 * @throws IOException if the bytes cannot be deserialized
 */
private ArrowRecordBatch loadBatch(int nextIndexOfArrowDataToProcess) throws IOException {
	final byte[] serialized = arrowData[nextIndexOfArrowDataToProcess];
	final ReadChannel channel =
		new ReadChannel(Channels.newChannel(new ByteArrayInputStream(serialized)));
	return MessageSerializer.deserializeRecordBatch(channel, allocator);
}
Example #28
Source File: Stream.java From dremio-flight-connector with Apache License 2.0 | 4 votes |
/**
 * Sums the lengths of all batches currently held by the exchanger.
 *
 * @return the total of {@code getLength()} over the queued batches
 */
private int size() {
  return exchanger.stream()
      .mapToInt(ArrowRecordBatch::getLength)
      .sum();
}
Example #29
Source File: Stream.java From dremio-flight-connector with Apache License 2.0 | 4 votes |
/**
 * Creates a consumer with its own child allocator named "consumer".
 *
 * @param allocator parent allocator the child allocator is created from
 * @param descriptor flight descriptor stored on this consumer
 * @param exchanger queue of record batches stored on this consumer
 */
public Consumer(
    BufferAllocator allocator,
    FlightDescriptor descriptor,
    BlockingQueue<ArrowRecordBatch> exchanger) {
  this.descriptor = descriptor;
  this.exchanger = exchanger;
  // Child allocator: zero reservation, limit of Long.MAX_VALUE.
  this.allocator = allocator.newChildAllocator("consumer", 0, Long.MAX_VALUE);
}
Example #30
Source File: Block.java From aws-athena-query-federation with Apache License 2.0 | 4 votes |
/**
 * Unloads the Apache Arrow data in this Block in preparation for serialization.
 *
 * @return An ArrowRecordBatch unloaded from this Block's {@code vectorSchema}, for use in
 *         serializing the Block.
 */
public ArrowRecordBatch getRecordBatch()
{
    return new VectorUnloader(vectorSchema).getRecordBatch();
}