org.datavec.arrow.ArrowConverter Java Examples
The following examples show how to use
org.datavec.arrow.ArrowConverter.
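Before the individual examples, here is a minimal, self-contained sketch of the round trip the examples below rely on: DataVec Writable records are converted into Arrow FieldVector columns and then wrapped back into an Arrow-backed List<List<Writable>>. The two-column integer schema, the sample values, and the RootAllocator setup are assumptions for illustration only, not part of the original examples.

import java.util.Arrays;
import java.util.List;

import org.apache.arrow.memory.BufferAllocator;
import org.apache.arrow.memory.RootAllocator;
import org.apache.arrow.vector.FieldVector;
import org.datavec.api.transform.schema.Schema;
import org.datavec.api.writable.IntWritable;
import org.datavec.api.writable.Writable;
import org.datavec.arrow.ArrowConverter;

public class ArrowConverterRoundTripSketch {
    public static void main(String[] args) {
        // Arrow memory allocator passed to the ArrowConverter factory methods
        BufferAllocator bufferAllocator = new RootAllocator(Long.MAX_VALUE);

        // A small two-column integer schema (assumed for illustration)
        Schema schema = new Schema.Builder()
                .addColumnInteger("a")
                .addColumnInteger("b")
                .build();

        // Plain DataVec records: one List<Writable> per row
        List<List<Writable>> records = Arrays.asList(
                Arrays.<Writable>asList(new IntWritable(1), new IntWritable(2)),
                Arrays.<Writable>asList(new IntWritable(3), new IntWritable(4)));

        // Writables -> Arrow columns (one FieldVector per schema column)
        List<FieldVector> columns = ArrowConverter.toArrowColumns(bufferAllocator, schema, records);

        // Arrow columns -> Arrow-backed record batch, viewable as List<List<Writable>>
        List<List<Writable>> view = ArrowConverter.toArrowWritables(columns, schema);
        System.out.println(view.get(0)); // [1, 2]
    }
}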
Example #1
Source File: ArrowWritableRecordBatch.java From DataVec with Apache License 2.0
@Override
public List<Writable> get(int i) {
    List<Writable> ret = new ArrayList<>(schema.numColumns());
    for (int column = 0; column < schema.numColumns(); column++) {
        try {
            if (!list.get(column).isNull(offset + i))
                ret.add(ArrowConverter.fromEntry(offset + i, list.get(column), schema.getType(column)));
            else {
                ret.add(NullWritable.INSTANCE);
            }
        } catch (Exception e) {
            ret.add(NullWritable.INSTANCE);
        }
    }
    return ret;
}
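The get(i) implementation above pulls each cell out of its Arrow column with ArrowConverter.fromEntry. Below is a minimal sketch of that per-cell call, reusing the assumed schema, bufferAllocator, and records from the sketch in the introduction; the row and column indices are arbitrary.

List<FieldVector> columns = ArrowConverter.toArrowColumns(bufferAllocator, schema, records);
int row = 0;
int col = 1;
// Null entries have no Writable in the vector, so check first, exactly as get(i) does
Writable value = columns.get(col).isNull(row)
        ? NullWritable.INSTANCE
        : ArrowConverter.fromEntry(row, columns.get(col), schema.getType(col));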
Example #2
Source File: ArrowWritableRecordBatch.java From DataVec with Apache License 2.0
@Override
public List<Writable> set(int i, List<Writable> writable) {
    int rowOffset = offset + i;
    List<Writable> old = get(i);
    if (writable.size() != schema.numColumns()) {
        throw new IllegalArgumentException("Unable to set value. Wrong input types coming in");
    }

    int colIdx = 0;
    for (FieldVector fieldVector : list) {
        ArrowConverter.setValue(schema.getType(colIdx), fieldVector, writable.get(colIdx), rowOffset);
        colIdx++;
    }
    return old;
}
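set(i, writable) above delegates each per-column update to ArrowConverter.setValue, which writes a Writable into an existing FieldVector at a given row. A minimal sketch, continuing the previous snippet and assuming column 0 is an integer column and row 3 exists:

// Overwrite row 3 of column 0 in place; the ColumnType tells setValue how to
// interpret the Writable for that vector
ArrowConverter.setValue(schema.getType(0), columns.get(0), new IntWritable(42), 3);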
Example #3
Source File: ArrowRecordWriter.java From DataVec with Apache License 2.0
@Override
public PartitionMetaData writeBatch(List<List<Writable>> batch) throws IOException {
    if (partitioner.needsNewPartition()) {
        partitioner.currentOutputStream().flush();
        partitioner.currentOutputStream().close();
        partitioner.openNewStream();
    }

    if (batch instanceof ArrowWritableRecordBatch) {
        ArrowWritableRecordBatch arrowWritableRecordBatch = (ArrowWritableRecordBatch) batch;
        ArrowConverter.writeRecordBatchTo(arrowWritableRecordBatch, schema, partitioner.currentOutputStream());
    } else {
        ArrowConverter.writeRecordBatchTo(batch, schema, partitioner.currentOutputStream());
    }

    partitioner.currentOutputStream().flush();
    return PartitionMetaData.builder().numRecordsUpdated(batch.size()).build();
}
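The writer above ultimately persists every partition through ArrowConverter.writeRecordBatchTo. Here is a minimal sketch of calling it directly against a file, reusing the assumed schema and records from the introductory sketch; the file name is hypothetical and java.io.FileOutputStream must be imported.

try (FileOutputStream out = new FileOutputStream("records.arrow")) {
    // Serializes the batch to the stream using the given schema; may throw IOException
    ArrowConverter.writeRecordBatchTo(records, schema, out);
}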
Example #4
Source File: ArrowWritableRecordBatch.java From deeplearning4j with Apache License 2.0
@Override
public List<Writable> get(int i) {
    List<Writable> ret = new ArrayList<>(schema.numColumns());
    for (int column = 0; column < schema.numColumns(); column++) {
        try {
            if (!list.get(column).isNull(offset + i))
                ret.add(ArrowConverter.fromEntry(offset + i, list.get(column), schema.getType(column)));
            else {
                ret.add(NullWritable.INSTANCE);
            }
        } catch (Exception e) {
            ret.add(NullWritable.INSTANCE);
        }
    }
    return ret;
}
Example #5
Source File: ArrowWritableRecordBatch.java From deeplearning4j with Apache License 2.0
@Override
public List<Writable> set(int i, List<Writable> writable) {
    int rowOffset = offset + i;
    List<Writable> old = get(i);
    if (writable.size() != schema.numColumns()) {
        throw new IllegalArgumentException("Unable to set value. Wrong input types coming in");
    }

    int colIdx = 0;
    for (FieldVector fieldVector : list) {
        ArrowConverter.setValue(schema.getType(colIdx), fieldVector, writable.get(colIdx), rowOffset);
        colIdx++;
    }
    return old;
}
Example #6
Source File: ArrowRecordWriter.java From deeplearning4j with Apache License 2.0
@Override
public PartitionMetaData writeBatch(List<List<Writable>> batch) throws IOException {
    if (partitioner.needsNewPartition()) {
        partitioner.currentOutputStream().flush();
        partitioner.currentOutputStream().close();
        partitioner.openNewStream();
    }

    if (batch instanceof ArrowWritableRecordBatch) {
        ArrowWritableRecordBatch arrowWritableRecordBatch = (ArrowWritableRecordBatch) batch;
        ArrowConverter.writeRecordBatchTo(arrowWritableRecordBatch, schema, partitioner.currentOutputStream());
    } else {
        ArrowConverter.writeRecordBatchTo(batch, schema, partitioner.currentOutputStream());
    }

    partitioner.currentOutputStream().flush();
    return PartitionMetaData.builder().numRecordsUpdated(batch.size()).build();
}
Example #7
Source File: ArrowWritableRecordTimeSeriesBatchTests.java From DataVec with Apache License 2.0
@Test
public void testBasicIndexing() {
    Schema.Builder schema = new Schema.Builder();
    for (int i = 0; i < 3; i++) {
        schema.addColumnInteger(String.valueOf(i));
    }

    List<List<Writable>> timeStep = Arrays.asList(
            Arrays.<Writable>asList(new IntWritable(0), new IntWritable(1), new IntWritable(2)),
            Arrays.<Writable>asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)),
            Arrays.<Writable>asList(new IntWritable(4), new IntWritable(5), new IntWritable(6))
    );

    int numTimeSteps = 5;
    List<List<List<Writable>>> timeSteps = new ArrayList<>(numTimeSteps);
    for (int i = 0; i < numTimeSteps; i++) {
        timeSteps.add(timeStep);
    }

    List<FieldVector> fieldVectors = ArrowConverter.toArrowColumnsTimeSeries(bufferAllocator, schema.build(), timeSteps);
    assertEquals(3, fieldVectors.size());
    for (FieldVector fieldVector : fieldVectors) {
        for (int i = 0; i < fieldVector.getValueCount(); i++) {
            assertFalse("Index " + i + " was null for field vector " + fieldVector, fieldVector.isNull(i));
        }
    }

    ArrowWritableRecordTimeSeriesBatch arrowWritableRecordTimeSeriesBatch =
            new ArrowWritableRecordTimeSeriesBatch(fieldVectors, schema.build(), timeStep.size() * timeStep.get(0).size());
    assertEquals(timeSteps, arrowWritableRecordTimeSeriesBatch.toArrayList());
}
Example #8
Source File: LocalTransformExecutor.java From DataVec with Apache License 2.0
/**
 * Execute a join on the specified data
 *
 * @param join  Join to execute
 * @param left  Left data for join
 * @param right Right data for join
 * @return Joined data
 */
public static List<List<Writable>> executeJoin(Join join, List<List<Writable>> left, List<List<Writable>> right) {
    String[] leftColumnNames = join.getJoinColumnsLeft();
    int[] leftColumnIndexes = new int[leftColumnNames.length];
    for (int i = 0; i < leftColumnNames.length; i++) {
        leftColumnIndexes[i] = join.getLeftSchema().getIndexOfColumn(leftColumnNames[i]);
    }

    ExtractKeysFunction extractKeysFunction1 = new ExtractKeysFunction(leftColumnIndexes);
    List<Pair<List<Writable>, List<Writable>>> leftJV = left.stream()
            .filter(input -> input.size() != leftColumnNames.length)
            .map(input -> extractKeysFunction1.apply(input))
            .collect(toList());

    String[] rightColumnNames = join.getJoinColumnsRight();
    int[] rightColumnIndexes = new int[rightColumnNames.length];
    for (int i = 0; i < rightColumnNames.length; i++) {
        rightColumnIndexes[i] = join.getRightSchema().getIndexOfColumn(rightColumnNames[i]);
    }

    ExtractKeysFunction extractKeysFunction = new ExtractKeysFunction(rightColumnIndexes);
    List<Pair<List<Writable>, List<Writable>>> rightJV = right.stream()
            .filter(input -> input.size() != rightColumnNames.length)
            .map(input -> extractKeysFunction.apply(input))
            .collect(toList());

    Map<List<Writable>, Pair<List<List<Writable>>, List<List<Writable>>>> cogroupedJV =
            FunctionalUtils.cogroup(leftJV, rightJV);

    ExecuteJoinFromCoGroupFlatMapFunction executeJoinFromCoGroupFlatMapFunction =
            new ExecuteJoinFromCoGroupFlatMapFunction(join);
    List<List<Writable>> ret = cogroupedJV.entrySet().stream()
            .flatMap(input -> executeJoinFromCoGroupFlatMapFunction.call(Pair.of(input.getKey(), input.getValue())).stream())
            .collect(toList());

    Schema retSchema = join.getOutputSchema();
    return ArrowConverter.toArrowWritables(ArrowConverter.toArrowColumns(bufferAllocator, retSchema, ret), retSchema);
}
Example #9
Source File: CSVSparkTransform.java From DataVec with Apache License 2.0
/**
 * Convert a raw record via the {@link TransformProcess}
 * to a base64-encoded ndarray
 *
 * @param batch the record to convert
 * @return the base64-encoded ndarray
 * @throws IOException
 */
public Base64NDArrayBody toArray(BatchCSVRecord batch) throws IOException {
    List<List<Writable>> converted = execute(
            toArrowWritables(
                    toArrowColumnsString(bufferAllocator, transformProcess.getInitialSchema(), batch.getRecordsAsString()),
                    transformProcess.getInitialSchema()),
            transformProcess);

    ArrowWritableRecordBatch arrowRecordBatch = (ArrowWritableRecordBatch) converted;
    INDArray convert = ArrowConverter.toArray(arrowRecordBatch);
    return new Base64NDArrayBody(Nd4jBase64.base64String(convert));
}
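The key step above is ArrowConverter.toArray, which turns an Arrow-backed batch into an ND4J INDArray in one call. A minimal sketch of that tail end of the method, reusing the assumed allocator, schema, and records from the introductory sketch, and assuming (as the cast in the example suggests) that the Arrow-backed list is an ArrowWritableRecordBatch:

List<List<Writable>> writables = ArrowConverter.toArrowWritables(
        ArrowConverter.toArrowColumns(bufferAllocator, schema, records), schema);

// toArray expects the Arrow-backed batch implementation, hence the cast
INDArray array = ArrowConverter.toArray((ArrowWritableRecordBatch) writables);
String base64 = Nd4jBase64.base64String(array); // throws IOException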
Example #10
Source File: ArrowWritableRecordTimeSeriesBatchTests.java From deeplearning4j with Apache License 2.0
@Test
public void testBasicIndexing() {
    Schema.Builder schema = new Schema.Builder();
    for (int i = 0; i < 3; i++) {
        schema.addColumnInteger(String.valueOf(i));
    }

    List<List<Writable>> timeStep = Arrays.asList(
            Arrays.<Writable>asList(new IntWritable(0), new IntWritable(1), new IntWritable(2)),
            Arrays.<Writable>asList(new IntWritable(1), new IntWritable(2), new IntWritable(3)),
            Arrays.<Writable>asList(new IntWritable(4), new IntWritable(5), new IntWritable(6))
    );

    int numTimeSteps = 5;
    List<List<List<Writable>>> timeSteps = new ArrayList<>(numTimeSteps);
    for (int i = 0; i < numTimeSteps; i++) {
        timeSteps.add(timeStep);
    }

    List<FieldVector> fieldVectors = ArrowConverter.toArrowColumnsTimeSeries(bufferAllocator, schema.build(), timeSteps);
    assertEquals(3, fieldVectors.size());
    for (FieldVector fieldVector : fieldVectors) {
        for (int i = 0; i < fieldVector.getValueCount(); i++) {
            assertFalse("Index " + i + " was null for field vector " + fieldVector, fieldVector.isNull(i));
        }
    }

    ArrowWritableRecordTimeSeriesBatch arrowWritableRecordTimeSeriesBatch =
            new ArrowWritableRecordTimeSeriesBatch(fieldVectors, schema.build(), timeStep.size() * timeStep.get(0).size());
    assertEquals(timeSteps, arrowWritableRecordTimeSeriesBatch.toArrayList());
}
Example #11
Source File: LocalTransformExecutor.java From deeplearning4j with Apache License 2.0
/**
 * Execute a join on the specified data
 *
 * @param join  Join to execute
 * @param left  Left data for join
 * @param right Right data for join
 * @return Joined data
 */
public static List<List<Writable>> executeJoin(Join join, List<List<Writable>> left, List<List<Writable>> right) {
    String[] leftColumnNames = join.getJoinColumnsLeft();
    int[] leftColumnIndexes = new int[leftColumnNames.length];
    for (int i = 0; i < leftColumnNames.length; i++) {
        leftColumnIndexes[i] = join.getLeftSchema().getIndexOfColumn(leftColumnNames[i]);
    }

    ExtractKeysFunction extractKeysFunction1 = new ExtractKeysFunction(leftColumnIndexes);
    List<Pair<List<Writable>, List<Writable>>> leftJV = left.stream()
            .filter(input -> input.size() != leftColumnNames.length)
            .map(input -> extractKeysFunction1.apply(input))
            .collect(toList());

    String[] rightColumnNames = join.getJoinColumnsRight();
    int[] rightColumnIndexes = new int[rightColumnNames.length];
    for (int i = 0; i < rightColumnNames.length; i++) {
        rightColumnIndexes[i] = join.getRightSchema().getIndexOfColumn(rightColumnNames[i]);
    }

    ExtractKeysFunction extractKeysFunction = new ExtractKeysFunction(rightColumnIndexes);
    List<Pair<List<Writable>, List<Writable>>> rightJV = right.stream()
            .filter(input -> input.size() != rightColumnNames.length)
            .map(input -> extractKeysFunction.apply(input))
            .collect(toList());

    Map<List<Writable>, Pair<List<List<Writable>>, List<List<Writable>>>> cogroupedJV =
            FunctionalUtils.cogroup(leftJV, rightJV);

    ExecuteJoinFromCoGroupFlatMapFunction executeJoinFromCoGroupFlatMapFunction =
            new ExecuteJoinFromCoGroupFlatMapFunction(join);
    List<List<Writable>> ret = cogroupedJV.entrySet().stream()
            .flatMap(input -> executeJoinFromCoGroupFlatMapFunction.call(Pair.of(input.getKey(), input.getValue())).stream())
            .collect(toList());

    Schema retSchema = join.getOutputSchema();
    return ArrowConverter.toArrowWritables(ArrowConverter.toArrowColumns(bufferAllocator, retSchema, ret), retSchema);
}
Example #12
Source File: CSVSparkTransform.java From deeplearning4j with Apache License 2.0
/**
 * Convert a raw record via the {@link TransformProcess}
 * to a base64-encoded ndarray
 *
 * @param batch the record to convert
 * @return the base64-encoded ndarray
 * @throws IOException
 */
public Base64NDArrayBody toArray(BatchCSVRecord batch) throws IOException {
    List<List<Writable>> converted = execute(
            toArrowWritables(
                    toArrowColumnsString(bufferAllocator, transformProcess.getInitialSchema(), batch.getRecordsAsString()),
                    transformProcess.getInitialSchema()),
            transformProcess);

    ArrowWritableRecordBatch arrowRecordBatch = (ArrowWritableRecordBatch) converted;
    INDArray convert = ArrowConverter.toArray(arrowRecordBatch);
    return new Base64NDArrayBody(Nd4jBase64.base64String(convert));
}