org.apache.spark.sql.catalyst.InternalRow Java Examples
The following examples show how to use org.apache.spark.sql.catalyst.InternalRow in Java. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
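InternalRow is Spark SQL's internal, ordinal-based row representation: values are stored in Spark's internal types (UTF8String for strings, Decimal for decimals, and so on) and are read with typed getters after a null check. As a minimal, hypothetical sketch that is not taken from any of the projects below, a GenericInternalRow can be built and read like this:

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.UTF8String;

public class InternalRowSketch {
  public static void main(String[] args) {
    // Values must already be in Spark's internal representation,
    // e.g. UTF8String instead of java.lang.String.
    InternalRow row = new GenericInternalRow(new Object[] {1L, UTF8String.fromString("a"), null});

    long id = row.getLong(0);                      // 1
    String name = row.getUTF8String(1).toString(); // "a"
    boolean missing = row.isNullAt(2);             // true

    System.out.println(id + " " + name + " " + missing);
  }
}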
Example #1
Source File: HiveStreamingDataWriter.java From spark-llap with Apache License 2.0

@Override
public void write(final InternalRow record) throws IOException {
  String delimitedRow = Joiner.on(",").useForNull("")
      .join(scala.collection.JavaConversions.seqAsJavaList(record.toSeq(schema)));
  try {
    streamingConnection.write(delimitedRow.getBytes(Charset.forName("UTF-8")));
    rowsWritten++;
    if (rowsWritten > 0 && commitAfterNRows > 0 && (rowsWritten % commitAfterNRows == 0)) {
      LOG.info("Committing transaction after rows: {}", rowsWritten);
      streamingConnection.commitTransaction();
      streamingConnection.beginTransaction();
    }
  } catch (StreamingException e) {
    throw new IOException(e);
  }
}
Example #2
Source File: SparkParquetReadersFlatDataBenchmark.java From iceberg with Apache License 2.0

@Benchmark
@Threads(1)
public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    Iterable<InternalRow> unsafeRows = Iterables.transform(
        rows,
        APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke);

    for (InternalRow row : unsafeRows) {
      blackhole.consume(row);
    }
  }
}
Example #3
Source File: TestHelpers.java From iceberg with Apache License 2.0

public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch,
                                     boolean checkArrowValidityVector) {
  for (int rowId = 0; rowId < batch.numRows(); rowId++) {
    List<Types.NestedField> fields = struct.fields();
    InternalRow row = batch.getRow(rowId);
    Record rec = expected.next();
    for (int i = 0; i < fields.size(); i += 1) {
      Type fieldType = fields.get(i).type();
      Object expectedValue = rec.get(i);
      Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
      assertEqualsUnsafe(fieldType, expectedValue, actualValue);

      if (checkArrowValidityVector) {
        ColumnVector columnVector = batch.column(i);
        ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
        Assert.assertEquals("Nullability doesn't match", expectedValue == null, arrowVector.isNull(rowId));
      }
    }
  }
}
Example #4
Source File: PartitionedWriter.java From iceberg with Apache License 2.0

@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  PartitionKey currentKey = getCurrentKey();
  if (!key.equals(currentKey)) {
    closeCurrent();
    completedPartitions.add(currentKey);

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed files for partition: " + key.toPath());
    }

    setCurrentKey(key.copy());
    openCurrent();
  }

  writeInternal(row);
}
Example #5
Source File: TestDataFileSerialization.java From iceberg with Apache License 2.0

@Test
public void testParquetWriterSplitOffsets() throws IOException {
  Iterable<InternalRow> records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L);
  File parquetFile = new File(
      temp.getRoot(),
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
  FileAppender<InternalRow> writer = Parquet.write(Files.localOutput(parquetFile))
      .schema(DATE_SCHEMA)
      .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType))
      .build();
  try {
    writer.addAll(records);
  } finally {
    writer.close();
  }

  Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
  File dataFile = temp.newFile();
  try (Output out = new Output(new FileOutputStream(dataFile))) {
    kryo.writeClassAndObject(out, writer.splitOffsets());
  }
  try (Input in = new Input(new FileInputStream(dataFile))) {
    kryo.readClassAndObject(in);
  }
}
Example #6
Source File: CodegenExamples.java From iceberg with Apache License 2.0

public UnsafeRow apply(InternalRow i) {
  holder.reset();

  rowWriter.zeroOutNullBytes();

  boolean isNull = i.isNullAt(0);
  long value = isNull ? -1L : (i.getLong(0));
  if (isNull) {
    rowWriter.setNullAt(0);
  } else {
    rowWriter.write(0, value);
  }

  boolean isNull1 = i.isNullAt(1);
  UTF8String value1 = isNull1 ? null : (i.getUTF8String(1));
  if (isNull1) {
    rowWriter.setNullAt(1);
  } else {
    rowWriter.write(1, value1);
  }
  result.setTotalSize(holder.totalSize());
  return result;
}
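The generated projection above writes each field into an UnsafeRow through an UnsafeRowWriter backed by a BufferHolder. A rough hand-written equivalent, sketched under the assumption of Spark 2.4+ where UnsafeRowWriter manages its own buffer and exposes getRow(), could build the same two-column row like this:

import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter;
import org.apache.spark.unsafe.types.UTF8String;

public class UnsafeRowSketch {
  public static void main(String[] args) {
    // Two fields: a long and a string, mirroring the generated projection above.
    UnsafeRowWriter rowWriter = new UnsafeRowWriter(2);

    rowWriter.reset();            // start a new row
    rowWriter.zeroOutNullBytes(); // clear the null-tracking bits

    rowWriter.write(0, 42L);                            // field 0: long
    rowWriter.write(1, UTF8String.fromString("hello")); // field 1: string

    UnsafeRow row = rowWriter.getRow();
    System.out.println(row.getLong(0) + " " + row.getUTF8String(1));
  }
}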
Example #7
Source File: SparkParquetReadersNestedDataBenchmark.java From iceberg with Apache License 2.0

@Benchmark
@Threads(1)
public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    Iterable<InternalRow> unsafeRows = Iterables.transform(
        rows,
        APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke);

    for (InternalRow row : unsafeRows) {
      blackhole.consume(row);
    }
  }
}
Example #8
Source File: PartitionKey.java From iceberg with Apache License 2.0

@SuppressWarnings("unchecked")
PartitionKey(PartitionSpec spec, Schema inputSchema) {
  this.spec = spec;

  List<PartitionField> fields = spec.fields();
  this.size = fields.size();
  this.partitionTuple = new Object[size];
  this.transforms = new Transform[size];
  this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size);

  Schema schema = spec.schema();
  Map<Integer, Accessor<InternalRow>> newAccessors = buildAccessors(inputSchema);
  for (int i = 0; i < size; i += 1) {
    PartitionField field = fields.get(i);
    Accessor<InternalRow> accessor = newAccessors.get(field.sourceId());
    if (accessor == null) {
      throw new RuntimeException(
          "Cannot build accessor for field: " + schema.findField(field.sourceId()));
    }
    this.accessors[i] = accessor;
    this.transforms[i] = field.transform();
  }
}
Example #9
Source File: SparkParquetReadersNestedDataBenchmark.java From iceberg with Apache License 2.0

@Benchmark
@Threads(1)
public void readUsingSparkReader(Blackhole blackhole) throws IOException {
  StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA);
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .readSupport(new ParquetReadSupport())
      .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json())
      .set("spark.sql.parquet.binaryAsString", "false")
      .set("spark.sql.parquet.int96AsTimestamp", "false")
      .callInit()
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
Example #10
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), "id = " + i));
  }
}
Example #11
Source File: Writer.java From iceberg with Apache License 2.0

@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  if (!key.equals(currentKey)) {
    closeCurrent();

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed file for partition: " + key.toPath());
    }

    this.currentKey = key.copy();
    this.currentPath = outputPathFunc.apply(currentKey);
    OutputFile file = HadoopOutputFile.fromPath(currentPath, conf);
    this.currentAppender = factory.newAppender(file, format);
  }

  currentAppender.add(row);
}
Example #12
Source File: BigQueryDataSourceReader.java From spark-bigquery-connector with Apache License 2.0

@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
  if (schema.map(StructType::isEmpty).orElse(false)) {
    // create empty projection
    return createEmptyProjectionPartitions();
  }

  ImmutableList<String> selectedFields = schema
      .map(requiredSchema -> ImmutableList.copyOf(requiredSchema.fieldNames()))
      .orElse(ImmutableList.of());
  Optional<String> filter = emptyIfNeeded(SparkFilterUtils.getCompiledFilter(
      readSessionCreatorConfig.getReadDataFormat(), globalFilter, pushedFilters));
  ReadSessionResponse readSessionResponse = readSessionCreator.create(
      tableId, selectedFields, filter, readSessionCreatorConfig.getMaxParallelism());
  ReadSession readSession = readSessionResponse.getReadSession();
  return readSession.getStreamsList().stream()
      .map(stream -> new BigQueryInputPartition(
          bigQueryReadClientFactory,
          stream.getName(),
          readSessionCreatorConfig.getMaxReadRowsRetries(),
          createConverter(selectedFields, readSessionResponse)))
      .collect(Collectors.toList());
}
Example #13
Source File: SparkParquetReaders.java From iceberg with Apache License 2.0

@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
Example #14
Source File: PartitionKey.java From iceberg with Apache License 2.0

@SuppressWarnings("unchecked")
void partition(InternalRow row) {
  for (int i = 0; i < partitionTuple.length; i += 1) {
    Transform<Object, Object> transform = transforms[i];
    partitionTuple[i] = transform.apply(accessors[i].get(row));
  }
}
Example #15
Source File: PartitionKey.java From iceberg with Apache License 2.0

private static Accessor<InternalRow> newAccessor(int position, boolean isOptional, Types.StructType type,
                                                 Accessor<InternalRow> accessor) {
  int size = type.fields().size();
  if (isOptional) {
    // the wrapped position handles null layers
    return new WrappedPositionAccessor(position, size, accessor);
  } else if (accessor.getClass() == PositionAccessor.class) {
    return new Position2Accessor(position, size, (PositionAccessor) accessor);
  } else if (accessor instanceof Position2Accessor) {
    return new Position3Accessor(position, size, (Position2Accessor) accessor);
  } else {
    return new WrappedPositionAccessor(position, size, accessor);
  }
}
Example #16
Source File: PartitionKey.java From iceberg with Apache License 2.0

private static Accessor<InternalRow> newAccessor(int position, Type type) {
  switch (type.typeId()) {
    case STRING:
      return new StringAccessor(position, SparkSchemaUtil.convert(type));
    case DECIMAL:
      return new DecimalAccessor(position, SparkSchemaUtil.convert(type));
    case BINARY:
      return new BytesAccessor(position, SparkSchemaUtil.convert(type));
    default:
      return new PositionAccessor(position, SparkSchemaUtil.convert(type));
  }
}
Example #17
Source File: RowDataReader.java From iceberg with Apache License 2.0

private CloseableIterable<InternalRow> newDataIterable(DataTask task, Schema readSchema) {
  StructInternalRow row = new StructInternalRow(tableSchema.asStruct());
  CloseableIterable<InternalRow> asSparkRows = CloseableIterable.transform(
      task.asDataTask().rows(), row::setStruct);
  return CloseableIterable.transform(
      asSparkRows, APPLY_PROJECTION.bind(projection(readSchema, tableSchema))::invoke);
}
Example #18
Source File: RowDataReader.java From iceberg with Apache License 2.0

private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}
Example #19
Source File: PartitionKey.java From iceberg with Apache License 2.0

@Override
public Object get(InternalRow row) {
  if (row.isNullAt(position())) {
    return null;
  }
  return ByteBuffer.wrap((byte[]) row.get(position(), type()));
}
Example #20
Source File: PartitionKey.java From iceberg with Apache License 2.0

@Override
public Object get(InternalRow row) {
  if (row.isNullAt(p)) {
    return null;
  }
  return row.get(p, type);
}
Example #21
Source File: PartitionKey.java From iceberg with Apache License 2.0

@Override
public Object get(InternalRow row) {
  if (row.isNullAt(position())) {
    return null;
  }
  return row.get(position(), type()).toString();
}
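Examples #19 through #21 all hinge on InternalRow.get(ordinal, dataType) returning Spark's internal representation, which is why the accessors above wrap binary values in a ByteBuffer and call toString() on strings. A small hypothetical illustration, not taken from the Iceberg sources:

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.unsafe.types.UTF8String;

import java.nio.ByteBuffer;

public class GetByDataTypeSketch {
  public static void main(String[] args) {
    InternalRow row = new GenericInternalRow(new Object[] {
        UTF8String.fromString("iceberg"), new byte[] {1, 2, 3}});

    // get(ordinal, dataType) hands back Spark's internal representation...
    Object rawString = row.get(0, DataTypes.StringType); // UTF8String, not java.lang.String
    Object rawBinary = row.get(1, DataTypes.BinaryType); // byte[]

    // ...so callers convert, just like the accessors above.
    String javaString = rawString.toString();
    ByteBuffer buffer = ByteBuffer.wrap((byte[]) rawBinary);

    System.out.println(javaString + " " + buffer.remaining());
  }
}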
Example #22
Source File: SparkValueReaders.java From iceberg with Apache License 2.0

@Override
protected InternalRow reuseOrCreate(Object reuse) {
  if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) {
    return (InternalRow) reuse;
  }
  return new GenericInternalRow(numFields);
}
Example #23
Source File: Reader.java From iceberg with Apache License 2.0

private CloseableIterable<InternalRow> newAvroIterable(InputFile location, FileScanTask task, Schema readSchema) {
  return Avro.read(location)
      .reuseContainers()
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(SparkAvroReader::new)
      .build();
}
Example #24
Source File: SparkParquetReaders.java From iceberg with Apache License 2.0

@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
Example #25
Source File: Writer.java From iceberg with Apache License 2.0

PartitionedWriter(PartitionSpec spec, FileFormat format, Configuration conf,
                  AppenderFactory<InternalRow> factory, Function<PartitionKey, Path> outputPathFunc) {
  this.spec = spec;
  this.format = format;
  this.conf = conf;
  this.factory = factory;
  this.outputPathFunc = outputPathFunc;
  this.key = new PartitionKey(spec);
}
Example #26
Source File: PartitionKey.java From iceberg with Apache License 2.0

private static Accessor<InternalRow> newAccessor(int p, Type type) {
  switch (type.typeId()) {
    case STRING:
      return new StringAccessor(p, convert(type));
    case DECIMAL:
      return new DecimalAccessor(p, convert(type));
    default:
      return new PositionAccessor(p, convert(type));
  }
}
Example #27
Source File: TestDataFrameWrites.java From iceberg with Apache License 2.0

private Dataset<Row> createDataset(List<Record> records, Schema schema) throws IOException {
  // this uses the SparkAvroReader to create a DataFrame from the list of records
  // it assumes that SparkAvroReader is correct
  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : records) {
      writer.add(rec);
    }
  }

  List<InternalRow> rows;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  // make sure the dataframe matches the records before moving on
  for (int i = 0; i < records.size(); i += 1) {
    assertEqualsUnsafe(schema.asStruct(), records.get(i), rows.get(i));
  }

  JavaRDD<InternalRow> rdd = sc.parallelize(rows);
  return spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(schema), false);
}
Example #28
Source File: ColumnarBatch.java From spliceengine with GNU Affero General Public License v3.0

/**
 * Returns an iterator over the rows in this batch.
 */
public Iterator<InternalRow> rowIterator() {
  final int maxRows = numRows;
  final MutableColumnarRow row = new MutableColumnarRow(columns);
  return new Iterator<InternalRow>() {
    int rowId = 0;

    @Override
    public boolean hasNext() {
      return rowId < maxRows;
    }

    @Override
    public InternalRow next() {
      if (rowId >= maxRows) {
        throw new NoSuchElementException();
      }
      row.rowId = rowId++;
      return row;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  };
}
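A caller typically drains rowIterator() to process columnar data row by row. The sketch below is hypothetical and uses Spark's own ColumnarBatch and OnHeapColumnVector rather than the spliceengine copy shown above; the APIs are assumed to match Spark 2.4+:

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.vectorized.ColumnarBatch;

import java.util.Iterator;

public class ColumnarBatchSketch {
  public static void main(String[] args) {
    StructType schema = new StructType().add("id", DataTypes.IntegerType);

    // Allocate one on-heap vector per column and fill a few values.
    OnHeapColumnVector[] columns = OnHeapColumnVector.allocateColumns(3, schema);
    for (int i = 0; i < 3; i++) {
      columns[0].putInt(i, i * 10);
    }

    ColumnarBatch batch = new ColumnarBatch(columns);
    batch.setNumRows(3);

    // rowIterator() exposes the columnar data row by row; the returned
    // InternalRow is reused, so copy() it if it must outlive the loop.
    Iterator<InternalRow> rows = batch.rowIterator();
    while (rows.hasNext()) {
      System.out.println(rows.next().getInt(0));
    }
  }
}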
Example #29
Source File: SparkValueWriters.java From iceberg with Apache License 2.0

@Override
public void write(InternalRow row, Encoder encoder) throws IOException {
  for (int i = 0; i < types.length; i += 1) {
    if (row.isNullAt(i)) {
      writers[i].write(null, encoder);
    } else {
      write(row, i, writers[i], encoder);
    }
  }
}
Example #30
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}