org.apache.spark.sql.catalyst.InternalRow Java Examples
The following examples show how to use org.apache.spark.sql.catalyst.InternalRow in Java. Each example is taken from an open-source project; the source file, project, and license are noted above each snippet.
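InternalRow is Spark SQL's internal, ordinal-based row representation: values are stored in Spark's internal types (UTF8String for strings, Decimal for decimals, and so on) and are read with typed getters after a null check. As a minimal, hypothetical sketch that is not taken from any of the projects below, a GenericInternalRow can be built and read like this:

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.unsafe.types.UTF8String;

public class InternalRowSketch {
  public static void main(String[] args) {
    // Values must already be in Spark's internal representation,
    // e.g. UTF8String instead of java.lang.String.
    InternalRow row = new GenericInternalRow(new Object[] {1L, UTF8String.fromString("a"), null});

    long id = row.getLong(0);                      // 1
    String name = row.getUTF8String(1).toString(); // "a"
    boolean missing = row.isNullAt(2);             // true

    System.out.println(id + " " + name + " " + missing);
  }
}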
Example #1
Source File: HiveStreamingDataWriter.java From spark-llap with Apache License 2.0

@Override
public void write(final InternalRow record) throws IOException {
  String delimitedRow = Joiner.on(",").useForNull("")
      .join(scala.collection.JavaConversions.seqAsJavaList(record.toSeq(schema)));
  try {
    streamingConnection.write(delimitedRow.getBytes(Charset.forName("UTF-8")));
    rowsWritten++;
    if (rowsWritten > 0 && commitAfterNRows > 0 && (rowsWritten % commitAfterNRows == 0)) {
      LOG.info("Committing transaction after rows: {}", rowsWritten);
      streamingConnection.commitTransaction();
      streamingConnection.beginTransaction();
    }
  } catch (StreamingException e) {
    throw new IOException(e);
  }
}
Example #2
Source File: SparkParquetReadersFlatDataBenchmark.java From iceberg with Apache License 2.0

@Benchmark
@Threads(1)
public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    Iterable<InternalRow> unsafeRows = Iterables.transform(
        rows,
        APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke);

    for (InternalRow row : unsafeRows) {
      blackhole.consume(row);
    }
  }
}
Example #3
Source File: TestHelpers.java From iceberg with Apache License 2.0

public static void assertEqualsBatch(Types.StructType struct, Iterator<Record> expected, ColumnarBatch batch,
                                     boolean checkArrowValidityVector) {
  for (int rowId = 0; rowId < batch.numRows(); rowId++) {
    List<Types.NestedField> fields = struct.fields();
    InternalRow row = batch.getRow(rowId);
    Record rec = expected.next();
    for (int i = 0; i < fields.size(); i += 1) {
      Type fieldType = fields.get(i).type();
      Object expectedValue = rec.get(i);
      Object actualValue = row.isNullAt(i) ? null : row.get(i, convert(fieldType));
      assertEqualsUnsafe(fieldType, expectedValue, actualValue);

      if (checkArrowValidityVector) {
        ColumnVector columnVector = batch.column(i);
        ValueVector arrowVector = ((IcebergArrowColumnVector) columnVector).vectorAccessor().getVector();
        Assert.assertEquals("Nullability doesn't match", expectedValue == null, arrowVector.isNull(rowId));
      }
    }
  }
}
Example #4
Source File: PartitionedWriter.java From iceberg with Apache License 2.0

@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  PartitionKey currentKey = getCurrentKey();
  if (!key.equals(currentKey)) {
    closeCurrent();
    completedPartitions.add(currentKey);

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed files for partition: " + key.toPath());
    }

    setCurrentKey(key.copy());
    openCurrent();
  }

  writeInternal(row);
}
Example #5
Source File: TestDataFileSerialization.java From iceberg with Apache License 2.0

@Test
public void testParquetWriterSplitOffsets() throws IOException {
  Iterable<InternalRow> records = RandomData.generateSpark(DATE_SCHEMA, 1, 33L);
  File parquetFile = new File(
      temp.getRoot(),
      FileFormat.PARQUET.addExtension(UUID.randomUUID().toString()));
  FileAppender<InternalRow> writer = Parquet.write(Files.localOutput(parquetFile))
      .schema(DATE_SCHEMA)
      .createWriterFunc(msgType -> SparkParquetWriters.buildWriter(SparkSchemaUtil.convert(DATE_SCHEMA), msgType))
      .build();
  try {
    writer.addAll(records);
  } finally {
    writer.close();
  }

  Kryo kryo = new KryoSerializer(new SparkConf()).newKryo();
  File dataFile = temp.newFile();
  try (Output out = new Output(new FileOutputStream(dataFile))) {
    kryo.writeClassAndObject(out, writer.splitOffsets());
  }
  try (Input in = new Input(new FileInputStream(dataFile))) {
    kryo.readClassAndObject(in);
  }
}
Example #6
Source File: CodegenExamples.java From iceberg with Apache License 2.0

public UnsafeRow apply(InternalRow i) {
  holder.reset();

  rowWriter.zeroOutNullBytes();

  boolean isNull = i.isNullAt(0);
  long value = isNull ? -1L : (i.getLong(0));
  if (isNull) {
    rowWriter.setNullAt(0);
  } else {
    rowWriter.write(0, value);
  }

  boolean isNull1 = i.isNullAt(1);
  UTF8String value1 = isNull1 ? null : (i.getUTF8String(1));
  if (isNull1) {
    rowWriter.setNullAt(1);
  } else {
    rowWriter.write(1, value1);
  }
  result.setTotalSize(holder.totalSize());
  return result;
}
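The generated projection above writes each field into an UnsafeRow through an UnsafeRowWriter backed by a BufferHolder. A rough hand-written equivalent, sketched under the assumption of Spark 2.4+ where UnsafeRowWriter manages its own buffer and exposes getRow(), could build the same two-column row like this:

import org.apache.spark.sql.catalyst.expressions.UnsafeRow;
import org.apache.spark.sql.catalyst.expressions.codegen.UnsafeRowWriter;
import org.apache.spark.unsafe.types.UTF8String;

public class UnsafeRowSketch {
  public static void main(String[] args) {
    // Two fields: a long and a string, mirroring the generated projection above.
    UnsafeRowWriter rowWriter = new UnsafeRowWriter(2);

    rowWriter.reset();            // start a new row
    rowWriter.zeroOutNullBytes(); // clear the null-tracking bits

    rowWriter.write(0, 42L);                            // field 0: long
    rowWriter.write(1, UTF8String.fromString("hello")); // field 1: string

    UnsafeRow row = rowWriter.getRow();
    System.out.println(row.getLong(0) + " " + row.getUTF8String(1));
  }
}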
Example #7
Source File: SparkParquetReadersNestedDataBenchmark.java From iceberg with Apache License 2.0

@Benchmark
@Threads(1)
public void readUsingIcebergReaderUnsafe(Blackhole blackhole) throws IOException {
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .createReaderFunc(type -> SparkParquetReaders.buildReader(SCHEMA, type))
      .build()) {

    Iterable<InternalRow> unsafeRows = Iterables.transform(
        rows,
        APPLY_PROJECTION.bind(SparkBenchmarkUtil.projection(SCHEMA, SCHEMA))::invoke);

    for (InternalRow row : unsafeRows) {
      blackhole.consume(row);
    }
  }
}
Example #8
Source File: PartitionKey.java From iceberg with Apache License 2.0

@SuppressWarnings("unchecked")
PartitionKey(PartitionSpec spec, Schema inputSchema) {
  this.spec = spec;

  List<PartitionField> fields = spec.fields();
  this.size = fields.size();
  this.partitionTuple = new Object[size];
  this.transforms = new Transform[size];
  this.accessors = (Accessor<InternalRow>[]) Array.newInstance(Accessor.class, size);

  Schema schema = spec.schema();
  Map<Integer, Accessor<InternalRow>> newAccessors = buildAccessors(inputSchema);
  for (int i = 0; i < size; i += 1) {
    PartitionField field = fields.get(i);
    Accessor<InternalRow> accessor = newAccessors.get(field.sourceId());
    if (accessor == null) {
      throw new RuntimeException(
          "Cannot build accessor for field: " + schema.findField(field.sourceId()));
    }
    this.accessors[i] = accessor;
    this.transforms[i] = field.transform();
  }
}
Example #9
Source File: SparkParquetReadersNestedDataBenchmark.java From iceberg with Apache License 2.0

@Benchmark
@Threads(1)
public void readUsingSparkReader(Blackhole blackhole) throws IOException {
  StructType sparkSchema = SparkSchemaUtil.convert(SCHEMA);
  try (CloseableIterable<InternalRow> rows = Parquet.read(Files.localInput(dataFile))
      .project(SCHEMA)
      .readSupport(new ParquetReadSupport())
      .set("org.apache.spark.sql.parquet.row.requested_schema", sparkSchema.json())
      .set("spark.sql.parquet.binaryAsString", "false")
      .set("spark.sql.parquet.int96AsTimestamp", "false")
      .callInit()
      .build()) {

    for (InternalRow row : rows) {
      blackhole.consume(row);
    }
  }
}
Example #10
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  IcebergSource source = new IcebergSource();

  for (int i = 0; i < 10; i += 1) {
    DataSourceReader reader = source.createReader(options);

    pushFilters(reader, EqualTo.apply("id", i));

    List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
    Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

    // validate row filtering
    assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), "id = " + i));
  }
}
Example #11
Source File: Writer.java From iceberg with Apache License 2.0

@Override
public void write(InternalRow row) throws IOException {
  key.partition(row);

  if (!key.equals(currentKey)) {
    closeCurrent();

    if (completedPartitions.contains(key)) {
      // if rows are not correctly grouped, detect and fail the write
      PartitionKey existingKey = Iterables.find(completedPartitions, key::equals, null);
      LOG.warn("Duplicate key: {} == {}", existingKey, key);
      throw new IllegalStateException("Already closed file for partition: " + key.toPath());
    }

    this.currentKey = key.copy();
    this.currentPath = outputPathFunc.apply(currentKey);
    OutputFile file = HadoopOutputFile.fromPath(currentPath, conf);
    this.currentAppender = factory.newAppender(file, format);
  }

  currentAppender.add(row);
}
Example #12
Source File: BigQueryDataSourceReader.java From spark-bigquery-connector with Apache License 2.0

@Override
public List<InputPartition<InternalRow>> planInputPartitions() {
  if (schema.map(StructType::isEmpty).orElse(false)) {
    // create empty projection
    return createEmptyProjectionPartitions();
  }

  ImmutableList<String> selectedFields = schema
      .map(requiredSchema -> ImmutableList.copyOf(requiredSchema.fieldNames()))
      .orElse(ImmutableList.of());
  Optional<String> filter = emptyIfNeeded(SparkFilterUtils.getCompiledFilter(
      readSessionCreatorConfig.getReadDataFormat(), globalFilter, pushedFilters));
  ReadSessionResponse readSessionResponse = readSessionCreator.create(
      tableId, selectedFields, filter, readSessionCreatorConfig.getMaxParallelism());
  ReadSession readSession = readSessionResponse.getReadSession();
  return readSession.getStreamsList().stream()
      .map(stream -> new BigQueryInputPartition(
          bigQueryReadClientFactory,
          stream.getName(),
          readSessionCreatorConfig.getMaxReadRowsRetries(),
          createConverter(selectedFields, readSessionResponse)))
      .collect(Collectors.toList());
}
Example #13
Source File: SparkParquetReaders.java From iceberg with Apache License 2.0

@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
Example #14
Source File: PartitionKey.java From iceberg with Apache License 2.0

@SuppressWarnings("unchecked")
void partition(InternalRow row) {
  for (int i = 0; i < partitionTuple.length; i += 1) {
    Transform<Object, Object> transform = transforms[i];
    partitionTuple[i] = transform.apply(accessors[i].get(row));
  }
}
Example #15
Source File: PartitionKey.java From iceberg with Apache License 2.0

private static Accessor<InternalRow> newAccessor(int position, boolean isOptional, Types.StructType type,
                                                 Accessor<InternalRow> accessor) {
  int size = type.fields().size();
  if (isOptional) {
    // the wrapped position handles null layers
    return new WrappedPositionAccessor(position, size, accessor);
  } else if (accessor.getClass() == PositionAccessor.class) {
    return new Position2Accessor(position, size, (PositionAccessor) accessor);
  } else if (accessor instanceof Position2Accessor) {
    return new Position3Accessor(position, size, (Position2Accessor) accessor);
  } else {
    return new WrappedPositionAccessor(position, size, accessor);
  }
}
Example #16
Source File: PartitionKey.java From iceberg with Apache License 2.0

private static Accessor<InternalRow> newAccessor(int position, Type type) {
  switch (type.typeId()) {
    case STRING:
      return new StringAccessor(position, SparkSchemaUtil.convert(type));
    case DECIMAL:
      return new DecimalAccessor(position, SparkSchemaUtil.convert(type));
    case BINARY:
      return new BytesAccessor(position, SparkSchemaUtil.convert(type));
    default:
      return new PositionAccessor(position, SparkSchemaUtil.convert(type));
  }
}
Example #17
Source File: RowDataReader.java From iceberg with Apache License 2.0

private CloseableIterable<InternalRow> newDataIterable(DataTask task, Schema readSchema) {
  StructInternalRow row = new StructInternalRow(tableSchema.asStruct());
  CloseableIterable<InternalRow> asSparkRows = CloseableIterable.transform(
      task.asDataTask().rows(), row::setStruct);
  return CloseableIterable.transform(
      asSparkRows, APPLY_PROJECTION.bind(projection(readSchema, tableSchema))::invoke);
}
Example #18
Source File: RowDataReader.java From iceberg with Apache License 2.0

private CloseableIterable<InternalRow> newOrcIterable(
    InputFile location,
    FileScanTask task,
    Schema readSchema,
    Map<Integer, ?> idToConstant) {
  return ORC.read(location)
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(readOrcSchema -> new SparkOrcReader(readSchema, readOrcSchema, idToConstant))
      .filter(task.residual())
      .caseSensitive(caseSensitive)
      .build();
}
Example #19
Source File: PartitionKey.java From iceberg with Apache License 2.0

@Override
public Object get(InternalRow row) {
  if (row.isNullAt(position())) {
    return null;
  }
  return ByteBuffer.wrap((byte[]) row.get(position(), type()));
}
Example #20
Source File: PartitionKey.java From iceberg with Apache License 2.0

@Override
public Object get(InternalRow row) {
  if (row.isNullAt(p)) {
    return null;
  }
  return row.get(p, type);
}
Example #21
Source File: PartitionKey.java From iceberg with Apache License 2.0

@Override
public Object get(InternalRow row) {
  if (row.isNullAt(position())) {
    return null;
  }
  return row.get(position(), type()).toString();
}
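Examples #19 through #21 all hinge on InternalRow.get(ordinal, dataType) returning Spark's internal representation, which is why the accessors above wrap binary values in a ByteBuffer and call toString() on strings. A small hypothetical illustration, not taken from the Iceberg sources:

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.unsafe.types.UTF8String;

import java.nio.ByteBuffer;

public class GetByDataTypeSketch {
  public static void main(String[] args) {
    InternalRow row = new GenericInternalRow(new Object[] {
        UTF8String.fromString("iceberg"), new byte[] {1, 2, 3}});

    // get(ordinal, dataType) hands back Spark's internal representation...
    Object rawString = row.get(0, DataTypes.StringType); // UTF8String, not java.lang.String
    Object rawBinary = row.get(1, DataTypes.BinaryType); // byte[]

    // ...so callers convert, just like the accessors above.
    String javaString = rawString.toString();
    ByteBuffer buffer = ByteBuffer.wrap((byte[]) rawBinary);

    System.out.println(javaString + " " + buffer.remaining());
  }
}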
Example #22
Source File: SparkValueReaders.java From iceberg with Apache License 2.0

@Override
protected InternalRow reuseOrCreate(Object reuse) {
  if (reuse instanceof GenericInternalRow && ((GenericInternalRow) reuse).numFields() == numFields) {
    return (InternalRow) reuse;
  }
  return new GenericInternalRow(numFields);
}
Example #23
Source File: Reader.java From iceberg with Apache License 2.0

private CloseableIterable<InternalRow> newAvroIterable(InputFile location, FileScanTask task, Schema readSchema) {
  return Avro.read(location)
      .reuseContainers()
      .project(readSchema)
      .split(task.start(), task.length())
      .createReaderFunc(SparkAvroReader::new)
      .build();
}
Example #24
Source File: SparkParquetReaders.java From iceberg with Apache License 2.0

@Override
protected GenericInternalRow newStructData(InternalRow reuse) {
  if (reuse instanceof GenericInternalRow) {
    return (GenericInternalRow) reuse;
  } else {
    return new GenericInternalRow(numFields);
  }
}
Example #25
Source File: Writer.java From iceberg with Apache License 2.0

PartitionedWriter(PartitionSpec spec, FileFormat format, Configuration conf,
                  AppenderFactory<InternalRow> factory, Function<PartitionKey, Path> outputPathFunc) {
  this.spec = spec;
  this.format = format;
  this.conf = conf;
  this.factory = factory;
  this.outputPathFunc = outputPathFunc;
  this.key = new PartitionKey(spec);
}
Example #26
Source File: PartitionKey.java From iceberg with Apache License 2.0

private static Accessor<InternalRow> newAccessor(int p, Type type) {
  switch (type.typeId()) {
    case STRING:
      return new StringAccessor(p, convert(type));
    case DECIMAL:
      return new DecimalAccessor(p, convert(type));
    default:
      return new PositionAccessor(p, convert(type));
  }
}
Example #27
Source File: TestDataFrameWrites.java From iceberg with Apache License 2.0

private Dataset<Row> createDataset(List<Record> records, Schema schema) throws IOException {
  // this uses the SparkAvroReader to create a DataFrame from the list of records
  // it assumes that SparkAvroReader is correct
  File testFile = temp.newFile();
  Assert.assertTrue("Delete should succeed", testFile.delete());

  try (FileAppender<Record> writer = Avro.write(Files.localOutput(testFile))
      .schema(schema)
      .named("test")
      .build()) {
    for (Record rec : records) {
      writer.add(rec);
    }
  }

  List<InternalRow> rows;
  try (AvroIterable<InternalRow> reader = Avro.read(Files.localInput(testFile))
      .createReaderFunc(SparkAvroReader::new)
      .project(schema)
      .build()) {
    rows = Lists.newArrayList(reader);
  }

  // make sure the dataframe matches the records before moving on
  for (int i = 0; i < records.size(); i += 1) {
    assertEqualsUnsafe(schema.asStruct(), records.get(i), rows.get(i));
  }

  JavaRDD<InternalRow> rdd = sc.parallelize(rows);
  return spark.internalCreateDataFrame(JavaRDD.toRDD(rdd), convert(schema), false);
}
Example #28
Source File: ColumnarBatch.java From spliceengine with GNU Affero General Public License v3.0

/**
 * Returns an iterator over the rows in this batch.
 */
public Iterator<InternalRow> rowIterator() {
  final int maxRows = numRows;
  final MutableColumnarRow row = new MutableColumnarRow(columns);
  return new Iterator<InternalRow>() {
    int rowId = 0;

    @Override
    public boolean hasNext() {
      return rowId < maxRows;
    }

    @Override
    public InternalRow next() {
      if (rowId >= maxRows) {
        throw new NoSuchElementException();
      }
      row.rowId = rowId++;
      return row;
    }

    @Override
    public void remove() {
      throw new UnsupportedOperationException();
    }
  };
}
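A caller typically drains rowIterator() to process columnar data row by row. The sketch below is hypothetical and uses Spark's own ColumnarBatch and OnHeapColumnVector rather than the spliceengine copy shown above; the APIs are assumed to match Spark 2.4+:

import org.apache.spark.sql.catalyst.InternalRow;
import org.apache.spark.sql.execution.vectorized.OnHeapColumnVector;
import org.apache.spark.sql.types.DataTypes;
import org.apache.spark.sql.types.StructType;
import org.apache.spark.sql.vectorized.ColumnarBatch;

import java.util.Iterator;

public class ColumnarBatchSketch {
  public static void main(String[] args) {
    StructType schema = new StructType().add("id", DataTypes.IntegerType);

    // Allocate one on-heap vector per column and fill a few values.
    OnHeapColumnVector[] columns = OnHeapColumnVector.allocateColumns(3, schema);
    for (int i = 0; i < 3; i++) {
      columns[0].putInt(i, i * 10);
    }

    ColumnarBatch batch = new ColumnarBatch(columns);
    batch.setNumRows(3);

    // rowIterator() exposes the columnar data row by row; the returned
    // InternalRow is reused, so copy() it if it must outlive the loop.
    Iterator<InternalRow> rows = batch.rowIterator();
    while (rows.hasNext()) {
      System.out.println(rows.next().getInt(0));
    }
  }
}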
Example #29
Source File: SparkValueWriters.java From iceberg with Apache License 2.0

@Override
public void write(InternalRow row, Encoder encoder) throws IOException {
  for (int i = 0; i < types.length; i += 1) {
    if (row.isNullAt(i)) {
      writers[i].write(null, encoder);
    } else {
      write(row, i, writers[i], encoder);
    }
  }
}
Example #30
Source File: TestFilteredScan.java From iceberg with Apache License 2.0

@Test
public void testUnpartitionedCaseInsensitiveIDFilters() {
  DataSourceOptions options = new DataSourceOptions(ImmutableMap.of(
      "path", unpartitioned.toString())
  );

  // set spark.sql.caseSensitive to false
  String caseSensitivityBeforeTest = TestFilteredScan.spark.conf().get("spark.sql.caseSensitive");
  TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", "false");

  try {
    IcebergSource source = new IcebergSource();

    for (int i = 0; i < 10; i += 1) {
      DataSourceReader reader = source.createReader(options);

      pushFilters(reader, EqualTo.apply("ID", i)); // note lower(ID) == lower(id), so there must be a match

      List<InputPartition<InternalRow>> tasks = reader.planInputPartitions();
      Assert.assertEquals("Should only create one task for a small file", 1, tasks.size());

      // validate row filtering
      assertEqualsSafe(SCHEMA.asStruct(), expected(i), read(unpartitioned.toString(), "id = " + i));
    }
  } finally {
    // return global conf to previous state
    TestFilteredScan.spark.conf().set("spark.sql.caseSensitive", caseSensitivityBeforeTest);
  }
}