org.apache.spark.sql.catalyst.expressions.UnsafeProjection Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.UnsafeProjection.
Follow the links above each example to go to the original project or source file.
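Before the project-specific examples, here is a minimal, self-contained sketch of the pattern they all share: create an UnsafeProjection from a schema (or an array of data types) and apply it to an InternalRow to obtain a binary UnsafeRow. The snippet is illustrative only and assumes the Spark 2.x Catalyst APIs used by the examples below; the column values are made up.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types.{DataType, IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

object UnsafeProjectionBasics {
  def main(args: Array[String]): Unit = {
    // One of several create() overloads: build the projection directly from field data types.
    val proj = UnsafeProjection.create(Array[DataType](IntegerType, StringType))

    // Applying the projection converts a generic InternalRow into a compact UnsafeRow.
    val unsafeRow = proj(InternalRow(1, UTF8String.fromString("a")))

    assert(unsafeRow.getInt(0) == 1)
    assert(unsafeRow.getString(1) == "a")
  }
}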
Example 1
Source File: LocalTableScanExec.scala From drizzle-spark with Apache License 2.0 | 6 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
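One detail worth noting in LocalTableScanExec above is proj(r).copy(): an UnsafeProjection reuses a single output row across calls, so every projected row that is kept around must be copied first. The sketch below is not part of the original file; it is a minimal illustration of that reuse, assuming the code-generated projection returns the same row instance on each apply.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types.{DataType, IntegerType}

object ProjectionReuse {
  def main(args: Array[String]): Unit = {
    val proj = UnsafeProjection.create(Array[DataType](IntegerType))

    val first = proj(InternalRow(1))   // points at the projection's internal row
    val second = proj(InternalRow(2))  // overwrites that same row

    // Both references observe the latest value, which is why buffered results need copy().
    assert(first eq second)
    assert(first.getInt(0) == 2)

    val detached = proj(InternalRow(3)).copy() // copy() yields an independent UnsafeRow
    assert(detached.getInt(0) == 3)
  }
}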
Example 2
Source File: StarryLocalTableScanExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.{RDD, StarryRDD} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class StarryLocalTableScanExec( tableName: String, output: Seq[Attribute], @transient rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @transient private lazy val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val rdd = new StarryRDD(sparkContext, tableName, unsafeRows) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.length) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.length) taken } }
Example 3
Source File: NullableColumnBuilderSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 4
Source File: NullableColumnAccessorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 5
Source File: GenerateUnsafeRowJoinerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

class GenerateUnsafeRowJoinerSuite extends SparkFunSuite {

  private val fixed = Seq(IntegerType)
  private val variable = Seq(IntegerType, StringType)

  // Simple fixed-width types
  test("simple fixed width types") {
    testConcat(0, 0, fixed)
    testConcat(0, 1, fixed)
    testConcat(1, 0, fixed)
    testConcat(64, 0, fixed)
    testConcat(0, 64, fixed)
    testConcat(64, 64, fixed)
  }

  // Randomized fixed-width types
  test("randomized fix width types") {
    for (i <- 0 until 20) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed)
    }
  }

  // Simple variable-width types
  test("simple variable width types") {
    testConcat(0, 0, variable)
    testConcat(0, 1, variable)
    testConcat(1, 0, variable)
    testConcat(64, 0, variable)
    testConcat(0, 64, variable)
    testConcat(64, 64, variable)
  }

  // Randomized variable-width types
  test("randomized variable width types") {
    for (i <- 0 until 10) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable)
    }
  }

  private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = {
    for (i <- 0 until 10) {
      testConcatOnce(numFields1, numFields2, candidateTypes)
    }
  }

  private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) {
    info(s"schema size $numFields1, $numFields2")
    val schema1 = RandomDataGenerator.randomSchema(numFields1, candidateTypes)
    val schema2 = RandomDataGenerator.randomSchema(numFields2, candidateTypes)

    // Create the converters needed to convert from external row to internal row and to UnsafeRows.
    val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1)
    val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2)
    val converter1 = UnsafeProjection.create(schema1)
    val converter2 = UnsafeProjection.create(schema2)

    // Create the input rows, convert them into UnsafeRows.
    val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply()
    val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply()
    val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow])
    val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow])

    // Run the joiner.
    val mergedSchema = StructType(schema1 ++ schema2)
    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
    val output = concater.join(row1, row2)

    // Test everything equals ...
    for (i <- mergedSchema.indices) {
      if (i < schema1.size) {
        assert(output.isNullAt(i) === row1.isNullAt(i))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType))
        }
      } else {
        assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) ===
            row2.get(i - schema1.size, mergedSchema(i).dataType))
        }
      }
    }
  }
}
Example 6
Source File: ComplexDataSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util

import scala.collection._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, GenericInternalRow, SpecificInternalRow, UnsafeMapData, UnsafeProjection}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class ComplexDataSuite extends SparkFunSuite {
  def utf8(str: String): UTF8String = UTF8String.fromString(str)

  test("inequality tests for MapData") {
    // test data
    val testMap1 = Map(utf8("key1") -> 1)
    val testMap2 = Map(utf8("key1") -> 1, utf8("key2") -> 2)
    val testMap3 = Map(utf8("key1") -> 1)
    val testMap4 = Map(utf8("key1") -> 1, utf8("key2") -> 2)

    // ArrayBasedMapData
    val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
    val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
    val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
    val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
    assert(testArrayMap1 !== testArrayMap3)
    assert(testArrayMap2 !== testArrayMap4)

    // UnsafeMapData
    val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
    val row = new GenericInternalRow(1)
    def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
      row.update(0, map)
      val unsafeRow = unsafeConverter.apply(row)
      unsafeRow.getMap(0).copy
    }
    assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
    assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
  }

  test("GenericInternalRow.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val genericRow = new GenericInternalRow(Array[Any](unsafeRow.getUTF8String(0)))
    val copiedGenericRow = genericRow.copy()
    assert(copiedGenericRow.getString(0) == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied internal row should not be changed externally.
    assert(copiedGenericRow.getString(0) == "a")
  }

  test("SpecificMutableRow.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val mutableRow = new SpecificInternalRow(Seq(StringType))
    mutableRow(0) = unsafeRow.getUTF8String(0)
    val copiedMutableRow = mutableRow.copy()
    assert(copiedMutableRow.getString(0) == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied internal row should not be changed externally.
    assert(copiedMutableRow.getString(0) == "a")
  }

  test("GenericArrayData.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val genericArray = new GenericArrayData(Array[Any](unsafeRow.getUTF8String(0)))
    val copiedGenericArray = genericArray.copy()
    assert(copiedGenericArray.getUTF8String(0).toString == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied array data should not be changed externally.
    assert(copiedGenericArray.getUTF8String(0).toString == "a")
  }

  test("copy on nested complex type") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val arrayOfRow = new GenericArrayData(Array[Any](InternalRow(unsafeRow.getUTF8String(0))))
    val copied = arrayOfRow.copy()
    assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied data should not be changed externally.
    assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a")
  }
}
Example 7
Source File: LocalTableScanExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], @transient rows: Seq[InternalRow]) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) @transient private lazy val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 8
Source File: ObjectAggregationMap.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import java.{util => ju} import org.apache.spark.{SparkEnv, TaskContext} import org.apache.spark.internal.config import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow} import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate} import org.apache.spark.sql.execution.UnsafeKVExternalSorter import org.apache.spark.sql.types.StructType import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter def dumpToExternalSorter( groupingAttributes: Seq[Attribute], aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = { val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes) val sorter = new UnsafeKVExternalSorter( StructType.fromAttributes(groupingAttributes), StructType.fromAttributes(aggBufferAttributes), SparkEnv.get.blockManager, SparkEnv.get.serializerManager, TaskContext.get().taskMemoryManager().pageSizeBytes, SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD), null ) val mapIterator = iterator val unsafeAggBufferProjection = UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray) while (mapIterator.hasNext) { val entry = mapIterator.next() aggregateFunctions.foreach { case agg: TypedImperativeAggregate[_] => agg.serializeAggregateBufferInPlace(entry.aggregationBuffer) case _ => } sorter.insertKV( entry.groupingKey, unsafeAggBufferProjection(entry.aggregationBuffer) ) } hashMap.clear() sorter } def clear(): Unit = { hashMap.clear() } } // Stores the grouping key and aggregation buffer class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 9
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
}
Example 10
Source File: NullableColumnBuilderSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 11
Source File: NullableColumnAccessorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 12
Source File: LocalTableScanExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.execution.metric.SQLMetrics case class LocalTableScanExec( output: Seq[Attribute], rows: Seq[InternalRow], override val user: String) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) private val unsafeRows: Array[InternalRow] = { if (rows.isEmpty) { Array.empty } else { val proj = UnsafeProjection.create(output, output) rows.map(r => proj(r).copy()).toArray } } private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1), sqlContext.sparkContext.defaultParallelism) private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.map { r => numOutputRows += 1 r } } override protected def stringArgs: Iterator[Any] = { if (rows.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def executeCollect(): Array[InternalRow] = { longMetric("numOutputRows").add(unsafeRows.size) unsafeRows } override def executeTake(limit: Int): Array[InternalRow] = { val taken = unsafeRows.take(limit) longMetric("numOutputRows").add(taken.size) taken } }
Example 13
Source File: StarryTakeOrderedAndProjectExec.scala From starry with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.exchange import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.LazilyGeneratedOrdering import org.apache.spark.sql.catalyst.expressions.{Attribute, NamedExpression, SortOrder, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, SinglePartition} import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.util.Utils case class StarryTakeOrderedAndProjectExec( limit: Int, sortOrder: Seq[SortOrder], projectList: Seq[NamedExpression], child: SparkPlan) extends UnaryExecNode { override def output: Seq[Attribute] = { projectList.map(_.toAttribute) } override def executeCollect(): Array[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val data = child.execute().map(_.copy()).takeOrdered(limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) data.map(r => proj(r).copy()) } else { data } } protected override def doExecute(): RDD[InternalRow] = { val ord = new LazilyGeneratedOrdering(sortOrder, child.output) val localTopK: RDD[InternalRow] = { child.execute().map(_.copy()).mapPartitions { iter => org.apache.spark.util.collection.Utils.takeOrdered(iter, limit)(ord) } } localTopK.mapPartitions { iter => val topK = org.apache.spark.util.collection.Utils.takeOrdered(iter.map(_.copy()), limit)(ord) if (projectList != child.output) { val proj = UnsafeProjection.create(projectList, child.output) topK.map(r => proj(r)) } else { topK } } } override def outputOrdering: Seq[SortOrder] = sortOrder override def outputPartitioning: Partitioning = SinglePartition override def simpleString: String = { val orderByString = Utils.truncatedString(sortOrder, "[", ",", "]") val outputString = Utils.truncatedString(output, "[", ",", "]") s"TakeOrderedAndProject(limit=$limit, orderBy=$orderByString, output=$outputString)" } }
Example 14
Source File: KinesisWriteTask.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis import java.nio.ByteBuffer import com.amazonaws.services.kinesis.producer.{KinesisProducer, UserRecordResult} import com.google.common.util.concurrent.{FutureCallback, Futures} import org.apache.spark.internal.Logging import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, Cast, UnsafeProjection} import org.apache.spark.sql.types.{BinaryType, StringType} private[kinesis] class KinesisWriteTask(producerConfiguration: Map[String, String], inputSchema: Seq[Attribute]) extends Logging { private var producer: KinesisProducer = _ private val projection = createProjection private val streamName = producerConfiguration.getOrElse( KinesisSourceProvider.SINK_STREAM_NAME_KEY, "") def execute(iterator: Iterator[InternalRow]): Unit = { producer = CachedKinesisProducer.getOrCreate(producerConfiguration) while (iterator.hasNext) { val currentRow = iterator.next() val projectedRow = projection(currentRow) val partitionKey = projectedRow.getString(0) val data = projectedRow.getBinary(1) sendData(partitionKey, data) } } def sendData(partitionKey: String, data: Array[Byte]): String = { var sentSeqNumbers = new String val future = producer.addUserRecord(streamName, partitionKey, ByteBuffer.wrap(data)) val kinesisCallBack = new FutureCallback[UserRecordResult]() { override def onFailure(t: Throwable): Unit = { logError(s"Writing to $streamName failed due to ${t.getCause}") } override def onSuccess(result: UserRecordResult): Unit = { val shardId = result.getShardId sentSeqNumbers = result.getSequenceNumber } } Futures.addCallback(future, kinesisCallBack) producer.flushSync() sentSeqNumbers } def close(): Unit = { if (producer != null) { producer.flush() producer = null } } private def createProjection: UnsafeProjection = { val partitionKeyExpression = inputSchema .find(_.name == KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME}' not found")) partitionKeyExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.PARTITION_KEY_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } val dataExpression = inputSchema.find(_.name == KinesisWriter.DATA_ATTRIBUTE_NAME).getOrElse( throw new IllegalStateException("Required attribute " + s"'${KinesisWriter.DATA_ATTRIBUTE_NAME}' not found") ) dataExpression.dataType match { case StringType | BinaryType => // ok case t => throw new IllegalStateException(s"${KinesisWriter.DATA_ATTRIBUTE_NAME} " + "attribute type must be a String or BinaryType") } UnsafeProjection.create( Seq(Cast(partitionKeyExpression, StringType), Cast(dataExpression, StringType)), inputSchema) } }
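The Kinesis writer above uses the expression-based overload: UnsafeProjection.create(Seq[Expression], Seq[Attribute]) resolves the given expressions (here Casts) against the input schema. The sketch below shows the same pattern with a hypothetical two-column schema; the column names and types are invented for illustration.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, Cast, UnsafeProjection}
import org.apache.spark.sql.types.{IntegerType, StringType}
import org.apache.spark.unsafe.types.UTF8String

object ExpressionProjection {
  def main(args: Array[String]): Unit = {
    // Hypothetical input schema: (key: Int, payload: String)
    val key = AttributeReference("key", IntegerType, nullable = false)()
    val payload = AttributeReference("payload", StringType, nullable = false)()
    val inputSchema = Seq(key, payload)

    // Project (CAST(key AS STRING), payload); create() binds the expressions to inputSchema.
    val proj = UnsafeProjection.create(Seq(Cast(key, StringType), payload), inputSchema)

    val out = proj(InternalRow(42, UTF8String.fromString("hello")))
    assert(out.getString(0) == "42")
    assert(out.getString(1) == "hello")
  }
}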
Example 15
Source File: HashSetRowIterator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.storage.set.hashset

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{BufferHolder, UnsafeRowWriter}
import org.apache.spark.sql.catalyst.expressions.{SpecificMutableRow, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

class ObjectHashSetRowIterator(set: ObjectHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    rawIter.next()
  }
}

class IntKeysHashSetRowIterator(set: IntKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, 1)
    rowWriter.write(0, rawIter.next())

    uRow.pointTo(bufferHolder.buffer, 1, bufferHolder.totalSize())
    uRow
  }
}

class LongKeysHashSetRowIterator(set: LongKeysHashSet) extends Iterator[InternalRow] {
  val rawIter = set.iterator()
  val numFields = set.schemaInfo.arity
  val uRow = new UnsafeRow()
  val bufferHolder = new BufferHolder()
  val rowWriter = new UnsafeRowWriter()

  override final def hasNext(): Boolean = {
    rawIter.hasNext
  }

  override final def next(): InternalRow = {
    bufferHolder.reset()
    rowWriter.initialize(bufferHolder, numFields)

    val value = rawIter.nextLong()
    if (numFields == 2) {
      rowWriter.write(0, (value >> 32).toInt)
      rowWriter.write(1, value.toInt)
    } else {
      rowWriter.write(0, value)
    }
    uRow.pointTo(bufferHolder.buffer, numFields, bufferHolder.totalSize())
    uRow
  }
}

object HashSetRowIterator {
  def create(set: HashSet): Iterator[InternalRow] = {
    set match {
      //case set: UnsafeFixedWidthSet => set.iterator().asScala
      case set: IntKeysHashSet => new IntKeysHashSetRowIterator(set)
      case set: LongKeysHashSet => new LongKeysHashSetRowIterator(set)
      case set: ObjectHashSet => new ObjectHashSetRowIterator(set)
    }
  }
}
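As the unused UnsafeProjection import above hints, the hand-rolled BufferHolder/UnsafeRowWriter encoding could also be expressed with a projection. A hypothetical sketch (not part of the BigDatalog source) for the single-int case:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types.{DataType, IntegerType}

// Hypothetical alternative to the hand-rolled writer: let UnsafeProjection do the encoding.
class IntKeysProjectionIterator(keys: Iterator[Int]) extends Iterator[InternalRow] {
  private val proj = UnsafeProjection.create(Array[DataType](IntegerType))

  override def hasNext: Boolean = keys.hasNext

  // Returns the projection's reused row; call copy() if the row is buffered downstream.
  override def next(): InternalRow = proj(InternalRow(keys.next()))
}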
Example 16
Source File: GenerateUnsafeRowJoinerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val schema1 = RandomDataGenerator.randomSchema(numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
Example 17
Source File: NullableColumnBuilderSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, GenericMutableRow} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericMutableRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 18
Source File: NullableColumnAccessorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, GenericMutableRow} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericMutableRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 19
Source File: GenomicIntervalStrategy.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.utvf import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection} import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{DataFrame, GenomicInterval, SparkSession, Strategy} import org.apache.spark.unsafe.types.UTF8String case class GIntervalRow(contigName: String, start: Int, end: Int) class GenomicIntervalStrategy( spark: SparkSession) extends Strategy with Serializable { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case GenomicInterval(contigName, start, end,output) => GenomicIntervalPlan(plan,spark,GIntervalRow(contigName,start,end),output) :: Nil case _ => Nil } } case class GenomicIntervalPlan(plan: LogicalPlan, spark: SparkSession,interval:GIntervalRow, output: Seq[Attribute]) extends SparkPlan with Serializable { def doExecute(): org.apache.spark.rdd.RDD[InternalRow] = { import spark.implicits._ lazy val genomicInterval = spark.createDataset(Seq(interval)) genomicInterval .rdd .map(r=>{ val proj = UnsafeProjection.create(schema) proj.apply(InternalRow.fromSeq(Seq(UTF8String.fromString(r.contigName),r.start,r.end))) } ) } def children: Seq[SparkPlan] = Nil }
Example 20
Source File: VCFOutputFormatter.scala From glow with Apache License 2.0 | 5 votes |
package io.projectglow.vcf import java.io.InputStream import htsjdk.samtools.ValidationStringency import htsjdk.tribble.readers.{SynchronousLineReader, LineIteratorImpl => HtsjdkLineIteratorImpl} import htsjdk.variant.vcf.{VCFCodec, VCFHeader} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import io.projectglow.common.GlowLogging import io.projectglow.transformers.pipe.{OutputFormatter, OutputFormatterFactory} class VCFOutputFormatter(stringency: ValidationStringency) extends OutputFormatter with GlowLogging { override def makeIterator(stream: InputStream): Iterator[Any] = { val codec = new VCFCodec val lineIterator = new HtsjdkLineIteratorImpl(new SynchronousLineReader(stream)) if (!lineIterator.hasNext) { return Iterator.empty } val header = codec.readActualHeader(lineIterator).asInstanceOf[VCFHeader] val schema = VCFSchemaInferrer.inferSchema(true, true, header) val converter = new VariantContextToInternalRowConverter(header, schema, stringency) val projection = UnsafeProjection.create(schema) val internalRowIter: Iterator[InternalRow] = new Iterator[InternalRow] { private var nextRecord: InternalRow = _ private def readNextVc(): Unit = { while (nextRecord == null && lineIterator.hasNext) { val decoded = codec.decode(lineIterator.next()) if (decoded != null) { nextRecord = projection(converter.convertRow(decoded, isSplit = false)).copy() } } } override def hasNext: Boolean = { readNextVc() nextRecord != null } override def next(): InternalRow = { if (hasNext) { val ret = nextRecord nextRecord = null ret } else { throw new NoSuchElementException("Iterator is empty") } } } Iterator(schema) ++ internalRowIter } } class VCFOutputFormatterFactory extends OutputFormatterFactory { override def name: String = "vcf" override def makeOutputFormatter(options: Map[String, String]): OutputFormatter = { val stringency = VCFOptionParser.getValidationStringency(options) new VCFOutputFormatter(stringency) } }
Example 21
Source File: NullableColumnAccessorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 22
Source File: GenerateUnsafeRowJoinerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val random = new Random() val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
Example 23
Source File: NullableColumnBuilderSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 24
Source File: NullableColumnAccessorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 25
Source File: ArrayDataIndexedSeqSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.catalyst.encoders.{ExamplePointUDT, RowEncoder}
import org.apache.spark.sql.catalyst.expressions.{FromUnsafeProjection, UnsafeArrayData, UnsafeProjection}
import org.apache.spark.sql.types._

class ArrayDataIndexedSeqSuite extends SparkFunSuite {
  private def compArray(arrayData: ArrayData, elementDt: DataType, array: Array[Any]): Unit = {
    assert(arrayData.numElements == array.length)
    array.zipWithIndex.map { case (e, i) =>
      if (e != null) {
        elementDt match {
          // For NaN, etc.
          case FloatType | DoubleType => assert(arrayData.get(i, elementDt).equals(e))
          case _ => assert(arrayData.get(i, elementDt) === e)
        }
      } else {
        assert(arrayData.isNullAt(i))
      }
    }

    val seq = arrayData.toSeq[Any](elementDt)
    array.zipWithIndex.map { case (e, i) =>
      if (e != null) {
        elementDt match {
          // For Nan, etc.
          case FloatType | DoubleType => assert(seq(i).equals(e))
          case _ => assert(seq(i) === e)
        }
      } else {
        assert(seq(i) == null)
      }
    }

    intercept[IndexOutOfBoundsException] {
      seq(-1)
    }.getMessage().contains("must be between 0 and the length of the ArrayData.")

    intercept[IndexOutOfBoundsException] {
      seq(seq.length)
    }.getMessage().contains("must be between 0 and the length of the ArrayData.")
  }

  private def testArrayData(): Unit = {
    val elementTypes = Seq(BooleanType, ByteType, ShortType, IntegerType, LongType, FloatType,
      DoubleType, DecimalType.USER_DEFAULT, StringType, BinaryType, DateType, TimestampType,
      CalendarIntervalType, new ExamplePointUDT())
    val arrayTypes = elementTypes.flatMap { elementType =>
      Seq(ArrayType(elementType, containsNull = false), ArrayType(elementType, containsNull = true))
    }
    val random = new Random(100)
    arrayTypes.foreach { dt =>
      val schema = StructType(StructField("col_1", dt, nullable = false) :: Nil)
      val row = RandomDataGenerator.randomRow(random, schema)
      val rowConverter = RowEncoder(schema)
      val internalRow = rowConverter.toRow(row)

      val unsafeRowConverter = UnsafeProjection.create(schema)
      val safeRowConverter = FromUnsafeProjection(schema)

      val unsafeRow = unsafeRowConverter(internalRow)
      val safeRow = safeRowConverter(unsafeRow)

      val genericArrayData = safeRow.getArray(0).asInstanceOf[GenericArrayData]
      val unsafeArrayData = unsafeRow.getArray(0).asInstanceOf[UnsafeArrayData]

      val elementType = dt.elementType
      test("ArrayDataIndexedSeq - UnsafeArrayData - " + dt.toString) {
        compArray(unsafeArrayData, elementType, unsafeArrayData.toArray[Any](elementType))
      }

      test("ArrayDataIndexedSeq - GenericArrayData - " + dt.toString) {
        compArray(genericArrayData, elementType, genericArrayData.toArray[Any](elementType))
      }
    }
  }

  testArrayData()
}
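The round trip in this suite (UnsafeProjection to pack a row, FromUnsafeProjection to unpack it again) can be shown in a smaller form. This is a hypothetical sketch, assuming the Spark 2.x-era catalyst API used by the example above, where FromUnsafeProjection is available; the object name and sample data are illustrative:

import org.apache.spark.sql.catalyst.expressions.{FromUnsafeProjection, GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types._

object RoundTripSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(StructField("col_1", ArrayType(IntegerType), nullable = false) :: Nil)
    // A safe internal row holding an array column.
    val input = new GenericInternalRow(Array[Any](new GenericArrayData(Array[Any](1, 2, 3))))

    val toUnsafe = UnsafeProjection.create(schema)   // safe row -> UnsafeRow
    val fromUnsafe = FromUnsafeProjection(schema)    // UnsafeRow -> safe row

    val unsafeRow = toUnsafe(input)
    val safeRow = fromUnsafe(unsafeRow)
    println(safeRow.getArray(0).toIntArray().mkString(",")) // 1,2,3
  }
}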
Example 26
Source File: LocalTableScanExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class LocalTableScanExec(
    output: Seq[Attribute],
    @transient rows: Seq[InternalRow]) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  @transient private lazy val unsafeRows: Array[InternalRow] = {
    if (rows.isEmpty) {
      Array.empty
    } else {
      val proj = UnsafeProjection.create(output, output)
      rows.map(r => proj(r).copy()).toArray
    }
  }

  private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1),
    sqlContext.sparkContext.defaultParallelism)

  private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism)

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.map { r =>
      numOutputRows += 1
      r
    }
  }

  override protected def stringArgs: Iterator[Any] = {
    if (rows.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def executeCollect(): Array[InternalRow] = {
    longMetric("numOutputRows").add(unsafeRows.size)
    unsafeRows
  }

  override def executeTake(limit: Int): Array[InternalRow] = {
    val taken = unsafeRows.take(limit)
    longMetric("numOutputRows").add(taken.size)
    taken
  }
}
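The `rows.map(r => proj(r).copy())` pattern above exists because an UnsafeProjection reuses a single UnsafeRow buffer across calls. A minimal sketch of the same pattern outside an operator, assuming catalyst classes are available (the object name and attributes are illustrative):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.IntegerType

object AttributeProjectionSketch {
  def main(args: Array[String]): Unit = {
    val output = Seq(AttributeReference("id", IntegerType, nullable = false)())
    // create(exprs, inputSchema): here the output attributes are also the input schema.
    val proj = UnsafeProjection.create(output, output)

    val rows: Seq[InternalRow] = Seq(
      new GenericInternalRow(Array[Any](1)),
      new GenericInternalRow(Array[Any](2)))

    // copy() each result, because proj(r) always returns the same reused buffer.
    val unsafeRows = rows.map(r => proj(r).copy()).toArray
    println(unsafeRows.map(_.getInt(0)).mkString(",")) // 1,2
  }
}

Without the copy(), every element of the array would point at the same buffer and hold only the last row's values.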
Example 27
Source File: ObjectAggregationMap.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate

import java.{util => ju}

import org.apache.spark.{SparkEnv, TaskContext}
import org.apache.spark.internal.config
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateFunction, TypedImperativeAggregate}
import org.apache.spark.sql.execution.UnsafeKVExternalSorter
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.collection.unsafe.sort.UnsafeExternalSorter

  def dumpToExternalSorter(
      groupingAttributes: Seq[Attribute],
      aggregateFunctions: Seq[AggregateFunction]): UnsafeKVExternalSorter = {
    val aggBufferAttributes = aggregateFunctions.flatMap(_.aggBufferAttributes)
    val sorter = new UnsafeKVExternalSorter(
      StructType.fromAttributes(groupingAttributes),
      StructType.fromAttributes(aggBufferAttributes),
      SparkEnv.get.blockManager,
      SparkEnv.get.serializerManager,
      TaskContext.get().taskMemoryManager().pageSizeBytes,
      SparkEnv.get.conf.get(config.SHUFFLE_SPILL_NUM_ELEMENTS_FORCE_SPILL_THRESHOLD),
      null
    )

    val mapIterator = iterator
    val unsafeAggBufferProjection =
      UnsafeProjection.create(aggBufferAttributes.map(_.dataType).toArray)

    while (mapIterator.hasNext) {
      val entry = mapIterator.next()
      aggregateFunctions.foreach {
        case agg: TypedImperativeAggregate[_] =>
          agg.serializeAggregateBufferInPlace(entry.aggregationBuffer)
        case _ =>
      }

      sorter.insertKV(
        entry.groupingKey,
        unsafeAggBufferProjection(entry.aggregationBuffer)
      )
    }

    hashMap.clear()
    sorter
  }

  def clear(): Unit = {
    hashMap.clear()
  }
}

// Stores the grouping key and aggregation buffer
class AggregationBufferEntry(var groupingKey: UnsafeRow, var aggregationBuffer: InternalRow)
Example 28
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import java.util.concurrent.TimeUnit.NANOSECONDS

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.catalyst.expressions.UnsafeRow
import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning}
import org.apache.spark.sql.catalyst.streaming.InternalOutputModes
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.execution.streaming.state.StateStoreOps
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}
import org.apache.spark.util.CompletionIterator

case class StreamingGlobalLimitExec(
    streamLimit: Long,
    child: SparkPlan,
    stateInfo: Option[StatefulOperatorStateInfo] = None,
    outputMode: Option[OutputMode] = None)
  extends UnaryExecNode with StateStoreWriter {

  private val keySchema = StructType(Array(StructField("key", NullType)))
  private val valueSchema = StructType(Array(StructField("value", LongType)))

  override protected def doExecute(): RDD[InternalRow] = {
    metrics // force lazy init at driver

    assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append,
      "StreamingGlobalLimitExec is only valid for streams in Append output mode")

    child.execute().mapPartitionsWithStateStore(
        getStateInfo,
        keySchema,
        valueSchema,
        indexOrdinal = None,
        sqlContext.sessionState,
        Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) =>
      val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
      val numOutputRows = longMetric("numOutputRows")
      val numUpdatedStateRows = longMetric("numUpdatedStateRows")
      val allUpdatesTimeMs = longMetric("allUpdatesTimeMs")
      val commitTimeMs = longMetric("commitTimeMs")
      val updatesStartTimeNs = System.nanoTime

      val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L)
      var cumulativeRowCount = preBatchRowCount

      val result = iter.filter { r =>
        val x = cumulativeRowCount < streamLimit
        if (x) {
          cumulativeRowCount += 1
        }
        x
      }

      CompletionIterator[InternalRow, Iterator[InternalRow]](result, {
        if (cumulativeRowCount > preBatchRowCount) {
          numUpdatedStateRows += 1
          numOutputRows += cumulativeRowCount - preBatchRowCount
          store.put(key, getValueRow(cumulativeRowCount))
        }
        allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs)
        commitTimeMs += timeTakenMs { store.commit() }
        setStoreMetrics(store)
      })
    }
  }

  override def output: Seq[Attribute] = child.output

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil

  private def getValueRow(value: Long): UnsafeRow = {
    UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value)))
  }
}
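Both the state-store key and the value row above are built by applying a schema-based UnsafeProjection to a one-field GenericInternalRow. A minimal, hypothetical sketch of just that pattern (the object name and sample count are illustrative), assuming catalyst classes are available:

import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection, UnsafeRow}
import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType}

object SingleRowKeySketch {
  def main(args: Array[String]): Unit = {
    val keySchema = StructType(Array(StructField("key", NullType)))
    val valueSchema = StructType(Array(StructField("value", LongType)))

    // A fixed all-null key row: the NullType column is always null.
    val key: UnsafeRow =
      UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null)))
    // A value row carrying a running count, rebuilt each time it is stored.
    val value: UnsafeRow =
      UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](7L)))

    println(key.isNullAt(0))  // true
    println(value.getLong(0)) // 7
  }
}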
Example 29
Source File: EventTimeWatermarkExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
}
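Here `UnsafeProjection.create(eventTime :: Nil, child.output)` builds a projection that pulls a single column out of the child's rows. A small, hypothetical sketch of that column-extraction pattern (attribute names and the sample timestamp are illustrative), assuming catalyst classes are available:

import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.{LongType, StringType}
import org.apache.spark.unsafe.types.UTF8String

object ExtractColumnSketch {
  def main(args: Array[String]): Unit = {
    val name = AttributeReference("name", StringType)()
    val eventTime = AttributeReference("eventTime", LongType, nullable = false)()
    val childOutput = Seq(name, eventTime)

    // Projects only the eventTime column out of the two-column input rows.
    val getEventTime = UnsafeProjection.create(eventTime :: Nil, childOutput)

    val row = new GenericInternalRow(Array[Any](UTF8String.fromString("a"), 1556000000000L))
    println(getEventTime(row).getLong(0) / 1000) // event time in seconds
  }
}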
Example 30
Source File: NullableColumnBuilderSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType])
  extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType)
  with NullableColumnBuilder

object TestNullableColumnBuilder {
  def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0)
    : TestNullableColumnBuilder[JvmType] = {
    val builder = new TestNullableColumnBuilder(columnType)
    builder.initialize(initialSize)
    builder
  }
}

class NullableColumnBuilderSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY,
    COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnBuilder(_)
    }

  def testNullableColumnBuilder[JvmType](
      columnType: ColumnType[JvmType]): Unit = {
    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val dataType = columnType.dataType
    val proj = UnsafeProjection.create(Array[DataType](dataType))
    val converter = CatalystTypeConverters.createToScalaConverter(dataType)

    test(s"$typeName column builder: empty column") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
      assert(!buffer.hasRemaining)
    }

    test(s"$typeName column builder: buffer size auto growth") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
    }

    test(s"$typeName column builder: null values") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val nullRow = makeNullRow(1)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
        columnBuilder.appendFrom(proj(nullRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(4, "Wrong null count")(buffer.getInt())

      // For null positions
      (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt()))

      // For non-null values
      val actual = new GenericInternalRow(new Array[Any](1))
      (0 until 4).foreach { _ =>
        columnType.extract(buffer, actual, 0)
        assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)),
          "Extracted value didn't equal to the original one")
      }

      assert(!buffer.hasRemaining)
    }
  }
}
Example 31
Source File: EventTimeWatermarkExec.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  override def user: String = child.user

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 32
Source File: GenerateUnsafeRowJoinerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

class GenerateUnsafeRowJoinerSuite extends SparkFunSuite {

  private val fixed = Seq(IntegerType)
  private val variable = Seq(IntegerType, StringType)

  test("simple fixed width types") {
    testConcat(0, 0, fixed)
    testConcat(0, 1, fixed)
    testConcat(1, 0, fixed)
    testConcat(64, 0, fixed)
    testConcat(0, 64, fixed)
    testConcat(64, 64, fixed)
  }

  test("randomized fix width types") {
    for (i <- 0 until 20) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed)
    }
  }

  test("simple variable width types") {
    testConcat(0, 0, variable)
    testConcat(0, 1, variable)
    testConcat(1, 0, variable)
    testConcat(64, 0, variable)
    testConcat(0, 64, variable)
    testConcat(64, 64, variable)
  }

  test("randomized variable width types") {
    for (i <- 0 until 10) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable)
    }
  }

  private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = {
    for (i <- 0 until 10) {
      testConcatOnce(numFields1, numFields2, candidateTypes)
    }
  }

  private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) {
    info(s"schema size $numFields1, $numFields2")
    val random = new Random()
    val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes)
    val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes)

    // Create the converters needed to convert from external row to internal row and to UnsafeRows.
    val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1)
    val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2)
    val converter1 = UnsafeProjection.create(schema1)
    val converter2 = UnsafeProjection.create(schema2)

    // Create the input rows, convert them into UnsafeRows.
    val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply()
    val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply()
    val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow])
    val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow])

    // Run the joiner.
    val mergedSchema = StructType(schema1 ++ schema2)
    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
    val output = concater.join(row1, row2)

    // Test everything equals ...
    for (i <- mergedSchema.indices) {
      if (i < schema1.size) {
        assert(output.isNullAt(i) === row1.isNullAt(i))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType))
        }
      } else {
        assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) ===
            row2.get(i - schema1.size, mergedSchema(i).dataType))
        }
      }
    }
  }
}
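The joiner in this suite concatenates two UnsafeRows produced by UnsafeProjection. A trimmed-down, hypothetical sketch of that flow, placed in the same package as the suite above to mirror how it reaches GenerateUnsafeRowJoiner (the object name and sample schemas are illustrative):

package org.apache.spark.sql.catalyst.expressions.codegen

import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

object RowJoinerSketch {
  def main(args: Array[String]): Unit = {
    val schema1 = StructType(Seq(StructField("a", IntegerType)))
    val schema2 = StructType(Seq(StructField("b", LongType)))

    // Two independent projections, so each row has its own buffer.
    val row1 = UnsafeProjection.create(schema1)(new GenericInternalRow(Array[Any](1))).copy()
    val row2 = UnsafeProjection.create(schema2)(new GenericInternalRow(Array[Any](2L))).copy()

    // The generated joiner concatenates the two UnsafeRows without re-encoding fields.
    val joiner = GenerateUnsafeRowJoiner.create(schema1, schema2)
    val joined = joiner.join(row1, row2)
    println(joined.getInt(0) + " " + joined.getLong(1)) // 1 2
  }
}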
Example 33
Source File: LocalTableScanExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.metric.SQLMetrics

case class LocalTableScanExec(
    output: Seq[Attribute],
    rows: Seq[InternalRow]) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  private val unsafeRows: Array[InternalRow] = {
    if (rows.isEmpty) {
      Array.empty
    } else {
      val proj = UnsafeProjection.create(output, output)
      rows.map(r => proj(r).copy()).toArray
    }
  }

  private lazy val numParallelism: Int = math.min(math.max(unsafeRows.length, 1),
    sqlContext.sparkContext.defaultParallelism)

  private lazy val rdd = sqlContext.sparkContext.parallelize(unsafeRows, numParallelism)

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.map { r =>
      numOutputRows += 1
      r
    }
  }

  override protected def stringArgs: Iterator[Any] = {
    if (rows.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def executeCollect(): Array[InternalRow] = {
    longMetric("numOutputRows").add(unsafeRows.size)
    unsafeRows
  }

  override def executeTake(limit: Int): Array[InternalRow] = {
    val taken = unsafeRows.take(limit)
    longMetric("numOutputRows").add(taken.size)
    taken
  }
}
Example 34
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 35
Source File: NullableColumnBuilderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType])
  extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType)
  with NullableColumnBuilder

object TestNullableColumnBuilder {
  def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0)
    : TestNullableColumnBuilder[JvmType] = {
    val builder = new TestNullableColumnBuilder(columnType)
    builder.initialize(initialSize)
    builder
  }
}

class NullableColumnBuilderSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY,
    COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnBuilder(_)
    }

  def testNullableColumnBuilder[JvmType](
      columnType: ColumnType[JvmType]): Unit = {
    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val dataType = columnType.dataType
    val proj = UnsafeProjection.create(Array[DataType](dataType))
    val converter = CatalystTypeConverters.createToScalaConverter(dataType)

    test(s"$typeName column builder: empty column") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
      assert(!buffer.hasRemaining)
    }

    test(s"$typeName column builder: buffer size auto growth") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
    }

    test(s"$typeName column builder: null values") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val nullRow = makeNullRow(1)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
        columnBuilder.appendFrom(proj(nullRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(4, "Wrong null count")(buffer.getInt())

      // For null positions
      (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt()))

      // For non-null values
      val actual = new GenericInternalRow(new Array[Any](1))
      (0 until 4).foreach { _ =>
        columnType.extract(buffer, actual, 0)
        assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)),
          "Extracted value didn't equal to the original one")
      }

      assert(!buffer.hasRemaining)
    }
  }
}
Example 36
Source File: NullableColumnAccessorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import java.nio.ByteBuffer

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

class TestNullableColumnAccessor[JvmType](
    buffer: ByteBuffer,
    columnType: ColumnType[JvmType])
  extends BasicColumnAccessor(buffer, columnType)
  with NullableColumnAccessor

object TestNullableColumnAccessor {
  def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType])
    : TestNullableColumnAccessor[JvmType] = {
    new TestNullableColumnAccessor(buffer, columnType)
  }
}

class NullableColumnAccessorSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnAccessor(_)
    }

  def testNullableColumnAccessor[JvmType](
      columnType: ColumnType[JvmType]): Unit = {
    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val nullRow = makeNullRow(1)

    test(s"Nullable $typeName column accessor: empty column") {
      val builder = TestNullableColumnBuilder(columnType)
      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      assert(!accessor.hasNext)
    }

    test(s"Nullable $typeName column accessor: access null values") {
      val builder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val proj = UnsafeProjection.create(Array[DataType](columnType.dataType))

      (0 until 4).foreach { _ =>
        builder.appendFrom(proj(randomRow), 0)
        builder.appendFrom(proj(nullRow), 0)
      }

      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      val row = new GenericInternalRow(1)
      val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType)

      (0 until 4).foreach { _ =>
        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(converter(row.get(0, columnType.dataType))
          === converter(randomRow.get(0, columnType.dataType)))

        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.isNullAt(0))
      }

      assert(!accessor.hasNext)
    }
  }
}
Example 37
Source File: SparkUnsafeRowReadSuport.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.carbondata.execution.datasources.readsupport

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.StructType

import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn
import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport

class SparkUnsafeRowReadSuport(requiredSchema: StructType) extends CarbonReadSupport[InternalRow] {
  private val unsafeProjection = UnsafeProjection.create(requiredSchema)

  override def initialize(carbonColumns: Array[CarbonColumn],
      carbonTable: CarbonTable): Unit = {
  }

  override def readRow(data: Array[AnyRef]): InternalRow = {
    unsafeProjection(new GenericInternalRow(data.asInstanceOf[Array[Any]]))
  }

  override def close(): Unit = {
    // Nothing to close
  }
}
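The readRow method above wraps an Array[AnyRef] of already-decoded column values in a GenericInternalRow and projects it into an UnsafeRow. A minimal, hypothetical sketch of that conversion outside the reader (object name and sample values are illustrative), assuming the values are already in catalyst internal form:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

object ReadSupportStyleSketch {
  def main(args: Array[String]): Unit = {
    val requiredSchema = StructType(Seq(
      StructField("id", IntegerType),
      StructField("name", StringType)))
    val unsafeProjection = UnsafeProjection.create(requiredSchema)

    // Column values in catalyst internal form: boxed Int and UTF8String.
    val data: Array[AnyRef] = Array(Integer.valueOf(7), UTF8String.fromString("carbon"))
    val row: InternalRow = unsafeProjection(new GenericInternalRow(data.asInstanceOf[Array[Any]]))
    println(row.getInt(0) + " " + row.getUTF8String(1)) // 7 carbon
  }
}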
Example 38
Source File: GenerateUnsafeRowJoinerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

class GenerateUnsafeRowJoinerSuite extends SparkFunSuite {

  private val fixed = Seq(IntegerType)
  private val variable = Seq(IntegerType, StringType)

  test("simple fixed width types") {
    testConcat(0, 0, fixed)
    testConcat(0, 1, fixed)
    testConcat(1, 0, fixed)
    testConcat(64, 0, fixed)
    testConcat(0, 64, fixed)
    testConcat(64, 64, fixed)
  }

  test("randomized fix width types") {
    for (i <- 0 until 20) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed)
    }
  }

  test("simple variable width types") {
    testConcat(0, 0, variable)
    testConcat(0, 1, variable)
    testConcat(1, 0, variable)
    testConcat(64, 0, variable)
    testConcat(0, 64, variable)
    testConcat(64, 64, variable)
  }

  test("randomized variable width types") {
    for (i <- 0 until 10) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable)
    }
  }

  private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = {
    for (i <- 0 until 10) {
      testConcatOnce(numFields1, numFields2, candidateTypes)
    }
  }

  private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) {
    info(s"schema size $numFields1, $numFields2")
    val random = new Random()
    val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes)
    val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes)

    // Create the converters needed to convert from external row to internal row and to UnsafeRows.
    val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1)
    val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2)
    val converter1 = UnsafeProjection.create(schema1)
    val converter2 = UnsafeProjection.create(schema2)

    // Create the input rows, convert them into UnsafeRows.
    val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply()
    val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply()
    val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow])
    val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow])

    // Run the joiner.
    val mergedSchema = StructType(schema1 ++ schema2)
    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
    val output = concater.join(row1, row2)

    // Test everything equals ...
    for (i <- mergedSchema.indices) {
      if (i < schema1.size) {
        assert(output.isNullAt(i) === row1.isNullAt(i))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType))
        }
      } else {
        assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) ===
            row2.get(i - schema1.size, mergedSchema(i).dataType))
        }
      }
    }
  }
}
Example 39
Source File: SnowflakePlan.scala From spark-snowflake with Apache License 2.0 | 5 votes |
package net.snowflake.spark.snowflake.pushdowns

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.{StructField, StructType}

case class SnowflakePlan(output: Seq[Attribute], rdd: RDD[InternalRow])
  extends SparkPlan {

  override def children: Seq[SparkPlan] = Nil

  protected override def doExecute(): RDD[InternalRow] = {
    val schema = StructType(
      output.map(attr => StructField(attr.name, attr.dataType, attr.nullable))
    )

    rdd.mapPartitions { iter =>
      val project = UnsafeProjection.create(schema)
      iter.map(project)
    }
  }
}
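Note the per-partition pattern here: one projection is created inside mapPartitions and then applied lazily to every row with `iter.map(project)`, with no copy() because each projected row is consumed downstream before the next call reuses the buffer. A small local sketch of the same idea over a plain iterator, assuming catalyst classes are available (the object name and sample data are illustrative):

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.{IntegerType, StructField, StructType}

object PerPartitionProjectionSketch {
  def main(args: Array[String]): Unit = {
    val schema = StructType(Seq(StructField("c1", IntegerType)))
    val rows: Iterator[InternalRow] =
      Iterator(1, 2, 3).map(i => new GenericInternalRow(Array[Any](i)))

    // One projection per iterator (standing in for one per partition), applied lazily.
    val project = UnsafeProjection.create(schema)
    val projected = rows.map(project)

    // Safe without copy() only because each row is consumed before the buffer is reused.
    projected.foreach(r => println(r.getInt(0)))
  }
}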