org.apache.spark.sql.catalyst.expressions.GenericInternalRow Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.expressions.GenericInternalRow.
The project and original source file for each example are noted in the heading above it.
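Before the project-specific examples, here is a minimal, self-contained sketch (not taken from any of the projects below; the object name is made up for illustration) of the core GenericInternalRow calls that recur throughout them: constructing a fixed-length row, writing primitive, string, and null fields, and reading them back.

import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

object GenericInternalRowBasics {
  def main(args: Array[String]): Unit = {
    // A GenericInternalRow is a mutable, fixed-length internal row backed by Array[Any].
    val row = new GenericInternalRow(3)
    row.setInt(0, 42)                             // primitive field
    row.update(1, UTF8String.fromString("hello")) // strings are stored as UTF8String
    row.setNullAt(2)                              // mark a field as null

    // Values are read back with type-specific getters or get(ordinal, dataType).
    assert(row.getInt(0) == 42)
    assert(row.get(1, StringType) == UTF8String.fromString("hello"))
    assert(row.isNullAt(2))

    // Alternatively, wrap an existing Array[Any] directly.
    val wrapped = new GenericInternalRow(Array[Any](1L, null))
    assert(wrapped.getLong(0) == 1L)
  }
}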
Example 1
Source File: NullableColumnBuilderSuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.execution.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType])
  extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType)
  with NullableColumnBuilder

object TestNullableColumnBuilder {
  def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0)
    : TestNullableColumnBuilder[JvmType] = {
    val builder = new TestNullableColumnBuilder(columnType)
    builder.initialize(initialSize)
    builder
  }
}

class NullableColumnBuilderSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnBuilder(_)
    }

  def testNullableColumnBuilder[JvmType](
      columnType: ColumnType[JvmType]): Unit = {
    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val dataType = columnType.dataType
    val proj = UnsafeProjection.create(Array[DataType](dataType))
    val converter = CatalystTypeConverters.createToScalaConverter(dataType)

    test(s"$typeName column builder: empty column") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
      assert(!buffer.hasRemaining)
    }

    test(s"$typeName column builder: buffer size auto growth") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
    }

    test(s"$typeName column builder: null values") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val nullRow = makeNullRow(1)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
        columnBuilder.appendFrom(proj(nullRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(4, "Wrong null count")(buffer.getInt())

      // For null positions
      (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt()))

      // For non-null values
      val actual = new GenericInternalRow(new Array[Any](1))
      (0 until 4).foreach { _ =>
        columnType.extract(buffer, actual, 0)
        assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)),
          "Extracted value didn't equal to the original one")
      }

      assert(!buffer.hasRemaining)
    }
  }
}
Example 2
Source File: MatrixUDT.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._

private[spark] class MatrixUDT extends UserDefinedType[Matrix] {

  override def sqlType: StructType = {
    // type: 0 = sparse, 1 = dense
    // the dense matrix is built by numRows, numCols, values and isTransposed, all of which are
    // set as not nullable, except values since in the future, support for binary matrices might
    // be added for which values are not needed.
    // the sparse matrix needs colPtrs and rowIndices, which are set as
    // null, while building the dense matrix.
    StructType(Seq(
      StructField("type", ByteType, nullable = false),
      StructField("numRows", IntegerType, nullable = false),
      StructField("numCols", IntegerType, nullable = false),
      StructField("colPtrs", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("rowIndices", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true),
      StructField("isTransposed", BooleanType, nullable = false)
    ))
  }

  override def serialize(obj: Matrix): InternalRow = {
    val row = new GenericInternalRow(7)
    obj match {
      case sm: SparseMatrix =>
        row.setByte(0, 0)
        row.setInt(1, sm.numRows)
        row.setInt(2, sm.numCols)
        row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs))
        row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices))
        row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values))
        row.setBoolean(6, sm.isTransposed)
      case dm: DenseMatrix =>
        row.setByte(0, 1)
        row.setInt(1, dm.numRows)
        row.setInt(2, dm.numCols)
        row.setNullAt(3)
        row.setNullAt(4)
        row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values))
        row.setBoolean(6, dm.isTransposed)
    }
    row
  }

  override def deserialize(datum: Any): Matrix = {
    datum match {
      case row: InternalRow =>
        require(row.numFields == 7,
          s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7")
        val tpe = row.getByte(0)
        val numRows = row.getInt(1)
        val numCols = row.getInt(2)
        val values = row.getArray(5).toDoubleArray()
        val isTransposed = row.getBoolean(6)
        tpe match {
          case 0 =>
            val colPtrs = row.getArray(3).toIntArray()
            val rowIndices = row.getArray(4).toIntArray()
            new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed)
          case 1 =>
            new DenseMatrix(numRows, numCols, values, isTransposed)
        }
    }
  }

  override def userClass: Class[Matrix] = classOf[Matrix]

  override def equals(o: Any): Boolean = {
    o match {
      case v: MatrixUDT => true
      case _ => false
    }
  }

  // see [SPARK-8647], this achieves the needed constant hash code without constant no.
  override def hashCode(): Int = classOf[MatrixUDT].getName.hashCode()

  override def typeName: String = "matrix"

  override def pyUDT: String = "pyspark.ml.linalg.MatrixUDT"

  private[spark] override def asNullable: MatrixUDT = this
}
Example 3
Source File: ColumnStatsSuite.scala From spark1.52 with Apache License 2.0

package org.apache.spark.sql.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types._

// Column statistics test suite
class ColumnStatsSuite extends SparkFunSuite {
  testColumnStats(classOf[BooleanColumnStats], BOOLEAN, createRow(true, false, 0))
  testColumnStats(classOf[ByteColumnStats], BYTE, createRow(Byte.MaxValue, Byte.MinValue, 0))
  testColumnStats(classOf[ShortColumnStats], SHORT, createRow(Short.MaxValue, Short.MinValue, 0))
  testColumnStats(classOf[IntColumnStats], INT, createRow(Int.MaxValue, Int.MinValue, 0))
  testColumnStats(classOf[DateColumnStats], DATE, createRow(Int.MaxValue, Int.MinValue, 0))
  testColumnStats(classOf[LongColumnStats], LONG, createRow(Long.MaxValue, Long.MinValue, 0))
  testColumnStats(classOf[TimestampColumnStats], TIMESTAMP, createRow(Long.MaxValue, Long.MinValue, 0))
  testColumnStats(classOf[FloatColumnStats], FLOAT, createRow(Float.MaxValue, Float.MinValue, 0))
  testColumnStats(classOf[DoubleColumnStats], DOUBLE, createRow(Double.MaxValue, Double.MinValue, 0))
  testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0))
  testDecimalColumnStats(createRow(null, null, 0))

  def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray)

  // Test column statistics
  def testColumnStats[T <: AtomicType, U <: ColumnStats](
      columnStatsClass: Class[U],
      columnType: NativeColumnType[T],
      initialStatistics: GenericInternalRow): Unit = {

    val columnStatsName = columnStatsClass.getSimpleName

    test(s"$columnStatsName: empty") {
      val columnStats = columnStatsClass.newInstance()
      columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach {
        case (actual, expected) => assert(actual === expected)
      }
    }

    test(s"$columnStatsName: non-empty") { // non-empty
      import org.apache.spark.sql.columnar.ColumnarTestUtils._
      val columnStats = columnStatsClass.newInstance()
      val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
      rows.foreach(columnStats.gatherStats(_, 0))

      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
      val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
      val stats = columnStats.collectedStatistics

      assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0))
      assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1))
      assertResult(10, "Wrong null count")(stats.values(2))
      assertResult(20, "Wrong row count")(stats.values(3))
      assertResult(stats.values(4), "Wrong size in bytes") {
        rows.map { row =>
          if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
        }.sum
      }
    }
  }

  // Test decimal column statistics
  def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats](
      initialStatistics: GenericInternalRow): Unit = {

    val columnStatsName = classOf[FixedDecimalColumnStats].getSimpleName
    val columnType = FIXED_DECIMAL(15, 10)

    test(s"$columnStatsName: empty") {
      val columnStats = new FixedDecimalColumnStats(15, 10)
      columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach {
        case (actual, expected) => assert(actual === expected)
      }
    }

    test(s"$columnStatsName: non-empty") { // non-empty
      import org.apache.spark.sql.columnar.ColumnarTestUtils._
      val columnStats = new FixedDecimalColumnStats(15, 10)
      val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
      rows.foreach(columnStats.gatherStats(_, 0))

      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
      val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
      val stats = columnStats.collectedStatistics

      assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0))
      assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1))
      assertResult(10, "Wrong null count")(stats.values(2))
      assertResult(20, "Wrong row count")(stats.values(3))
      assertResult(stats.values(4), "Wrong size in bytes") {
        rows.map { row =>
          if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
        }.sum
      }
    }
  }
}
Example 4
Source File: ExtraStrategiesSuite.scala From spark1.52 with Apache License 2.0

package test.org.apache.spark.sql

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute}
import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.{Row, Strategy, QueryTest}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.unsafe.types.UTF8String

// Fast operator
case class FastOperator(output: Seq[Attribute]) extends SparkPlan {

  override protected def doExecute(): RDD[InternalRow] = {
    val str = Literal("so fast").value
    val row = new GenericInternalRow(Array[Any](str))
    sparkContext.parallelize(Seq(row))
  }

  // Nil is an empty List
  override def children: Seq[SparkPlan] = Nil
}

// Test strategy
object TestStrategy extends Strategy {
  def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match {
    case Project(Seq(attr), _) if attr.name == "a" =>
      // Nil is an empty List; :: prepends an element, creating a new list
      FastOperator(attr.toAttribute :: Nil) :: Nil
    case _ => Nil
  }
}

// Extra strategies suite
class ExtraStrategiesSuite extends QueryTest with SharedSQLContext {
  import testImplicits._

  test("insert an extraStrategy") { // insert an extra strategy
    try {
      // Nil is an empty List; :: prepends an element, creating a new list
      sqlContext.experimental.extraStrategies = TestStrategy :: Nil

      val df = sqlContext.sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b")
      checkAnswer(
        df.select("a"),
        Row("so fast"))

      checkAnswer(
        df.select("a", "b"),
        Row("so slow", 1))
    } finally {
      // Nil is an empty List; :: prepends an element, creating a new list
      sqlContext.experimental.extraStrategies = Nil
    }
  }
}
Example 5
Source File: UdtEncodedClass.scala From frameless with Apache License 2.0

package frameless

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._
import org.apache.spark.sql.FramelessInternals.UserDefinedType

@SQLUserDefinedType(udt = classOf[UdtEncodedClassUdt])
class UdtEncodedClass(val a: Int, val b: Array[Double]) {
  override def equals(other: Any): Boolean = other match {
    case that: UdtEncodedClass => a == that.a && java.util.Arrays.equals(b, that.b)
    case _ => false
  }

  override def hashCode(): Int = {
    val state = Seq[Any](a, b)
    state.map(_.hashCode()).foldLeft(0)((a, b) => 31 * a + b)
  }

  override def toString = s"UdtEncodedClass($a, $b)"
}

object UdtEncodedClass {
  implicit val udtForUdtEncodedClass = new UdtEncodedClassUdt
}

class UdtEncodedClassUdt extends UserDefinedType[UdtEncodedClass] {
  def sqlType: DataType = {
    StructType(Seq(
      StructField("a", IntegerType, nullable = false),
      StructField("b", ArrayType(DoubleType, containsNull = false), nullable = false)
    ))
  }

  def serialize(obj: UdtEncodedClass): InternalRow = {
    val row = new GenericInternalRow(3)
    row.setInt(0, obj.a)
    row.update(1, UnsafeArrayData.fromPrimitiveArray(obj.b))
    row
  }

  def deserialize(datum: Any): UdtEncodedClass = datum match {
    case row: InternalRow => new UdtEncodedClass(row.getInt(0), row.getArray(1).toDoubleArray())
  }

  def userClass: Class[UdtEncodedClass] = classOf[UdtEncodedClass]
}
Example 6
Source File: RowSuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, SpecificInternalRow}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

class RowSuite extends SparkFunSuite with SharedSQLContext {
  import testImplicits._

  test("create row") {
    val expected = new GenericInternalRow(4)
    expected.setInt(0, 2147483647)
    expected.update(1, UTF8String.fromString("this is a string"))
    expected.setBoolean(2, false)
    expected.setNullAt(3)

    val actual1 = Row(2147483647, "this is a string", false, null)
    assert(expected.numFields === actual1.size)
    assert(expected.getInt(0) === actual1.getInt(0))
    assert(expected.getString(1) === actual1.getString(1))
    assert(expected.getBoolean(2) === actual1.getBoolean(2))
    assert(expected.isNullAt(3) === actual1.isNullAt(3))

    val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null))
    assert(expected.numFields === actual2.size)
    assert(expected.getInt(0) === actual2.getInt(0))
    assert(expected.getString(1) === actual2.getString(1))
    assert(expected.getBoolean(2) === actual2.getBoolean(2))
    assert(expected.isNullAt(3) === actual2.isNullAt(3))
  }

  test("SpecificMutableRow.update with null") {
    val row = new SpecificInternalRow(Seq(IntegerType))
    row(0) = null
    assert(row.isNullAt(0))
  }

  test("get values by field name on Row created via .toDF") {
    val row = Seq((1, Seq(1))).toDF("a", "b").first()
    assert(row.getAs[Int]("a") === 1)
    assert(row.getAs[Seq[Int]]("b") === Seq(1))

    intercept[IllegalArgumentException] {
      row.getAs[Int]("c")
    }
  }

  test("float NaN == NaN") {
    val r1 = Row(Float.NaN)
    val r2 = Row(Float.NaN)
    assert(r1 === r2)
  }

  test("double NaN == NaN") {
    val r1 = Row(Double.NaN)
    val r2 = Row(Double.NaN)
    assert(r1 === r2)
  }

  test("equals and hashCode") {
    val r1 = Row("Hello")
    val r2 = Row("Hello")
    assert(r1 === r2)
    assert(r1.hashCode() === r2.hashCode())
    val r3 = Row("World")
    assert(r3.hashCode() != r1.hashCode())
  }
}
Example 7
Source File: ColumnarTestUtils.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericInternalRow = {
    val row = new GenericInternalRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
      case _ => throw new IllegalArgumentException(s"Unknown column type $columnType")
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {
    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericInternalRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = {
    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericInternalRow(1)
      row(0) = value
      row
    }
    (values, rows)
  }
}
Example 8
Source File: NullableColumnAccessorSuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.execution.columnar

import java.nio.ByteBuffer

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

class TestNullableColumnAccessor[JvmType](
    buffer: ByteBuffer,
    columnType: ColumnType[JvmType])
  extends BasicColumnAccessor(buffer, columnType)
  with NullableColumnAccessor

object TestNullableColumnAccessor {
  def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType])
    : TestNullableColumnAccessor[JvmType] = {
    new TestNullableColumnAccessor(buffer, columnType)
  }
}

class NullableColumnAccessorSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnAccessor(_)
    }

  def testNullableColumnAccessor[JvmType](
      columnType: ColumnType[JvmType]): Unit = {
    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val nullRow = makeNullRow(1)

    test(s"Nullable $typeName column accessor: empty column") {
      val builder = TestNullableColumnBuilder(columnType)
      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      assert(!accessor.hasNext)
    }

    test(s"Nullable $typeName column accessor: access null values") {
      val builder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val proj = UnsafeProjection.create(Array[DataType](columnType.dataType))

      (0 until 4).foreach { _ =>
        builder.appendFrom(proj(randomRow), 0)
        builder.appendFrom(proj(nullRow), 0)
      }

      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      val row = new GenericInternalRow(1)
      val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType)

      (0 until 4).foreach { _ =>
        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(converter(row.get(0, columnType.dataType))
          === converter(randomRow.get(0, columnType.dataType)))

        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.isNullAt(0))
      }

      assert(!accessor.hasNext)
    }
  }
}
Example 9
Source File: ColumnStatsSuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.execution.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types._

class ColumnStatsSuite extends SparkFunSuite {
  testColumnStats(classOf[BooleanColumnStats], BOOLEAN, createRow(true, false, 0))
  testColumnStats(classOf[ByteColumnStats], BYTE, createRow(Byte.MaxValue, Byte.MinValue, 0))
  testColumnStats(classOf[ShortColumnStats], SHORT, createRow(Short.MaxValue, Short.MinValue, 0))
  testColumnStats(classOf[IntColumnStats], INT, createRow(Int.MaxValue, Int.MinValue, 0))
  testColumnStats(classOf[LongColumnStats], LONG, createRow(Long.MaxValue, Long.MinValue, 0))
  testColumnStats(classOf[FloatColumnStats], FLOAT, createRow(Float.MaxValue, Float.MinValue, 0))
  testColumnStats(classOf[DoubleColumnStats], DOUBLE, createRow(Double.MaxValue, Double.MinValue, 0))
  testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0))
  testDecimalColumnStats(createRow(null, null, 0))

  def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray)

  def testColumnStats[T <: AtomicType, U <: ColumnStats](
      columnStatsClass: Class[U],
      columnType: NativeColumnType[T],
      initialStatistics: GenericInternalRow): Unit = {

    val columnStatsName = columnStatsClass.getSimpleName

    test(s"$columnStatsName: empty") {
      val columnStats = columnStatsClass.newInstance()
      columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach {
        case (actual, expected) => assert(actual === expected)
      }
    }

    test(s"$columnStatsName: non-empty") {
      import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._
      val columnStats = columnStatsClass.newInstance()
      val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
      rows.foreach(columnStats.gatherStats(_, 0))

      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
      val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
      val stats = columnStats.collectedStatistics

      assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0))
      assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1))
      assertResult(10, "Wrong null count")(stats.values(2))
      assertResult(20, "Wrong row count")(stats.values(3))
      assertResult(stats.values(4), "Wrong size in bytes") {
        rows.map { row =>
          if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
        }.sum
      }
    }
  }

  def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats](
      initialStatistics: GenericInternalRow): Unit = {

    val columnStatsName = classOf[DecimalColumnStats].getSimpleName
    val columnType = COMPACT_DECIMAL(15, 10)

    test(s"$columnStatsName: empty") {
      val columnStats = new DecimalColumnStats(15, 10)
      columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach {
        case (actual, expected) => assert(actual === expected)
      }
    }

    test(s"$columnStatsName: non-empty") {
      import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._
      val columnStats = new DecimalColumnStats(15, 10)
      val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
      rows.foreach(columnStats.gatherStats(_, 0))

      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
      val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
      val stats = columnStats.collectedStatistics

      assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0))
      assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1))
      assertResult(10, "Wrong null count")(stats.values(2))
      assertResult(20, "Wrong row count")(stats.values(3))
      assertResult(stats.values(4), "Wrong size in bytes") {
        rows.map { row =>
          if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
        }.sum
      }
    }
  }
}
Example 10
Source File: VectorUDT.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._

private[spark] class VectorUDT extends UserDefinedType[Vector] {

  override final def sqlType: StructType = _sqlType

  override def serialize(obj: Vector): InternalRow = {
    obj match {
      case SparseVector(size, indices, values) =>
        val row = new GenericInternalRow(4)
        row.setByte(0, 0)
        row.setInt(1, size)
        row.update(2, UnsafeArrayData.fromPrimitiveArray(indices))
        row.update(3, UnsafeArrayData.fromPrimitiveArray(values))
        row
      case DenseVector(values) =>
        val row = new GenericInternalRow(4)
        row.setByte(0, 1)
        row.setNullAt(1)
        row.setNullAt(2)
        row.update(3, UnsafeArrayData.fromPrimitiveArray(values))
        row
    }
  }

  override def deserialize(datum: Any): Vector = {
    datum match {
      case row: InternalRow =>
        require(row.numFields == 4,
          s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4")
        val tpe = row.getByte(0)
        tpe match {
          case 0 =>
            val size = row.getInt(1)
            val indices = row.getArray(2).toIntArray()
            val values = row.getArray(3).toDoubleArray()
            new SparseVector(size, indices, values)
          case 1 =>
            val values = row.getArray(3).toDoubleArray()
            new DenseVector(values)
        }
    }
  }

  override def pyUDT: String = "pyspark.ml.linalg.VectorUDT"

  override def userClass: Class[Vector] = classOf[Vector]

  override def equals(o: Any): Boolean = {
    o match {
      case v: VectorUDT => true
      case _ => false
    }
  }

  // see [SPARK-8647], this achieves the needed constant hash code without constant no.
  override def hashCode(): Int = classOf[VectorUDT].getName.hashCode()

  override def typeName: String = "vector"

  private[spark] override def asNullable: VectorUDT = this

  private[this] val _sqlType = {
    // type: 0 = sparse, 1 = dense
    // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse
    // vectors. The "values" field is nullable because we might want to add binary vectors later,
    // which uses "size" and "indices", but not "values".
    StructType(Seq(
      StructField("type", ByteType, nullable = false),
      StructField("size", IntegerType, nullable = true),
      StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true)))
  }
}
Example 11
Source File: VectorUDT.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._

private[spark] class VectorUDT extends UserDefinedType[Vector] {

  override def sqlType: StructType = {
    // type: 0 = sparse, 1 = dense
    // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse
    // vectors. The "values" field is nullable because we might want to add binary vectors later,
    // which uses "size" and "indices", but not "values".
    StructType(Seq(
      StructField("type", ByteType, nullable = false),
      StructField("size", IntegerType, nullable = true),
      StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true)))
  }

  override def serialize(obj: Vector): InternalRow = {
    obj match {
      case SparseVector(size, indices, values) =>
        val row = new GenericInternalRow(4)
        row.setByte(0, 0)
        row.setInt(1, size)
        row.update(2, UnsafeArrayData.fromPrimitiveArray(indices))
        row.update(3, UnsafeArrayData.fromPrimitiveArray(values))
        row
      case DenseVector(values) =>
        val row = new GenericInternalRow(4)
        row.setByte(0, 1)
        row.setNullAt(1)
        row.setNullAt(2)
        row.update(3, UnsafeArrayData.fromPrimitiveArray(values))
        row
    }
  }

  override def deserialize(datum: Any): Vector = {
    datum match {
      case row: InternalRow =>
        require(row.numFields == 4,
          s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4")
        val tpe = row.getByte(0)
        tpe match {
          case 0 =>
            val size = row.getInt(1)
            val indices = row.getArray(2).toIntArray()
            val values = row.getArray(3).toDoubleArray()
            new SparseVector(size, indices, values)
          case 1 =>
            val values = row.getArray(3).toDoubleArray()
            new DenseVector(values)
        }
    }
  }

  override def pyUDT: String = "pyspark.ml.linalg.VectorUDT"

  override def userClass: Class[Vector] = classOf[Vector]

  override def equals(o: Any): Boolean = {
    o match {
      case v: VectorUDT => true
      case _ => false
    }
  }

  // see [SPARK-8647], this achieves the needed constant hash code without constant no.
  override def hashCode(): Int = classOf[VectorUDT].getName.hashCode()

  override def typeName: String = "vector"

  private[spark] override def asNullable: VectorUDT = this
}
Example 12
Source File: MatrixUDT.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.ml.linalg

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._

private[spark] class MatrixUDT extends UserDefinedType[Matrix] {

  override def sqlType: StructType = {
    // type: 0 = sparse, 1 = dense
    // the dense matrix is built by numRows, numCols, values and isTransposed, all of which are
    // set as not nullable, except values since in the future, support for binary matrices might
    // be added for which values are not needed.
    // the sparse matrix needs colPtrs and rowIndices, which are set as
    // null, while building the dense matrix.
    StructType(Seq(
      StructField("type", ByteType, nullable = false),
      StructField("numRows", IntegerType, nullable = false),
      StructField("numCols", IntegerType, nullable = false),
      StructField("colPtrs", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("rowIndices", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true),
      StructField("isTransposed", BooleanType, nullable = false)
    ))
  }

  override def serialize(obj: Matrix): InternalRow = {
    val row = new GenericInternalRow(7)
    obj match {
      case sm: SparseMatrix =>
        row.setByte(0, 0)
        row.setInt(1, sm.numRows)
        row.setInt(2, sm.numCols)
        row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs))
        row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices))
        row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values))
        row.setBoolean(6, sm.isTransposed)
      case dm: DenseMatrix =>
        row.setByte(0, 1)
        row.setInt(1, dm.numRows)
        row.setInt(2, dm.numCols)
        row.setNullAt(3)
        row.setNullAt(4)
        row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values))
        row.setBoolean(6, dm.isTransposed)
    }
    row
  }

  override def deserialize(datum: Any): Matrix = {
    datum match {
      case row: InternalRow =>
        require(row.numFields == 7,
          s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7")
        val tpe = row.getByte(0)
        val numRows = row.getInt(1)
        val numCols = row.getInt(2)
        val values = row.getArray(5).toDoubleArray()
        val isTransposed = row.getBoolean(6)
        tpe match {
          case 0 =>
            val colPtrs = row.getArray(3).toIntArray()
            val rowIndices = row.getArray(4).toIntArray()
            new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed)
          case 1 =>
            new DenseMatrix(numRows, numCols, values, isTransposed)
        }
    }
  }

  override def userClass: Class[Matrix] = classOf[Matrix]

  override def equals(o: Any): Boolean = {
    o match {
      case v: MatrixUDT => true
      case _ => false
    }
  }

  // see [SPARK-8647], this achieves the needed constant hash code without constant no.
  override def hashCode(): Int = classOf[MatrixUDT].getName.hashCode()

  override def typeName: String = "matrix"

  override def pyUDT: String = "pyspark.ml.linalg.MatrixUDT"

  private[spark] override def asNullable: MatrixUDT = this
}
Example 13
Source File: EncodeLongTest.scala From morpheus with Apache License 2.0

package org.opencypher.morpheus.impl.encoders

import org.apache.spark.sql.catalyst.expressions.codegen.GenerateMutableProjection
import org.apache.spark.sql.catalyst.expressions.{Alias, GenericInternalRow}
import org.apache.spark.sql.functions
import org.apache.spark.sql.functions.typedLit
import org.opencypher.morpheus.api.value.MorpheusElement._
import org.opencypher.morpheus.impl.expressions.EncodeLong
import org.opencypher.morpheus.impl.expressions.EncodeLong._
import org.opencypher.morpheus.testing.MorpheusTestSuite
import org.scalatestplus.scalacheck.Checkers

class EncodeLongTest extends MorpheusTestSuite with Checkers {

  it("encodes longs correctly") {
    check((l: Long) => {
      val scala = l.encodeAsMorpheusId.toList
      val spark = typedLit[Long](l).encodeLongAsMorpheusId.expr.eval().asInstanceOf[Array[Byte]].toList
      scala === spark
    }, minSuccessful(1000))
  }

  it("encoding/decoding is symmetric") {
    check((l: Long) => {
      val encoded = l.encodeAsMorpheusId
      val decoded = decodeLong(encoded)
      decoded === l
    }, minSuccessful(1000))
  }

  it("scala version encodes longs correctly") {
    0L.encodeAsMorpheusId.toList should equal(List(0.toByte))
  }

  it("spark version encodes longs correctly") {
    typedLit[Long](0L).encodeLongAsMorpheusId.expr.eval().asInstanceOf[Array[Byte]].array.toList should equal(List(0.toByte))
  }

  describe("Spark expression") {

    it("converts longs into byte arrays using expression interpreter") {
      check((l: Long) => {
        val positive = l & Long.MaxValue
        val inputRow = new GenericInternalRow(Array[Any](positive))
        val encodeLong = EncodeLong(functions.lit(positive).expr)
        val interpreted = encodeLong.eval(inputRow).asInstanceOf[Array[Byte]]
        val decoded = decodeLong(interpreted)
        decoded === positive
      }, minSuccessful(1000))
    }

    it("converts longs into byte arrays using expression code gen") {
      check((l: Long) => {
        val positive = l & Long.MaxValue
        val inputRow = new GenericInternalRow(Array[Any](positive))
        val encodeLong = EncodeLong(functions.lit(positive).expr)
        val plan = GenerateMutableProjection.generate(Alias(encodeLong, s"Optimized($encodeLong)")() :: Nil)
        val codegen = plan(inputRow).get(0, encodeLong.dataType).asInstanceOf[Array[Byte]]
        val decoded = decodeLong(codegen)
        decoded === positive
      }, minSuccessful(1000))
    }
  }
}
Example 14
Source File: VectorUDT.scala From ann4s with Apache License 2.0

package org.apache.spark.ml.nn

import ann4s.{Vector0, Vector16, Vector8, Vector32, Vector64, Vector}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._

class VectorUDT extends UserDefinedType[Vector] {

  override def sqlType: DataType = _sqlType

  override def serialize(obj: Vector): InternalRow = {
    val row = new GenericInternalRow(5)
    row.setNullAt(1)
    row.setNullAt(2)
    row.setNullAt(3)
    row.setNullAt(4)
    obj match {
      case Vector0 =>
        row.setByte(0, 0)
      case Vector8(values, w, b) =>
        row.setByte(0, 1)
        row.update(1, UnsafeArrayData.fromPrimitiveArray(values))
        row.update(3, UnsafeArrayData.fromPrimitiveArray(Array(w, b)))
      case Vector16(values) =>
        row.setByte(0, 2)
        row.update(2, UnsafeArrayData.fromPrimitiveArray(values))
      case Vector32(values) =>
        row.setByte(0, 3)
        row.update(3, UnsafeArrayData.fromPrimitiveArray(values))
      case Vector64(values) =>
        row.setByte(0, 4)
        row.update(4, UnsafeArrayData.fromPrimitiveArray(values))
    }
    row
  }

  override def deserialize(datum: Any): Vector = {
    datum match {
      case row: InternalRow =>
        require(row.numFields == 5,
          s"nn.VectorUDT.deserialize given row with length ${row.numFields} but requires length == 5")
        val tpe = row.getByte(0)
        tpe match {
          case 0 =>
            Vector0
          case 1 =>
            val wb = row.getArray(3).toFloatArray()
            Vector8(row.getArray(1).toByteArray(), wb(0), wb(1))
          case 2 =>
            Vector16(row.getArray(2).toShortArray())
          case 3 =>
            Vector32(row.getArray(3).toFloatArray())
          case 4 =>
            Vector64(row.getArray(4).toDoubleArray())
        }
    }
  }

  override def userClass: Class[Vector] = classOf[Vector]

  override def equals(o: Any): Boolean = {
    o match {
      case _: VectorUDT => true
      case _ => false
    }
  }

  override def hashCode(): Int = classOf[VectorUDT].getName.hashCode

  override def typeName: String = "nn.vector"

  private[spark] override def asNullable: VectorUDT = this

  private[this] val _sqlType = {
    StructType(Seq(
      StructField("type", ByteType, nullable = false),
      StructField("fixed8", ArrayType(ByteType, containsNull = false), nullable = true),
      StructField("fixed16", ArrayType(ShortType, containsNull = false), nullable = true),
      StructField("float32", ArrayType(FloatType, containsNull = false), nullable = true),
      StructField("float64", ArrayType(DoubleType, containsNull = false), nullable = true)))
  }
}

object VectorUDT {
  def register(): Unit = {
    UDTRegistration.register("ann4s.Vector", "org.apache.spark.ml.nn.VectorUDT")
    UDTRegistration.register("ann4s.EmptyVector", "org.apache.spark.ml.nn.VectorUDT")
    UDTRegistration.register("ann4s.Fixed8Vector", "org.apache.spark.ml.nn.VectorUDT")
    UDTRegistration.register("ann4s.Fixed16Vector", "org.apache.spark.ml.nn.VectorUDT")
    UDTRegistration.register("ann4s.Float32Vector", "org.apache.spark.ml.nn.VectorUDT")
    UDTRegistration.register("ann4s.Float64Vector", "org.apache.spark.ml.nn.VectorUDT")
  }
}
Example 15
Source File: SparkUnsafeRowReadSuport.scala From carbondata with Apache License 2.0

package org.apache.spark.sql.carbondata.execution.datasources.readsupport

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.StructType

import org.apache.carbondata.core.metadata.schema.table.CarbonTable
import org.apache.carbondata.core.metadata.schema.table.column.CarbonColumn
import org.apache.carbondata.hadoop.readsupport.CarbonReadSupport

class SparkUnsafeRowReadSuport(requiredSchema: StructType) extends CarbonReadSupport[InternalRow] {
  private val unsafeProjection = UnsafeProjection.create(requiredSchema)

  override def initialize(carbonColumns: Array[CarbonColumn], carbonTable: CarbonTable): Unit = {
  }

  override def readRow(data: Array[AnyRef]): InternalRow = {
    unsafeProjection(new GenericInternalRow(data.asInstanceOf[Array[Any]]))
  }

  override def close(): Unit = {
    // Nothing to close
  }
}
Example 16
Source File: MergeProjection.scala From carbondata with Apache License 2.0

package org.apache.spark.sql.execution.command.mutation.merge

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.{CarbonDatasourceHadoopRelation, Dataset, Row, SparkSession}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression, GenericInternalRow, GenericRowWithSchema, InterpretedMutableProjection, Projection}
import org.apache.spark.sql.catalyst.util.DateTimeUtils

case class MergeProjection(
    @transient tableCols: Seq[String],
    @transient statusCol: String,
    @transient ds: Dataset[Row],
    @transient rltn: CarbonDatasourceHadoopRelation,
    @transient sparkSession: SparkSession,
    @transient mergeAction: MergeAction) {

  private val cutOffDate = Integer.MAX_VALUE >> 1

  val isUpdate = mergeAction.isInstanceOf[UpdateAction]
  val isDelete = mergeAction.isInstanceOf[DeleteAction]

  def apply(row: GenericRowWithSchema): InternalRow = {
    // TODO we can avoid these multiple conversions if this is added as a SparkPlan node.
    val values = row.values.map {
      case s: String => org.apache.spark.unsafe.types.UTF8String.fromString(s)
      case d: java.math.BigDecimal => org.apache.spark.sql.types.Decimal.apply(d)
      case b: Array[Byte] => org.apache.spark.unsafe.types.UTF8String.fromBytes(b)
      case d: Date => DateTimeUtils.fromJavaDate(d)
      case t: Timestamp => DateTimeUtils.fromJavaTimestamp(t)
      case value => value
    }

    projection(new GenericInternalRow(values)).asInstanceOf[GenericInternalRow]
  }

  val (projection, output) = generateProjection

  private def generateProjection: (Projection, Array[Expression]) = {
    val existingDsOutput = rltn.carbonRelation.schema.toAttributes
    val colsMap = mergeAction match {
      case UpdateAction(updateMap) => updateMap
      case InsertAction(insertMap) => insertMap
      case _ => null
    }
    if (colsMap != null) {
      val output = new Array[Expression](tableCols.length)
      val expecOutput = new Array[Expression](tableCols.length)
      colsMap.foreach { case (k, v) =>
        val tableIndex = tableCols.indexOf(k.toString().toLowerCase)
        if (tableIndex < 0) {
          throw new CarbonMergeDataSetException(s"Mapping is wrong $colsMap")
        }
        output(tableIndex) = v.expr.transform {
          case a: Attribute if !a.resolved =>
            ds.queryExecution.analyzed.resolveQuoted(a.name,
              sparkSession.sessionState.analyzer.resolver).get
        }
        expecOutput(tableIndex) =
          existingDsOutput.find(_.name.equalsIgnoreCase(tableCols(tableIndex))).get
      }
      if (output.contains(null)) {
        throw new CarbonMergeDataSetException(s"Not all columns are mapped")
      }
      (new InterpretedMutableProjection(output ++ Seq(
        ds.queryExecution.analyzed.resolveQuoted(statusCol,
          sparkSession.sessionState.analyzer.resolver).get),
        ds.queryExecution.analyzed.output), expecOutput)
    } else {
      (null, null)
    }
  }
}
Example 17
Source File: RowSuite.scala From sparkoscope with Apache License 2.0

package org.apache.spark.sql

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, SpecificInternalRow}
import org.apache.spark.sql.test.SharedSQLContext
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

class RowSuite extends SparkFunSuite with SharedSQLContext {
  import testImplicits._

  test("create row") {
    val expected = new GenericInternalRow(4)
    expected.setInt(0, 2147483647)
    expected.update(1, UTF8String.fromString("this is a string"))
    expected.setBoolean(2, false)
    expected.setNullAt(3)

    val actual1 = Row(2147483647, "this is a string", false, null)
    assert(expected.numFields === actual1.size)
    assert(expected.getInt(0) === actual1.getInt(0))
    assert(expected.getString(1) === actual1.getString(1))
    assert(expected.getBoolean(2) === actual1.getBoolean(2))
    assert(expected.isNullAt(3) === actual1.isNullAt(3))

    val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null))
    assert(expected.numFields === actual2.size)
    assert(expected.getInt(0) === actual2.getInt(0))
    assert(expected.getString(1) === actual2.getString(1))
    assert(expected.getBoolean(2) === actual2.getBoolean(2))
    assert(expected.isNullAt(3) === actual2.isNullAt(3))
  }

  test("SpecificMutableRow.update with null") {
    val row = new SpecificInternalRow(Seq(IntegerType))
    row(0) = null
    assert(row.isNullAt(0))
  }

  test("get values by field name on Row created via .toDF") {
    val row = Seq((1, Seq(1))).toDF("a", "b").first()
    assert(row.getAs[Int]("a") === 1)
    assert(row.getAs[Seq[Int]]("b") === Seq(1))

    intercept[IllegalArgumentException] {
      row.getAs[Int]("c")
    }
  }

  test("float NaN == NaN") {
    val r1 = Row(Float.NaN)
    val r2 = Row(Float.NaN)
    assert(r1 === r2)
  }

  test("double NaN == NaN") {
    val r1 = Row(Double.NaN)
    val r2 = Row(Double.NaN)
    assert(r1 === r2)
  }

  test("equals and hashCode") {
    val r1 = Row("Hello")
    val r2 = Row("Hello")
    assert(r1 === r2)
    assert(r1.hashCode() === r2.hashCode())
    val r3 = Row("World")
    assert(r3.hashCode() != r1.hashCode())
  }
}
Example 18
Source File: ColumnarTestUtils.scala From BigDatalog with Apache License 2.0

package org.apache.spark.sql.execution.columnar

import scala.collection.immutable.HashSet
import scala.util.Random

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, GenericMutableRow}
import org.apache.spark.sql.catalyst.util.{GenericArrayData, ArrayBasedMapData}
import org.apache.spark.sql.types.{AtomicType, Decimal}
import org.apache.spark.unsafe.types.UTF8String

object ColumnarTestUtils {
  def makeNullRow(length: Int): GenericMutableRow = {
    val row = new GenericMutableRow(length)
    (0 until length).foreach(row.setNullAt)
    row
  }

  def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = {
    def randomBytes(length: Int) = {
      val bytes = new Array[Byte](length)
      Random.nextBytes(bytes)
      bytes
    }

    (columnType match {
      case NULL => null
      case BOOLEAN => Random.nextBoolean()
      case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte
      case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort
      case INT => Random.nextInt()
      case LONG => Random.nextLong()
      case FLOAT => Random.nextFloat()
      case DOUBLE => Random.nextDouble()
      case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32)))
      case BINARY => randomBytes(Random.nextInt(32))
      case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale)
      case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale)
      case STRUCT(_) =>
        new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10))))
      case ARRAY(_) =>
        new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt()))
      case MAP(_) =>
        ArrayBasedMapData(
          Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32)))))
    }).asInstanceOf[JvmType]
  }

  def makeRandomValues(
      head: ColumnType[_],
      tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail)

  def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = {
    columnTypes.map(makeRandomValue(_))
  }

  def makeUniqueRandomValues[JvmType](
      columnType: ColumnType[JvmType],
      count: Int): Seq[JvmType] = {
    Iterator.iterate(HashSet.empty[JvmType]) { set =>
      set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next()
    }.drop(count).next().toSeq
  }

  def makeRandomRow(
      head: ColumnType[_],
      tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail)

  def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = {
    val row = new GenericMutableRow(columnTypes.length)
    makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) =>
      row(index) = value
    }
    row
  }

  def makeUniqueValuesAndSingleValueRows[T <: AtomicType](
      columnType: NativeColumnType[T],
      count: Int): (Seq[T#InternalType], Seq[GenericMutableRow]) = {
    val values = makeUniqueRandomValues(columnType, count)
    val rows = values.map { value =>
      val row = new GenericMutableRow(1)
      row(0) = value
      row
    }
    (values, rows)
  }
}
Example 19
Source File: PgWireProtocolSuite.scala From spark-sql-server with Apache License 2.0

package org.apache.spark.sql.server.service.postgresql.protocol.v3

import java.nio.ByteBuffer
import java.nio.charset.StandardCharsets
import java.sql.SQLException

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.internal.SQLConf
import org.apache.spark.sql.types.StructType
import org.apache.spark.unsafe.types.UTF8String

class PgWireProtocolSuite extends SparkFunSuite {

  val conf = new SQLConf()

  test("DataRow") {
    val v3Protocol = new PgWireProtocol(65536)
    val row = new GenericInternalRow(2)
    row.update(0, 8)
    row.update(1, UTF8String.fromString("abcdefghij"))
    val schema = StructType.fromDDL("a INT, b STRING")
    val rowConverters = PgRowConverters(conf, schema, Seq(true, false))
    val data = v3Protocol.DataRow(row, rowConverters)
    val bytes = ByteBuffer.wrap(data)
    assert(bytes.get() === 'D'.toByte)
    assert(bytes.getInt === 28)
    assert(bytes.getShort === 2)
    assert(bytes.getInt === 4)
    assert(bytes.getInt === 8)
    assert(bytes.getInt === 10)
    assert(data.slice(19, 30) === "abcdefghij".getBytes(StandardCharsets.UTF_8))
  }

  test("Fails when message buffer overflowed") {
    val v3Protocol = new PgWireProtocol(4)
    val row = new GenericInternalRow(1)
    row.update(0, UTF8String.fromString("abcdefghijk"))
    val schema = StructType.fromDDL("a STRING")
    val rowConverters = PgRowConverters(conf, schema, Seq(false))
    val errMsg = intercept[SQLException] {
      v3Protocol.DataRow(row, rowConverters)
    }.getMessage
    assert(errMsg.contains(
      "Cannot generate a V3 protocol message because buffer is not enough for the message. " +
        "To avoid this exception, you might set higher value at " +
        "'spark.sql.server.messageBufferSizeInBytes'")
    )
  }
}
Example 20
Source File: RowConverter.scala From glow with Apache License 2.0

package io.projectglow.sql.util

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.StructType

class RowConverter[T](schema: StructType, fieldConverters: Array[RowConverter.Updater[T]]) {

  def apply(record: T): InternalRow = {
    val nullRow = new GenericInternalRow(schema.length)
    apply(record, nullRow)
  }

  // WARNING: this will modify priorRow that is passed in
  def apply(record: T, priorRow: InternalRow): InternalRow = {
    var i = 0
    while (i < schema.length) {
      fieldConverters(i)(record, priorRow, i)
      i += 1
    }
    priorRow
  }
}

object RowConverter {
  type Updater[T] = (T, InternalRow, Int) => Unit
}
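RowConverter is generic in the record type. As a hedged illustration (the Sample case class, the field updaters, and the object name below are hypothetical and not part of Glow), wiring it up for a two-field record might look like this, with one updater per schema field writing into the matching slot of a GenericInternalRow:

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String
import io.projectglow.sql.util.RowConverter

object RowConverterExample {
  case class Sample(id: Int, name: String)

  val schema = StructType(Seq(
    StructField("id", IntegerType),
    StructField("name", StringType)))

  // One updater per schema field; each writes its value into the target row slot.
  val updaters: Array[RowConverter.Updater[Sample]] = Array(
    (s, row, i) => row.setInt(i, s.id),
    (s, row, i) => row.update(i, UTF8String.fromString(s.name)))

  val converter = new RowConverter[Sample](schema, updaters)

  def main(args: Array[String]): Unit = {
    val internalRow: InternalRow = converter(Sample(1, "alice"))
    println(internalRow.getInt(0) + " " + internalRow.getUTF8String(1))
  }
}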
Example 21
Source File: MomentAggState.scala From glow with Apache License 2.0

package io.projectglow.sql.expressions

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{DoubleType, StructField, StructType}

import io.projectglow.common.GlowLogging

// Note: this excerpt starts inside the MomentAggState class; its declaration
// (which defines count, mean, m2, min and max) is not included in the snippet.
  def toInternalRow(row: InternalRow, offset: Int = 0): InternalRow = {
    row.update(offset, if (count > 0) mean else null)
    row.update(offset + 1, if (count > 0) Math.sqrt(m2 / (count - 1)) else null)
    row.update(offset + 2, if (count > 0) min else null)
    row.update(offset + 3, if (count > 0) max else null)
    row
  }

  def toInternalRow: InternalRow = {
    toInternalRow(new GenericInternalRow(4))
  }
}

object MomentAggState extends GlowLogging {
  val schema = StructType(
    Seq(
      StructField("mean", DoubleType),
      StructField("stdDev", DoubleType),
      StructField("min", DoubleType),
      StructField("max", DoubleType)
    )
  )

  def merge(s1: MomentAggState, s2: MomentAggState): MomentAggState = {
    if (s1.count == 0) {
      return s2
    } else if (s2.count == 0) {
      return s1
    }

    val newState = MomentAggState()
    newState.count = s1.count + s2.count
    val delta = s2.mean - s1.mean
    val deltaN = delta / newState.count
    newState.mean = s1.mean + deltaN * s2.count

    // higher order moments computed according to:
    // https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Higher-order_statistics
    newState.m2 = s1.m2 + s2.m2 + delta * deltaN * s1.count * s2.count

    newState.min = Math.min(s1.min, s2.min)
    newState.max = Math.max(s1.max, s2.max)
    newState
  }
}
Example 22
Source File: UTF8TextOutputFormatter.scala From glow with Apache License 2.0

package io.projectglow.transformers.pipe

import java.io.InputStream

import scala.collection.JavaConverters._

import org.apache.commons.io.IOUtils
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

class UTF8TextOutputFormatter() extends OutputFormatter {

  override def makeIterator(stream: InputStream): Iterator[Any] = {
    val schema = StructType(Seq(StructField("text", StringType)))
    val iter = IOUtils.lineIterator(stream, "UTF-8").asScala.map { s =>
      new GenericInternalRow(Array(UTF8String.fromString(s)): Array[Any])
    }
    Iterator(schema) ++ iter
  }
}

class UTF8TextOutputFormatterFactory extends OutputFormatterFactory {
  override def name: String = "text"

  override def makeOutputFormatter(options: Map[String, String]): OutputFormatter = {
    new UTF8TextOutputFormatter
  }
}
Example 23
Source File: VectorUDT.scala From sona with Apache License 2.0

package org.apache.spark.linalg

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._

/**
 * User-defined type for [[Vector]] which allows easy interaction with SQL
 * via [[org.apache.spark.sql.Dataset]].
 */
class VectorUDT extends UserDefinedType[Vector] {

  override final def sqlType: StructType = {
    // type: 0 = int_sparse, 1 = dense, 2 = long_sparse
    // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse
    // vectors. The "values" field is nullable because we might want to add binary vectors later,
    // which uses "size" and "indices", but not "values".
    StructType(Seq(
      StructField("type", ByteType, nullable = false),
      StructField("size", LongType, nullable = true),
      StructField("intIndices", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("longIndices", ArrayType(LongType, containsNull = false), nullable = true),
      StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true)))
  }

  override def serialize(obj: Vector): InternalRow = {
    obj match {
      case IntSparseVector(size, indices, values) =>
        val row = new GenericInternalRow(5)
        row.setByte(0, 0)
        row.setLong(1, size)
        row.update(2, UnsafeArrayData.fromPrimitiveArray(indices))
        row.setNullAt(3)
        row.update(4, UnsafeArrayData.fromPrimitiveArray(values))
        row
      case DenseVector(values) =>
        val row = new GenericInternalRow(5)
        row.setByte(0, 1)
        row.setNullAt(1)
        row.setNullAt(2)
        row.setNullAt(3)
        row.update(4, UnsafeArrayData.fromPrimitiveArray(values))
        row
      case LongSparseVector(size, indices, values) =>
        val row = new GenericInternalRow(5)
        row.setByte(0, 2)
        row.setLong(1, size)
        row.setNullAt(2)
        row.update(3, UnsafeArrayData.fromPrimitiveArray(indices))
        row.update(4, UnsafeArrayData.fromPrimitiveArray(values))
        row
    }
  }

  override def deserialize(datum: Any): Vector = {
    datum match {
      case row: InternalRow =>
        require(row.numFields == 5,
          s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4")
        val tpe = row.getByte(0)
        tpe match {
          case 0 =>
            val size = row.getLong(1)
            val indices = row.getArray(2).toIntArray()
            val values = row.getArray(4).toDoubleArray()
            new IntSparseVector(size, indices, values)
          case 1 =>
            val values = row.getArray(4).toDoubleArray()
            new DenseVector(values)
          case 2 =>
            val size = row.getLong(1)
            val indices = row.getArray(3).toLongArray()
            val values = row.getArray(4).toDoubleArray()
            new LongSparseVector(size, indices, values)
        }
    }
  }

  override def pyUDT: String = "pyspark.ml.linalg.VectorUDT"

  override def userClass: Class[Vector] = classOf[Vector]

  override def equals(o: Any): Boolean = {
    o match {
      case v: VectorUDT => true
      case _ => false
    }
  }

  // see [SPARK-8647], this achieves the needed constant hash code without constant no.
  override def hashCode(): Int = classOf[VectorUDT].getName.hashCode()

  override def typeName: String = "vector"

  override def asNullable: VectorUDT = this
}
Example 24
Source File: MatrixUDT.scala From sona with Apache License 2.0

package org.apache.spark.linalg

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData}
import org.apache.spark.sql.types._

/**
 * User-defined type for [[Matrix]] which allows easy interaction with SQL
 * via [[org.apache.spark.sql.Dataset]].
 */
class MatrixUDT extends UserDefinedType[Matrix] {

  override def sqlType: StructType = {
    // type: 0 = sparse, 1 = dense
    // the dense matrix is built by numRows, numCols, values and isTransposed, all of which are
    // set as not nullable, except values since in the future, support for binary matrices might
    // be added for which values are not needed.
    // the sparse matrix needs colPtrs and rowIndices, which are set as
    // null, while building the dense matrix.
    StructType(Seq(
      StructField("type", ByteType, nullable = false),
      StructField("numRows", IntegerType, nullable = false),
      StructField("numCols", IntegerType, nullable = false),
      StructField("colPtrs", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("rowIndices", ArrayType(IntegerType, containsNull = false), nullable = true),
      StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true),
      StructField("isTransposed", BooleanType, nullable = false)
    ))
  }

  override def serialize(obj: Matrix): InternalRow = {
    val row = new GenericInternalRow(7)
    obj match {
      case sm: SparseMatrix =>
        row.setByte(0, 0)
        row.setInt(1, sm.numRows)
        row.setInt(2, sm.numCols)
        row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs))
        row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices))
        row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values))
        row.setBoolean(6, sm.isTransposed)
      case dm: DenseMatrix =>
        row.setByte(0, 1)
        row.setInt(1, dm.numRows)
        row.setInt(2, dm.numCols)
        row.setNullAt(3)
        row.setNullAt(4)
        row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values))
        row.setBoolean(6, dm.isTransposed)
    }
    row
  }

  override def deserialize(datum: Any): Matrix = {
    datum match {
      case row: InternalRow =>
        require(row.numFields == 7,
          s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7")
        val tpe = row.getByte(0)
        val numRows = row.getInt(1)
        val numCols = row.getInt(2)
        val values = row.getArray(5).toDoubleArray()
        val isTransposed = row.getBoolean(6)
        tpe match {
          case 0 =>
            val colPtrs = row.getArray(3).toIntArray()
            val rowIndices = row.getArray(4).toIntArray()
            new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed)
          case 1 =>
            new DenseMatrix(numRows, numCols, values, isTransposed)
        }
    }
  }

  override def userClass: Class[Matrix] = classOf[Matrix]

  override def equals(o: Any): Boolean = {
    o match {
      case v: MatrixUDT => true
      case _ => false
    }
  }

  // see [SPARK-8647], this achieves the needed constant hash code without constant no.
  override def hashCode(): Int = classOf[MatrixUDT].getName.hashCode()

  override def typeName: String = "matrix"

  override def pyUDT: String = "pyspark.ml.linalg.MatrixUDT"

  override def asNullable: MatrixUDT = this
}
Example 25
Source File: JoinOptimizerChromosome.scala From bdg-sequila with Apache License 2.0 | 5 votes |
package org.biodatageeks.sequila.rangejoins.optimizer

import jdk.nashorn.internal.ir.debug.ObjectSizeCalculator
import org.apache.log4j.Logger
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.util.SizeEstimator
import org.biodatageeks.sequila.rangejoins.IntervalTree.{Interval, IntervalWithRow}
import org.biodatageeks.sequila.rangejoins.optimizer.RangeJoinMethod.RangeJoinMethod

class JoinOptimizerChromosome(spark: SparkSession, rdd: RDD[(String, Interval[Int], InternalRow)], rddCount: Long) {

  val logger = Logger.getLogger(this.getClass.getCanonicalName)

  val maxBroadcastSize = spark.sqlContext
    .getConf("spark.biodatageeks.rangejoin.maxBroadcastSize", "0") match {
      case "0" =>
        // defaults to 128 MB or 0.1 * the Spark driver's memory
        0.1 * scala.math.max(
          spark.sparkContext.getConf.getSizeAsBytes("spark.driver.memory", "0"),
          1024 * (1024 * 1024))
      case _ => spark.sqlContext.getConf("spark.biodatageeks.rangejoin.maxBroadcastSize").toLong
    }

  val estBroadcastSize = estimateBroadcastSize(rdd, rddCount)

  private def estimateBroadcastSize(rdd: RDD[(String, Interval[Int], InternalRow)], rddCount: Long): Long = {
    try {
      (ObjectSizeCalculator.getObjectSize(rdd.first()) * rddCount) / 10
    } catch {
      case e @ (_: NoClassDefFoundError | _: ExceptionInInitializerError) =>
        logger.warn("Method ObjectSizeCalculator.getObjectSize not available, falling back to Spark methods")
        SizeEstimator.estimate(rdd.first()) * rddCount
    }
    // FIXME: It is unclear why the estimate is ~10x the actual size: Spark's row representation,
    // or ObjectSizeCalculator reporting the size in bits?
  }

  def debugInfo = {
    s"""
       |Broadcast structure size is ~ ${math.rint(100 * estBroadcastSize / 1024.0) / 100} kb
       |spark.biodatageeks.rangejoin.maxBroadcastSize is set to ${(maxBroadcastSize / 1024).toInt} kb
       |Using ${getRangeJoinMethod.toString} join method
     """.stripMargin
  }

  private def estimateRDDSizeSpark(rdd: RDD[(String, Interval[Int], InternalRow)]): Long = {
    math.round(SizeEstimator.estimate(rdd) / 1024.0)
  }

  def getRangeJoinMethod: RangeJoinMethod = {
    if (estimateBroadcastSize(rdd, rddCount) <= maxBroadcastSize)
      RangeJoinMethod.JoinWithRowBroadcast
    else
      RangeJoinMethod.TwoPhaseJoin
  }
}
Example 26
Source File: ExtraStrategiesSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package test.org.apache.spark.sql import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{Literal, GenericInternalRow, Attribute} import org.apache.spark.sql.catalyst.plans.logical.{Project, LogicalPlan} import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.{Row, Strategy, QueryTest} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.unsafe.types.UTF8String case class FastOperator(output: Seq[Attribute]) extends SparkPlan { override protected def doExecute(): RDD[InternalRow] = { val str = Literal("so fast").value val row = new GenericInternalRow(Array[Any](str)) sparkContext.parallelize(Seq(row)) } override def children: Seq[SparkPlan] = Nil } object TestStrategy extends Strategy { def apply(plan: LogicalPlan): Seq[SparkPlan] = plan match { case Project(Seq(attr), _) if attr.name == "a" => FastOperator(attr.toAttribute :: Nil) :: Nil case _ => Nil } } class ExtraStrategiesSuite extends QueryTest with SharedSQLContext { import testImplicits._ test("insert an extraStrategy") { try { sqlContext.experimental.extraStrategies = TestStrategy :: Nil val df = sparkContext.parallelize(Seq(("so slow", 1))).toDF("a", "b") checkAnswer( df.select("a"), Row("so fast")) checkAnswer( df.select("a", "b"), Row("so slow", 1)) } finally { sqlContext.experimental.extraStrategies = Nil } } }
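The FastOperator above hinges on one detail worth calling out: GenericInternalRow stores Catalyst internal types, so string values must be wrapped as UTF8String rather than java.lang.String. A tiny illustrative sketch, suitable for a Spark shell:

import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.unsafe.types.UTF8String

val row = new GenericInternalRow(Array[Any](UTF8String.fromString("so fast")))
assert(row.getUTF8String(0).toString == "so fast")
// Storing a plain java.lang.String here would compile, but would fail later with a
// ClassCastException once Spark reads the field as UTF8String.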
Example 27
Source File: ColumnarTestUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
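ColumnarTestUtils fills rows generically through GenericInternalRow's update sugar (row(i) = value). A short sketch of that idiom with hypothetical values, as one might try in a Spark shell:

import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.unsafe.types.UTF8String

val row = new GenericInternalRow(3)
row(0) = 42                                // row(i) = v desugars to row.update(i, v)
row(1) = UTF8String.fromString("spark")    // internal string representation
row.setNullAt(2)                           // nulls are set explicitly per field
assert(row.getInt(0) == 42 && row.getUTF8String(1).toString == "spark" && row.isNullAt(2))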
Example 28
Source File: ColumnStatsSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.expressions.GenericInternalRow
import org.apache.spark.sql.types._

class ColumnStatsSuite extends SparkFunSuite {
  testColumnStats(classOf[BooleanColumnStats], BOOLEAN, createRow(true, false, 0))
  testColumnStats(classOf[ByteColumnStats], BYTE, createRow(Byte.MaxValue, Byte.MinValue, 0))
  testColumnStats(classOf[ShortColumnStats], SHORT, createRow(Short.MaxValue, Short.MinValue, 0))
  testColumnStats(classOf[IntColumnStats], INT, createRow(Int.MaxValue, Int.MinValue, 0))
  testColumnStats(classOf[LongColumnStats], LONG, createRow(Long.MaxValue, Long.MinValue, 0))
  testColumnStats(classOf[FloatColumnStats], FLOAT, createRow(Float.MaxValue, Float.MinValue, 0))
  testColumnStats(classOf[DoubleColumnStats], DOUBLE, createRow(Double.MaxValue, Double.MinValue, 0))
  testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0))
  testDecimalColumnStats(createRow(null, null, 0))

  def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray)

  def testColumnStats[T <: AtomicType, U <: ColumnStats](
      columnStatsClass: Class[U],
      columnType: NativeColumnType[T],
      initialStatistics: GenericInternalRow): Unit = {
    val columnStatsName = columnStatsClass.getSimpleName

    test(s"$columnStatsName: empty") {
      val columnStats = columnStatsClass.newInstance()
      columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach {
        case (actual, expected) => assert(actual === expected)
      }
    }

    test(s"$columnStatsName: non-empty") {
      import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._
      val columnStats = columnStatsClass.newInstance()
      val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
      rows.foreach(columnStats.gatherStats(_, 0))

      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
      val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
      val stats = columnStats.collectedStatistics

      assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0))
      assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1))
      assertResult(10, "Wrong null count")(stats.values(2))
      assertResult(20, "Wrong row count")(stats.values(3))
      assertResult(stats.values(4), "Wrong size in bytes") {
        rows.map { row =>
          if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
        }.sum
      }
    }
  }

  def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats](
      initialStatistics: GenericInternalRow): Unit = {
    val columnStatsName = classOf[DecimalColumnStats].getSimpleName
    val columnType = COMPACT_DECIMAL(15, 10)

    test(s"$columnStatsName: empty") {
      val columnStats = new DecimalColumnStats(15, 10)
      columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach {
        case (actual, expected) => assert(actual === expected)
      }
    }

    test(s"$columnStatsName: non-empty") {
      import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._
      val columnStats = new DecimalColumnStats(15, 10)
      val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1))
      rows.foreach(columnStats.gatherStats(_, 0))

      val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType])
      val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]]
      val stats = columnStats.collectedStatistics

      assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0))
      assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1))
      assertResult(10, "Wrong null count")(stats.values(2))
      assertResult(20, "Wrong row count")(stats.values(3))
      assertResult(stats.values(4), "Wrong size in bytes") {
        rows.map { row =>
          if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0)
        }.sum
      }
    }
  }
}
Example 29
Source File: RowSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, SpecificInternalRow} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String class RowSuite extends SparkFunSuite with SharedSQLContext { import testImplicits._ test("create row") { val expected = new GenericInternalRow(4) expected.setInt(0, 2147483647) expected.update(1, UTF8String.fromString("this is a string")) expected.setBoolean(2, false) expected.setNullAt(3) val actual1 = Row(2147483647, "this is a string", false, null) assert(expected.numFields === actual1.size) assert(expected.getInt(0) === actual1.getInt(0)) assert(expected.getString(1) === actual1.getString(1)) assert(expected.getBoolean(2) === actual1.getBoolean(2)) assert(expected.isNullAt(3) === actual1.isNullAt(3)) val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null)) assert(expected.numFields === actual2.size) assert(expected.getInt(0) === actual2.getInt(0)) assert(expected.getString(1) === actual2.getString(1)) assert(expected.getBoolean(2) === actual2.getBoolean(2)) assert(expected.isNullAt(3) === actual2.isNullAt(3)) } test("SpecificMutableRow.update with null") { val row = new SpecificInternalRow(Seq(IntegerType)) row(0) = null assert(row.isNullAt(0)) } test("get values by field name on Row created via .toDF") { val row = Seq((1, Seq(1))).toDF("a", "b").first() assert(row.getAs[Int]("a") === 1) assert(row.getAs[Seq[Int]]("b") === Seq(1)) intercept[IllegalArgumentException]{ row.getAs[Int]("c") } } test("float NaN == NaN") { val r1 = Row(Float.NaN) val r2 = Row(Float.NaN) assert(r1 === r2) } test("double NaN == NaN") { val r1 = Row(Double.NaN) val r2 = Row(Double.NaN) assert(r1 === r2) } test("equals and hashCode") { val r1 = Row("Hello") val r2 = Row("Hello") assert(r1 === r2) assert(r1.hashCode() === r2.hashCode()) val r3 = Row("World") assert(r3.hashCode() != r1.hashCode()) } }
Example 30
Source File: ColumnarTestUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 31
Source File: NullableColumnAccessorSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 32
Source File: NullableColumnBuilderSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 33
Source File: FailureSafeParser.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String class FailureSafeParser[IN]( rawParser: IN => Seq[InternalRow], mode: ParseMode, schema: StructType, columnNameOfCorruptRecord: String) { private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) private val resultRow = new GenericInternalRow(schema.length) private val nullResult = new GenericInternalRow(schema.length) // This function takes 2 parameters: an optional partial result, and the bad record. If the given // schema doesn't contain a field for corrupted record, we just return the partial result or a // row with all fields null. If the given schema contains a field for corrupted record, we will // set the bad record to this field, and set other fields according to the partial result or null. private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { if (corruptFieldIndex.isDefined) { (row, badRecord) => { var i = 0 while (i < actualSchema.length) { val from = actualSchema(i) resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull i += 1 } resultRow(corruptFieldIndex.get) = badRecord() resultRow } } else { (row, _) => row.getOrElse(nullResult) } } def parse(input: IN): Iterator[InternalRow] = { try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } catch { case e: BadRecordException => mode match { case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) case DropMalformedMode => Iterator.empty case FailFastMode => throw new SparkException("Malformed records are detected in record parsing. " + s"Parse Mode: ${FailFastMode.name}.", e.cause) } } } }
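A hedged usage sketch for the parser above; the toy rawParser, the schema, and the column names are illustrative assumptions, not part of the Spark source.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.util.{BadRecordException, PermissiveMode}
import org.apache.spark.sql.execution.datasources.FailureSafeParser
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

val schema = StructType(Seq(
  StructField("value", IntegerType),
  StructField("_corrupt_record", StringType)))

def toyParser(s: String): Seq[InternalRow] =
  try Seq(InternalRow(s.trim.toInt))
  catch {
    case e: NumberFormatException =>
      // Wrap the failure so FailureSafeParser can apply the configured ParseMode.
      throw BadRecordException(() => UTF8String.fromString(s), () => None, e)
  }

val parser = new FailureSafeParser[String](toyParser, PermissiveMode, schema, "_corrupt_record")
println(parser.parse("42").next())    // roughly [42,null]
println(parser.parse("oops").next())  // roughly [null,oops]; the bad record lands in _corrupt_record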
Example 34
Source File: ComplexDataSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.util

import scala.collection._

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{BoundReference, GenericInternalRow, SpecificInternalRow, UnsafeMapData, UnsafeProjection}
import org.apache.spark.sql.catalyst.expressions.codegen.GenerateUnsafeProjection
import org.apache.spark.sql.types.{DataType, IntegerType, MapType, StringType}
import org.apache.spark.unsafe.types.UTF8String

class ComplexDataSuite extends SparkFunSuite {
  def utf8(str: String): UTF8String = UTF8String.fromString(str)

  test("inequality tests for MapData") {
    // test data
    val testMap1 = Map(utf8("key1") -> 1)
    val testMap2 = Map(utf8("key1") -> 1, utf8("key2") -> 2)
    val testMap3 = Map(utf8("key1") -> 1)
    val testMap4 = Map(utf8("key1") -> 1, utf8("key2") -> 2)

    // ArrayBasedMapData
    val testArrayMap1 = ArrayBasedMapData(testMap1.toMap)
    val testArrayMap2 = ArrayBasedMapData(testMap2.toMap)
    val testArrayMap3 = ArrayBasedMapData(testMap3.toMap)
    val testArrayMap4 = ArrayBasedMapData(testMap4.toMap)
    assert(testArrayMap1 !== testArrayMap3)
    assert(testArrayMap2 !== testArrayMap4)

    // UnsafeMapData
    val unsafeConverter = UnsafeProjection.create(Array[DataType](MapType(StringType, IntegerType)))
    val row = new GenericInternalRow(1)
    def toUnsafeMap(map: ArrayBasedMapData): UnsafeMapData = {
      row.update(0, map)
      val unsafeRow = unsafeConverter.apply(row)
      unsafeRow.getMap(0).copy
    }
    assert(toUnsafeMap(testArrayMap1) !== toUnsafeMap(testArrayMap3))
    assert(toUnsafeMap(testArrayMap2) !== toUnsafeMap(testArrayMap4))
  }

  test("GenericInternalRow.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val genericRow = new GenericInternalRow(Array[Any](unsafeRow.getUTF8String(0)))
    val copiedGenericRow = genericRow.copy()
    assert(copiedGenericRow.getString(0) == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied internal row should not be changed externally.
    assert(copiedGenericRow.getString(0) == "a")
  }

  test("SpecificMutableRow.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val mutableRow = new SpecificInternalRow(Seq(StringType))
    mutableRow(0) = unsafeRow.getUTF8String(0)
    val copiedMutableRow = mutableRow.copy()
    assert(copiedMutableRow.getString(0) == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied internal row should not be changed externally.
    assert(copiedMutableRow.getString(0) == "a")
  }

  test("GenericArrayData.copy return a new instance that is independent from the old one") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val genericArray = new GenericArrayData(Array[Any](unsafeRow.getUTF8String(0)))
    val copiedGenericArray = genericArray.copy()
    assert(copiedGenericArray.getUTF8String(0).toString == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied array data should not be changed externally.
    assert(copiedGenericArray.getUTF8String(0).toString == "a")
  }

  test("copy on nested complex type") {
    val project = GenerateUnsafeProjection.generate(Seq(BoundReference(0, StringType, true)))
    val unsafeRow = project.apply(InternalRow(utf8("a")))
    val arrayOfRow = new GenericArrayData(Array[Any](InternalRow(unsafeRow.getUTF8String(0))))
    val copied = arrayOfRow.copy()
    assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a")
    project.apply(InternalRow(UTF8String.fromString("b")))
    // The copied data should not be changed externally.
    assert(copied.getStruct(0, 1).getUTF8String(0).toString == "a")
  }
}
Example 35
Source File: CovarianceSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.{ BaseSummarizerFactory, ColumnList, SummarizerFactory } import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types.{ DoubleType, StructType } case class CovarianceSummarizerFactory(columnX: String, columnY: String) extends BaseSummarizerFactory(columnX, columnY) { override def apply(inputSchema: StructType): CovarianceSummarizer = new CovarianceSummarizer(inputSchema, prefixOpt, requiredColumns) } class CovarianceSummarizer( override val inputSchema: StructType, override val prefixOpt: Option[String], override val requiredColumns: ColumnList ) extends AbstractCorrelationSummarizer(inputSchema, prefixOpt, requiredColumns) { override val schema = Schema.of( s"${columnPrefix}_covariance" -> DoubleType ) override def fromV(v: V): GenericInternalRow = new GenericInternalRow(Array[Any](v.covariance)) }
Example 36
Source File: MatrixUDT.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // the dense matrix is built by numRows, numCols, values and isTransposed, all of which are // set as not nullable, except values since in the future, support for binary matrices might // be added for which values are not needed. // the sparse matrix needs colPtrs and rowIndices, which are set as // null, while building the dense matrix. StructType(Seq( StructField("type", ByteType, nullable = false), StructField("numRows", IntegerType, nullable = false), StructField("numCols", IntegerType, nullable = false), StructField("colPtrs", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("rowIndices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("isTransposed", BooleanType, nullable = false) )) } override def serialize(obj: Matrix): InternalRow = { val row = new GenericInternalRow(7) obj match { case sm: SparseMatrix => row.setByte(0, 0) row.setInt(1, sm.numRows) row.setInt(2, sm.numCols) row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs)) row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices)) row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values)) row.setBoolean(6, sm.isTransposed) case dm: DenseMatrix => row.setByte(0, 1) row.setInt(1, dm.numRows) row.setInt(2, dm.numCols) row.setNullAt(3) row.setNullAt(4) row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values)) row.setBoolean(6, dm.isTransposed) } row } override def deserialize(datum: Any): Matrix = { datum match { case row: InternalRow => require(row.numFields == 7, s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7") val tpe = row.getByte(0) val numRows = row.getInt(1) val numCols = row.getInt(2) val values = row.getArray(5).toDoubleArray() val isTransposed = row.getBoolean(6) tpe match { case 0 => val colPtrs = row.getArray(3).toIntArray() val rowIndices = row.getArray(4).toIntArray() new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) case 1 => new DenseMatrix(numRows, numCols, values, isTransposed) } } } override def userClass: Class[Matrix] = classOf[Matrix] override def equals(o: Any): Boolean = { o match { case v: MatrixUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[MatrixUDT].getName.hashCode() override def typeName: String = "matrix" override def pyUDT: String = "pyspark.ml.linalg.MatrixUDT" private[spark] override def asNullable: MatrixUDT = this }
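Because this MatrixUDT is private[spark], a round-trip check has to live inside the org.apache.spark package tree; below is a minimal sketch under that assumption, using the standard Matrices factory. The object name is illustrative only.

package org.apache.spark.ml.linalg

object MatrixUdtRoundTripSketch {
  def main(args: Array[String]): Unit = {
    val udt = new MatrixUDT
    val dense = Matrices.dense(2, 2, Array(1.0, 2.0, 3.0, 4.0))
    val row = udt.serialize(dense)
    // Dense matrices get type tag 1 and leave the sparse-only colPtrs/rowIndices slots null.
    assert(row.getByte(0) == 1 && row.isNullAt(3) && row.isNullAt(4))
    assert(udt.deserialize(row).toArray.sameElements(dense.toArray))
  }
}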
Example 37
Source File: FailureSafeParser.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.datasources import org.apache.spark.SparkException import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util._ import org.apache.spark.sql.internal.SQLConf import org.apache.spark.sql.types.StructType import org.apache.spark.unsafe.types.UTF8String class FailureSafeParser[IN]( rawParser: IN => Seq[InternalRow], mode: ParseMode, schema: StructType, columnNameOfCorruptRecord: String) { private val corruptFieldIndex = schema.getFieldIndex(columnNameOfCorruptRecord) private val actualSchema = StructType(schema.filterNot(_.name == columnNameOfCorruptRecord)) private val resultRow = new GenericInternalRow(schema.length) private val nullResult = new GenericInternalRow(schema.length) // This function takes 2 parameters: an optional partial result, and the bad record. If the given // schema doesn't contain a field for corrupted record, we just return the partial result or a // row with all fields null. If the given schema contains a field for corrupted record, we will // set the bad record to this field, and set other fields according to the partial result or null. private val toResultRow: (Option[InternalRow], () => UTF8String) => InternalRow = { if (corruptFieldIndex.isDefined) { (row, badRecord) => { var i = 0 while (i < actualSchema.length) { val from = actualSchema(i) resultRow(schema.fieldIndex(from.name)) = row.map(_.get(i, from.dataType)).orNull i += 1 } resultRow(corruptFieldIndex.get) = badRecord() resultRow } } else { (row, _) => row.getOrElse(nullResult) } } def parse(input: IN): Iterator[InternalRow] = { try { rawParser.apply(input).toIterator.map(row => toResultRow(Some(row), () => null)) } catch { case e: BadRecordException => mode match { case PermissiveMode => Iterator(toResultRow(e.partialResult(), e.record)) case DropMalformedMode => Iterator.empty case FailFastMode => throw new SparkException("Malformed records are detected in record parsing. " + s"Parse Mode: ${FailFastMode.name}.", e.cause) } } } }
Example 38
Source File: ArrowSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd.function.summarize.summarizer import java.io.ByteArrayOutputStream import java.nio.channels.Channels import java.util import com.twosigma.flint.arrow.{ ArrowFieldWriter, ArrowPayload, ArrowUtils, ArrowWriter } import org.apache.arrow.memory.{ BufferAllocator, RootAllocator } import org.apache.arrow.vector.VectorSchemaRoot import org.apache.arrow.vector.ipc.ArrowFileWriter import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types.StructType import scala.collection.JavaConverters._ case class ArrowSummarizer(inputSchema: StructType, outputSchema: StructType, includeBaseRows: Boolean) extends Summarizer[InternalRow, ArrowSummarizerState, ArrowSummarizerResult] { private[this] val size = outputSchema.size require(size > 0, "Cannot create summarizer with no input columns") // This function will allocate memory from the BufferAllocator to initialize arrow vectors. override def zero(): ArrowSummarizerState = { new ArrowSummarizerState(false, null, null, null, null) } private def init(u: ArrowSummarizerState): Unit = { if (!u.initialized) { val arrowSchema = ArrowUtils.toArrowSchema(outputSchema) val allocator = new RootAllocator(Int.MaxValue) val root = VectorSchemaRoot.create(arrowSchema, allocator) val arrowWriter = ArrowWriter.create(inputSchema, outputSchema, root) u.initialized = true u.baseRows = new util.ArrayList[InternalRow]() u.allocator = allocator u.root = root u.arrowWriter = arrowWriter } } override def add(u: ArrowSummarizerState, row: InternalRow): ArrowSummarizerState = { if (!u.initialized) { init(u) } if (includeBaseRows) { u.baseRows.add(row) } u.arrowWriter.write(row) u } override def merge( u1: ArrowSummarizerState, u2: ArrowSummarizerState ): ArrowSummarizerState = throw new UnsupportedOperationException() // This can only be called once override def render(u: ArrowSummarizerState): ArrowSummarizerResult = { if (u.initialized) { val out = new ByteArrayOutputStream() val writer = new ArrowFileWriter(u.root, null, Channels.newChannel(out)) u.arrowWriter.finish() writer.writeBatch() writer.close() u.root.close() u.allocator.close() val rows = u.baseRows.toArray.asInstanceOf[Array[Any]] ArrowSummarizerResult(rows, out.toByteArray) } else { ArrowSummarizerResult(Array.empty, Array.empty) } } override def close(u: ArrowSummarizerState): Unit = { if (u.initialized) { u.arrowWriter.reset() u.root.close() u.allocator.close() } } }
Example 39
Source File: ArrowSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.rdd.function.summarize.summarizer.{ ArrowSummarizerResult, ArrowSummarizerState, ArrowSummarizer => ArrowSum } import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.ColumnList import com.twosigma.flint.timeseries.summarize.{ ColumnList, InputAlwaysValid, Summarizer, SummarizerFactory } import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.types.{ ArrayType, BinaryType, StructType } object ArrowSummarizer { val baseRowsColumnName = "__baseRows" val arrowBatchColumnName = "arrow_bytes" } case class ArrowSummarizerFactory(columns: Seq[String], includeBaseRows: Boolean) extends SummarizerFactory { override val requiredColumns: ColumnList = if (includeBaseRows) { ColumnList.All } else { ColumnList.Sequence(columns) } override def apply(inputSchema: StructType): ArrowSummarizer = { val outputBatchSchema = StructType(columns.map(col => inputSchema(inputSchema.fieldIndex(col)))) ArrowSummarizer(inputSchema, outputBatchSchema, includeBaseRows, prefixOpt, requiredColumns) } } case class ArrowSummarizer( override val inputSchema: StructType, outputBatchSchema: StructType, includeBaseRows: Boolean, override val prefixOpt: Option[String], requiredColumns: ColumnList ) extends Summarizer with InputAlwaysValid { override type T = InternalRow override type U = ArrowSummarizerState override type V = ArrowSummarizerResult override val summarizer = ArrowSum(inputSchema, outputBatchSchema, includeBaseRows) override val schema: StructType = if (includeBaseRows) { Schema.of( ArrowSummarizer.baseRowsColumnName -> ArrayType(inputSchema), ArrowSummarizer.arrowBatchColumnName -> BinaryType ) } else { Schema.of( ArrowSummarizer.arrowBatchColumnName -> BinaryType ) } override def toT(r: InternalRow): T = r override def fromV(v: V): InternalRow = if (includeBaseRows) { InternalRow(new GenericArrayData(v.baseRows), v.arrowBatch) } else { InternalRow(v.arrowBatch) } }
Example 40
Source File: ExponentialWeightedMovingAverageSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.timeseries.summarize._ import org.apache.spark.sql.types._ import com.twosigma.flint.rdd.function.summarize.summarizer.subtractable.{ EWMARow, ExponentialWeightedMovingAverageOutput, ExponentialWeightedMovingAverageState, ExponentialWeightedMovingAverageSummarizer => EWMASummarizer } import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.ColumnList.Sequence import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow object ExponentialWeightedMovingAverageConvention extends Enumeration { type ExponentialWeightedMovingAverageConvention = Value val Core = Value("core") val Legacy = Value("legacy") } case class ExponentialWeightedMovingAverageSummarizerFactory( xColumn: String, timeColumn: String, alpha: Double, timestampsToPeriods: (Long, Long) => Double, constantPeriods: Boolean, exponentialWeightedMovingAverageConvention: ExponentialWeightedMovingAverageConvention.Value ) extends BaseSummarizerFactory(xColumn, timeColumn) { override def apply( inputSchema: StructType ): ExponentialWeightedMovingAverageSummarizer = ExponentialWeightedMovingAverageSummarizer( inputSchema, prefixOpt, requiredColumns, alpha, timestampsToPeriods, constantPeriods, exponentialWeightedMovingAverageConvention ) } case class ExponentialWeightedMovingAverageSummarizer( override val inputSchema: StructType, override val prefixOpt: Option[String], override val requiredColumns: ColumnList, alpha: Double, timestampsToPeriods: (Long, Long) => Double, constantPeriods: Boolean, exponentialWeightedMovingAverageConvention: ExponentialWeightedMovingAverageConvention.Value ) extends LeftSubtractableSummarizer with FilterNullInput with TimeAwareSummarizer { private val Sequence(Seq(xColumn, timeColumn)) = requiredColumns private val xColumnId = inputSchema.fieldIndex(xColumn) private val timeColumnId = inputSchema.fieldIndex(timeColumn) private final val xExtractor = asDoubleExtractor(inputSchema(xColumnId).dataType, xColumnId) override type T = EWMARow override type U = ExponentialWeightedMovingAverageState override type V = ExponentialWeightedMovingAverageOutput override val summarizer = new EWMASummarizer( alpha, timestampsToPeriods, constantPeriods, exponentialWeightedMovingAverageConvention ) override val schema: StructType = Schema.of(s"${xColumn}_ewma" -> DoubleType) override def toT(r: InternalRow): EWMARow = EWMARow( time = getTimeNanos(r, timeColumnId), x = xExtractor(r) ) override def fromV( o: ExponentialWeightedMovingAverageOutput ): GenericInternalRow = { new GenericInternalRow( Array[Any]( o.ewma ) ) } }
Example 41
Source File: VarianceSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.ColumnList.Sequence import com.twosigma.flint.timeseries.summarize.{ BaseSummarizerFactory, ColumnList } import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types.{ DoubleType, StructType } case class VarianceSummarizerFactory(column: String, applyBesselCorrection: Boolean = true) extends BaseSummarizerFactory(column) { override def apply(inputSchema: StructType): VarianceSummarizer = new VarianceSummarizer(inputSchema, prefixOpt, requiredColumns, applyBesselCorrection) } class VarianceSummarizer( override val inputSchema: StructType, override val prefixOpt: Option[String], override val requiredColumns: ColumnList, val applyBesselCorrection: Boolean ) extends NthCentralMomentSummarizer(inputSchema, prefixOpt, requiredColumns, 2) { private val Sequence(Seq(column)) = requiredColumns override val schema = Schema.of(s"${column}_variance" -> DoubleType) override def fromV(v: V): GenericInternalRow = { var variance = v.nthCentralMoment(2) if (applyBesselCorrection) { variance = variance * (v.count / (v.count - 1d)) } new GenericInternalRow(Array[Any](variance)) } }
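A quick numeric check of the Bessel correction applied in fromV above, with illustrative values: for the sample 1.0, 2.0, 3.0 the population variance is 2/3, and multiplying by count / (count - 1) yields the unbiased sample variance of 1.0.

val xs = Seq(1.0, 2.0, 3.0)
val mean = xs.sum / xs.size
val population = xs.map(x => (x - mean) * (x - mean)).sum / xs.size   // 2/3
val sample = population * (xs.size.toDouble / (xs.size - 1))          // Bessel-corrected: 1.0
assert(math.abs(sample - 1.0) < 1e-12)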
Example 42
Source File: StandardDeviationSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.ColumnList.Sequence import com.twosigma.flint.timeseries.summarize.{ BaseSummarizerFactory, ColumnList } import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types.{ DoubleType, StructType } import scala.math.sqrt case class StandardDeviationSummarizerFactory(column: String, applyBesselCorrection: Boolean = true) extends BaseSummarizerFactory(column) { override def apply(inputSchema: StructType): StandardDeviationSummarizer = new StandardDeviationSummarizer(inputSchema, prefixOpt, requiredColumns, applyBesselCorrection) } class StandardDeviationSummarizer( override val inputSchema: StructType, override val prefixOpt: Option[String], override val requiredColumns: ColumnList, val applyBesselCorrection: Boolean ) extends NthCentralMomentSummarizer(inputSchema, prefixOpt, requiredColumns, 2) { private val Sequence(Seq(column)) = requiredColumns override val schema = Schema.of(s"${column}_stddev" -> DoubleType) override def fromV(v: V): GenericInternalRow = { var variance = v.nthCentralMoment(2) if (applyBesselCorrection) { variance = variance * (v.count / (v.count - 1d)) } new GenericInternalRow(Array[Any](sqrt(variance))) } }
Example 43
Source File: ExponentialSmoothingSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.rdd.function.summarize.summarizer.{ ExponentialSmoothingOutput, ExponentialSmoothingState, SmoothingRow, ExponentialSmoothingSummarizer => ESSummarizer } import com.twosigma.flint.timeseries.summarize._ import org.apache.spark.sql.types._ import com.twosigma.flint.timeseries.row.Schema import com.twosigma.flint.timeseries.summarize.ColumnList.Sequence import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow object ExponentialSmoothingInterpolation extends Enumeration { type ExponentialSmoothingInterpolation = Value val PreviousPoint = Value("previous") val LinearInterpolation = Value("linear") val CurrentPoint = Value("current") } object ExponentialSmoothingConvention extends Enumeration { type ExponentialSmoothingConvention = Value val Core = Value("core") val Convolution = Value("convolution") val Legacy = Value("legacy") } case class ExponentialSmoothingSummarizerFactory( xColumn: String, timeColumn: String, alpha: Double, primingPeriods: Double, timestampsToPeriods: (Long, Long) => Double, exponentialSmoothingInterpolation: ExponentialSmoothingInterpolation.Value, exponentialSmoothingConvention: ExponentialSmoothingConvention.Value ) extends BaseSummarizerFactory(xColumn, timeColumn) { override def apply(inputSchema: StructType): ExponentialSmoothingSummarizer = ExponentialSmoothingSummarizer( inputSchema, prefixOpt, requiredColumns, alpha, primingPeriods, timestampsToPeriods, exponentialSmoothingInterpolation, exponentialSmoothingConvention ) } case class ExponentialSmoothingSummarizer( override val inputSchema: StructType, override val prefixOpt: Option[String], override val requiredColumns: ColumnList, alpha: Double, primingPeriods: Double, timestampsToPeriods: (Long, Long) => Double, exponentialSmoothingType: ExponentialSmoothingInterpolation.Value, exponentialSmoothingConvention: ExponentialSmoothingConvention.Value ) extends FlippableSummarizer with FilterNullInput with TimeAwareSummarizer { private val Sequence(Seq(xColumn, timeColumn)) = requiredColumns private val xColumnId = inputSchema.fieldIndex(xColumn) private val timeColumnId = inputSchema.fieldIndex(timeColumn) private final val xExtractor = asDoubleExtractor(inputSchema(xColumnId).dataType, xColumnId) override type T = SmoothingRow override type U = ExponentialSmoothingState override type V = ExponentialSmoothingOutput override val summarizer = ESSummarizer( alpha, primingPeriods, timestampsToPeriods, exponentialSmoothingType, exponentialSmoothingConvention ) override val schema: StructType = Schema.of(s"${xColumn}_ema" -> DoubleType) override def toT(r: InternalRow): SmoothingRow = { SmoothingRow( time = getTimeNanos(r, timeColumnId), x = xExtractor(r) ) } override def fromV(o: ExponentialSmoothingOutput): GenericInternalRow = { new GenericInternalRow( Array[Any]( o.es ) ) } }
Example 44
Source File: StreamingGlobalLimitExec.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming import java.util.concurrent.TimeUnit.NANOSECONDS import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.catalyst.expressions.UnsafeRow import org.apache.spark.sql.catalyst.plans.physical.{AllTuples, Distribution, Partitioning} import org.apache.spark.sql.catalyst.streaming.InternalOutputModes import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode} import org.apache.spark.sql.execution.streaming.state.StateStoreOps import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types.{LongType, NullType, StructField, StructType} import org.apache.spark.util.CompletionIterator case class StreamingGlobalLimitExec( streamLimit: Long, child: SparkPlan, stateInfo: Option[StatefulOperatorStateInfo] = None, outputMode: Option[OutputMode] = None) extends UnaryExecNode with StateStoreWriter { private val keySchema = StructType(Array(StructField("key", NullType))) private val valueSchema = StructType(Array(StructField("value", LongType))) override protected def doExecute(): RDD[InternalRow] = { metrics // force lazy init at driver assert(outputMode.isDefined && outputMode.get == InternalOutputModes.Append, "StreamingGlobalLimitExec is only valid for streams in Append output mode") child.execute().mapPartitionsWithStateStore( getStateInfo, keySchema, valueSchema, indexOrdinal = None, sqlContext.sessionState, Some(sqlContext.streams.stateStoreCoordinator)) { (store, iter) => val key = UnsafeProjection.create(keySchema)(new GenericInternalRow(Array[Any](null))) val numOutputRows = longMetric("numOutputRows") val numUpdatedStateRows = longMetric("numUpdatedStateRows") val allUpdatesTimeMs = longMetric("allUpdatesTimeMs") val commitTimeMs = longMetric("commitTimeMs") val updatesStartTimeNs = System.nanoTime val preBatchRowCount: Long = Option(store.get(key)).map(_.getLong(0)).getOrElse(0L) var cumulativeRowCount = preBatchRowCount val result = iter.filter { r => val x = cumulativeRowCount < streamLimit if (x) { cumulativeRowCount += 1 } x } CompletionIterator[InternalRow, Iterator[InternalRow]](result, { if (cumulativeRowCount > preBatchRowCount) { numUpdatedStateRows += 1 numOutputRows += cumulativeRowCount - preBatchRowCount store.put(key, getValueRow(cumulativeRowCount)) } allUpdatesTimeMs += NANOSECONDS.toMillis(System.nanoTime - updatesStartTimeNs) commitTimeMs += timeTakenMs { store.commit() } setStoreMetrics(store) }) } } override def output: Seq[Attribute] = child.output override def outputPartitioning: Partitioning = child.outputPartitioning override def requiredChildDistribution: Seq[Distribution] = AllTuples :: Nil private def getValueRow(value: Long): UnsafeRow = { UnsafeProjection.create(valueSchema)(new GenericInternalRow(Array[Any](value))) } }
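The state-store key and value rows above are built with the same idiom: wrap the raw values in a GenericInternalRow and run it through an UnsafeProjection for the matching schema. A minimal standalone sketch of that step, suitable for a Spark shell:

import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types.{LongType, StructField, StructType}

val valueSchema = StructType(Array(StructField("value", LongType)))
val toUnsafe = UnsafeProjection.create(valueSchema)
// Project the generic row into the compact UnsafeRow format used by the state store.
val unsafeRow = toUnsafe(new GenericInternalRow(Array[Any](42L)))
assert(unsafeRow.getLong(0) == 42L)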
Example 45
Source File: CompositeSummarizerFactory.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries.summarize.summarizer import com.twosigma.flint.timeseries.summarize._ import com.twosigma.flint.timeseries.row.{ DuplicateColumnsException, Schema } import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types.StructType case class CompositeSummarizerFactory(factory1: SummarizerFactory, factory2: SummarizerFactory) extends SummarizerFactory { Seq(factory1, factory2).foreach { case factory => require( !factory.isInstanceOf[OverlappableSummarizerFactory], "Composition of overlappable summarizers are not supported" ) } override val requiredColumns = factory1.requiredColumns ++ factory2.requiredColumns def apply(inputSchema: StructType): Summarizer = { val summarizer1 = factory1.apply(inputSchema) val summarizer2 = factory2.apply(inputSchema) new CompositeSummarizer(inputSchema, prefixOpt, requiredColumns, summarizer1, summarizer2) } } class CompositeSummarizer( override val inputSchema: StructType, override val prefixOpt: Option[String], override val requiredColumns: ColumnList, val summarizer1: Summarizer, val summarizer2: Summarizer ) extends Summarizer with InputAlwaysValid { override type T = (InternalRow, InternalRow) override type U = (Any, Any) override type V = (InternalRow, InternalRow) override val schema: StructType = StructType(summarizer1.outputSchema.fields ++ summarizer2.outputSchema.fields) override val summarizer = com.twosigma.flint.rdd.function.summarize.summarizer.CompositeSummarizer(summarizer1, summarizer2) requireNoDuplicateColumns(outputSchema) // Convert the output of `summarizer` to the InternalRow. override def fromV(v: V): InternalRow = { val (r1, r2) = v new GenericInternalRow((r1.toSeq(summarizer1.outputSchema) ++ r2.toSeq(summarizer2.outputSchema)).toArray) } // Convert the InternalRow to the type of row expected by the `summarizer`. override def toT(r: InternalRow): T = (r, r) private def requireNoDuplicateColumns(schema: StructType): Unit = { try { Schema.requireUniqueColumnNames(schema) } catch { case e: DuplicateColumnsException => throw new DuplicateColumnsException( s"Found conflict output columns: ${e.duplicates}. Use prefix() to rename conflict summarizers to be composed", e.duplicates ) } } }
Example 46
Source File: MLMatrixSerializer.scala From MatRel with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.matfast.util import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.matfast.matrix._ object MLMatrixSerializer { def serialize(obj: MLMatrix): InternalRow = { val row = new GenericInternalRow(7) obj match { case sm: SparseMatrix => row.setByte(0, 0) row.setInt(1, sm.numRows) row.setInt(2, sm.numCols) row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs)) row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices)) row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values)) row.setBoolean(6, sm.isTransposed) case dm: DenseMatrix => row.setByte(0, 1) row.setInt(1, dm.numRows) row.setInt(2, dm.numCols) row.setNullAt(3) row.setNullAt(4) row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values)) row.setBoolean(6, dm.isTransposed) } row } def deserialize(datum: Any): MLMatrix = { datum match { case row: InternalRow => require(row.numFields == 7, s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7") val tpe = row.getByte(0) val numRows = row.getInt(1) val numCols = row.getInt(2) val values = row.getArray(5).toDoubleArray() val isTransposed = row.getBoolean(6) tpe match { case 0 => val colPtrs = row.getArray(3).toIntArray() val rowIndices = row.getArray(4).toIntArray() new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) case 1 => new DenseMatrix(numRows, numCols, values, isTransposed) } } } } class MLMatrixSerializer { }
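A hedged round-trip sketch for the serializer above, assuming the MatRel DenseMatrix constructor that deserialize itself calls is accessible; the values are illustrative.

import org.apache.spark.sql.matfast.matrix.DenseMatrix
import org.apache.spark.sql.matfast.util.MLMatrixSerializer

val dm = new DenseMatrix(2, 2, Array(1.0, 2.0, 3.0, 4.0), false)
val row = MLMatrixSerializer.serialize(dm)
// Dense matrices get type tag 1; the sparse-only colPtrs/rowIndices fields stay null.
assert(row.getByte(0) == 1 && row.isNullAt(3) && row.isNullAt(4))
assert(MLMatrixSerializer.deserialize(row).isInstanceOf[DenseMatrix])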
Example 47
Source File: ColumnarTestUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 48
Source File: NullableColumnAccessorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 49
Source File: ColumnStatsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types._ class ColumnStatsSuite extends SparkFunSuite { testColumnStats(classOf[BooleanColumnStats], BOOLEAN, createRow(true, false, 0)) testColumnStats(classOf[ByteColumnStats], BYTE, createRow(Byte.MaxValue, Byte.MinValue, 0)) testColumnStats(classOf[ShortColumnStats], SHORT, createRow(Short.MaxValue, Short.MinValue, 0)) testColumnStats(classOf[IntColumnStats], INT, createRow(Int.MaxValue, Int.MinValue, 0)) testColumnStats(classOf[LongColumnStats], LONG, createRow(Long.MaxValue, Long.MinValue, 0)) testColumnStats(classOf[FloatColumnStats], FLOAT, createRow(Float.MaxValue, Float.MinValue, 0)) testColumnStats(classOf[DoubleColumnStats], DOUBLE, createRow(Double.MaxValue, Double.MinValue, 0)) testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0)) testDecimalColumnStats(createRow(null, null, 0)) def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray) def testColumnStats[T <: AtomicType, U <: ColumnStats]( columnStatsClass: Class[U], columnType: NativeColumnType[T], initialStatistics: GenericInternalRow): Unit = { val columnStatsName = columnStatsClass.getSimpleName test(s"$columnStatsName: empty") { val columnStats = columnStatsClass.newInstance() columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach { case (actual, expected) => assert(actual === expected) } } test(s"$columnStatsName: non-empty") { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ val columnStats = columnStatsClass.newInstance() val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1)) rows.foreach(columnStats.gatherStats(_, 0)) val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType]) val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]] val stats = columnStats.collectedStatistics assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0)) assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1)) assertResult(10, "Wrong null count")(stats.values(2)) assertResult(20, "Wrong row count")(stats.values(3)) assertResult(stats.values(4), "Wrong size in bytes") { rows.map { row => if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) }.sum } } } def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats]( initialStatistics: GenericInternalRow): Unit = { val columnStatsName = classOf[DecimalColumnStats].getSimpleName val columnType = COMPACT_DECIMAL(15, 10) test(s"$columnStatsName: empty") { val columnStats = new DecimalColumnStats(15, 10) columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach { case (actual, expected) => assert(actual === expected) } } test(s"$columnStatsName: non-empty") { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ val columnStats = new DecimalColumnStats(15, 10) val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1)) rows.foreach(columnStats.gatherStats(_, 0)) val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType]) val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]] val stats = columnStats.collectedStatistics assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0)) assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1)) assertResult(10, "Wrong null count")(stats.values(2)) assertResult(20, "Wrong row count")(stats.values(3)) assertResult(stats.values(4), "Wrong size in bytes") { rows.map { row => if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) }.sum } } } }
Example 50
Source File: NullableColumnBuilderSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 51
Source File: VectorUDT.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class VectorUDT extends UserDefinedType[Vector] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse // vectors. The "values" field is nullable because we might want to add binary vectors later, // which uses "size" and "indices", but not "values". StructType(Seq( StructField("type", ByteType, nullable = false), StructField("size", IntegerType, nullable = true), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) } override def serialize(obj: Vector): InternalRow = { obj match { case SparseVector(size, indices, values) => val row = new GenericInternalRow(4) row.setByte(0, 0) row.setInt(1, size) row.update(2, UnsafeArrayData.fromPrimitiveArray(indices)) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row case DenseVector(values) => val row = new GenericInternalRow(4) row.setByte(0, 1) row.setNullAt(1) row.setNullAt(2) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row } } override def deserialize(datum: Any): Vector = { datum match { case row: InternalRow => require(row.numFields == 4, s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4") val tpe = row.getByte(0) tpe match { case 0 => val size = row.getInt(1) val indices = row.getArray(2).toIntArray() val values = row.getArray(3).toDoubleArray() new SparseVector(size, indices, values) case 1 => val values = row.getArray(3).toDoubleArray() new DenseVector(values) } } } override def pyUDT: String = "pyspark.ml.linalg.VectorUDT" override def userClass: Class[Vector] = classOf[Vector] override def equals(o: Any): Boolean = { o match { case v: VectorUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[VectorUDT].getName.hashCode() override def typeName: String = "vector" private[spark] override def asNullable: VectorUDT = this }
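VectorUDT is private[spark], so the round trip below is only a sketch and assumes it is exercised from code compiled inside an org.apache.spark package, the way Spark's own tests are:

import org.apache.spark.ml.linalg.{DenseVector, VectorUDT}

val udt = new VectorUDT()
val row = udt.serialize(new DenseVector(Array(1.0, 0.0, 3.0)))
assert(row.getByte(0) == 1)                 // dense marker written by serialize
assert(row.isNullAt(1) && row.isNullAt(2))  // size and indices are unused for dense vectors
val back = udt.deserialize(row)             // returns a DenseVector with the same values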
Example 52
Source File: PolyLineUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PolyLineUDT extends UserDefinedType[PolyLine] with GeometricUDT { override val sqlType: StructType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("xcoordinates", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("ycoordinates", ArrayType(DoubleType, containsNull = false), nullable = true) )) override def serialize(polyLine: PolyLine): InternalRow = { polyLine.serialize() } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[PolyLine]) override def userClass: Class[PolyLine] = classOf[PolyLine] override def deserialize(datum: Any): PolyLine = { val row = datum.asInstanceOf[InternalRow] val polyline = new PolyLine() polyline.init(row) polyline } override def pyUDT: String = "magellan.types.PolyLineUDT" override val geometryType = new PolyLine().getType() }
Example 53
Source File: NullableColumnAccessorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 54
Source File: ColumnStatsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.types._ class ColumnStatsSuite extends SparkFunSuite { testColumnStats(classOf[BooleanColumnStats], BOOLEAN, createRow(true, false, 0)) testColumnStats(classOf[ByteColumnStats], BYTE, createRow(Byte.MaxValue, Byte.MinValue, 0)) testColumnStats(classOf[ShortColumnStats], SHORT, createRow(Short.MaxValue, Short.MinValue, 0)) testColumnStats(classOf[IntColumnStats], INT, createRow(Int.MaxValue, Int.MinValue, 0)) testColumnStats(classOf[LongColumnStats], LONG, createRow(Long.MaxValue, Long.MinValue, 0)) testColumnStats(classOf[FloatColumnStats], FLOAT, createRow(Float.MaxValue, Float.MinValue, 0)) testColumnStats(classOf[DoubleColumnStats], DOUBLE, createRow(Double.MaxValue, Double.MinValue, 0)) testColumnStats(classOf[StringColumnStats], STRING, createRow(null, null, 0)) testDecimalColumnStats(createRow(null, null, 0)) def createRow(values: Any*): GenericInternalRow = new GenericInternalRow(values.toArray) def testColumnStats[T <: AtomicType, U <: ColumnStats]( columnStatsClass: Class[U], columnType: NativeColumnType[T], initialStatistics: GenericInternalRow): Unit = { val columnStatsName = columnStatsClass.getSimpleName test(s"$columnStatsName: empty") { val columnStats = columnStatsClass.newInstance() columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach { case (actual, expected) => assert(actual === expected) } } test(s"$columnStatsName: non-empty") { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ val columnStats = columnStatsClass.newInstance() val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1)) rows.foreach(columnStats.gatherStats(_, 0)) val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType]) val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]] val stats = columnStats.collectedStatistics assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0)) assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1)) assertResult(10, "Wrong null count")(stats.values(2)) assertResult(20, "Wrong row count")(stats.values(3)) assertResult(stats.values(4), "Wrong size in bytes") { rows.map { row => if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) }.sum } } } def testDecimalColumnStats[T <: AtomicType, U <: ColumnStats]( initialStatistics: GenericInternalRow): Unit = { val columnStatsName = classOf[DecimalColumnStats].getSimpleName val columnType = COMPACT_DECIMAL(15, 10) test(s"$columnStatsName: empty") { val columnStats = new DecimalColumnStats(15, 10) columnStats.collectedStatistics.values.zip(initialStatistics.values).foreach { case (actual, expected) => assert(actual === expected) } } test(s"$columnStatsName: non-empty") { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ val columnStats = new DecimalColumnStats(15, 10) val rows = Seq.fill(10)(makeRandomRow(columnType)) ++ Seq.fill(10)(makeNullRow(1)) rows.foreach(columnStats.gatherStats(_, 0)) val values = rows.take(10).map(_.get(0, columnType.dataType).asInstanceOf[T#InternalType]) val ordering = columnType.dataType.ordering.asInstanceOf[Ordering[T#InternalType]] val stats = columnStats.collectedStatistics assertResult(values.min(ordering), "Wrong lower bound")(stats.values(0)) assertResult(values.max(ordering), "Wrong upper bound")(stats.values(1)) assertResult(10, "Wrong null count")(stats.values(2)) assertResult(20, "Wrong row count")(stats.values(3)) assertResult(stats.values(4), "Wrong size in bytes") { rows.map { row => if (row.isNullAt(0)) 4 else columnType.actualSize(row, 0) }.sum } } } }
Example 55
Source File: NullableColumnBuilderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 56
Source File: VectorUDT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class VectorUDT extends UserDefinedType[Vector] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // We only use "values" for dense vectors, and "size", "indices", and "values" for sparse // vectors. The "values" field is nullable because we might want to add binary vectors later, // which uses "size" and "indices", but not "values". StructType(Seq( StructField("type", ByteType, nullable = false), StructField("size", IntegerType, nullable = true), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true))) } override def serialize(obj: Vector): InternalRow = { obj match { case SparseVector(size, indices, values) => val row = new GenericInternalRow(4) row.setByte(0, 0) row.setInt(1, size) row.update(2, UnsafeArrayData.fromPrimitiveArray(indices)) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row case DenseVector(values) => val row = new GenericInternalRow(4) row.setByte(0, 1) row.setNullAt(1) row.setNullAt(2) row.update(3, UnsafeArrayData.fromPrimitiveArray(values)) row } } override def deserialize(datum: Any): Vector = { datum match { case row: InternalRow => require(row.numFields == 4, s"VectorUDT.deserialize given row with length ${row.numFields} but requires length == 4") val tpe = row.getByte(0) tpe match { case 0 => val size = row.getInt(1) val indices = row.getArray(2).toIntArray() val values = row.getArray(3).toDoubleArray() new SparseVector(size, indices, values) case 1 => val values = row.getArray(3).toDoubleArray() new DenseVector(values) } } } override def pyUDT: String = "pyspark.ml.linalg.VectorUDT" override def userClass: Class[Vector] = classOf[Vector] override def equals(o: Any): Boolean = { o match { case v: VectorUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[VectorUDT].getName.hashCode() override def typeName: String = "vector" private[spark] override def asNullable: VectorUDT = this }
Example 57
Source File: MatrixUDT.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.linalg import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeArrayData} import org.apache.spark.sql.types._ private[spark] class MatrixUDT extends UserDefinedType[Matrix] { override def sqlType: StructType = { // type: 0 = sparse, 1 = dense // the dense matrix is built by numRows, numCols, values and isTransposed, all of which are // set as not nullable, except values since in the future, support for binary matrices might // be added for which values are not needed. // the sparse matrix needs colPtrs and rowIndices, which are set as // null, while building the dense matrix. StructType(Seq( StructField("type", ByteType, nullable = false), StructField("numRows", IntegerType, nullable = false), StructField("numCols", IntegerType, nullable = false), StructField("colPtrs", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("rowIndices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("values", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("isTransposed", BooleanType, nullable = false) )) } override def serialize(obj: Matrix): InternalRow = { val row = new GenericInternalRow(7) obj match { case sm: SparseMatrix => row.setByte(0, 0) row.setInt(1, sm.numRows) row.setInt(2, sm.numCols) row.update(3, UnsafeArrayData.fromPrimitiveArray(sm.colPtrs)) row.update(4, UnsafeArrayData.fromPrimitiveArray(sm.rowIndices)) row.update(5, UnsafeArrayData.fromPrimitiveArray(sm.values)) row.setBoolean(6, sm.isTransposed) case dm: DenseMatrix => row.setByte(0, 1) row.setInt(1, dm.numRows) row.setInt(2, dm.numCols) row.setNullAt(3) row.setNullAt(4) row.update(5, UnsafeArrayData.fromPrimitiveArray(dm.values)) row.setBoolean(6, dm.isTransposed) } row } override def deserialize(datum: Any): Matrix = { datum match { case row: InternalRow => require(row.numFields == 7, s"MatrixUDT.deserialize given row with length ${row.numFields} but requires length == 7") val tpe = row.getByte(0) val numRows = row.getInt(1) val numCols = row.getInt(2) val values = row.getArray(5).toDoubleArray() val isTransposed = row.getBoolean(6) tpe match { case 0 => val colPtrs = row.getArray(3).toIntArray() val rowIndices = row.getArray(4).toIntArray() new SparseMatrix(numRows, numCols, colPtrs, rowIndices, values, isTransposed) case 1 => new DenseMatrix(numRows, numCols, values, isTransposed) } } } override def userClass: Class[Matrix] = classOf[Matrix] override def equals(o: Any): Boolean = { o match { case v: MatrixUDT => true case _ => false } } // see [SPARK-8647], this achieves the needed constant hash code without constant no. override def hashCode(): Int = classOf[MatrixUDT].getName.hashCode() override def typeName: String = "matrix" override def pyUDT: String = "pyspark.ml.linalg.MatrixUDT" private[spark] override def asNullable: MatrixUDT = this }
Example 58
Source File: TransformerSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.catalyst import magellan.TestingUtils._ import magellan.{MockPointExpr, Point, TestSparkContext} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Transformer} import org.apache.spark.sql.magellan.dsl.expressions._ import org.scalatest.FunSuite class TransformerSuite extends FunSuite with TestSparkContext { test("transform") { val sqlCtx = this.sqlContext val path = this.getClass.getClassLoader.getResource("testpoint/").getPath val df = sqlCtx.read.format("magellan").load(path) import sqlCtx.implicits._ val dbl = (x: Point) => Point(2 * x.getX(), 2 * x.getY()) val point = df.withColumn("transformed", $"point".transform(dbl)) .select($"transformed") .first()(0).asInstanceOf[Point] assert(point.getX() ~== -199.0 absTol 1.0) } test("eval: transform") { val fn = (p: Point) => Point(2 * p.getX(), 2 * p.getY()) val expr = Transformer(MockPointExpr(Point(1.0, 2.0)), fn) val result = expr.eval(null).asInstanceOf[InternalRow] // skip the type assert(result.getDouble(1) === 2.0) assert(result.getDouble(2) === 4.0) } }
Example 59
Source File: IndexerSuite.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.catalyst import magellan.{MockPointExpr, Point, TestSparkContext} import magellan.index.ZOrderCurve import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, Indexer} import org.apache.spark.sql.catalyst.util.GenericArrayData import org.apache.spark.sql.magellan.dsl.expressions._ import org.scalatest.FunSuite class IndexerSuite extends FunSuite with TestSparkContext { test("index points") { val sqlCtx = this.sqlContext val path = this.getClass.getClassLoader.getResource("testpoint/").getPath val df = sqlCtx.read.format("magellan").load(path) import sqlCtx.implicits._ val index = df.withColumn("index", $"point" index 25) .select($"index.curve") .take(1)(0)(0) .asInstanceOf[Seq[ZOrderCurve]] assert(index.map(_.toBase32()) === Seq("9z109")) try { df.withColumn("index", $"point" index 23) assert(false) } catch { case e: Error => assert(true) } } test("eval: Index") { val indexer = Indexer(MockPointExpr(Point(-122.3959313, 37.7912976)), 25) val result = indexer.eval(null).asInstanceOf[GenericArrayData] assert(result.numElements() === 1) val resultRow = result.get(0, Indexer.dataType).asInstanceOf[GenericInternalRow] val indexUDT = Indexer.indexUDT val curve = indexUDT.deserialize(resultRow.get(0, indexUDT)) assert(curve.toBase32() === "9q8yy") val relation = resultRow.getString(1) assert(relation === "Contains") } }
Example 60
Source File: NullableColumnBuilderSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 61
Source File: PolygonUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PolygonUDT extends UserDefinedType[Polygon] with GeometricUDT { override val sqlType: StructType = StructType(Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("indices", ArrayType(IntegerType, containsNull = false), nullable = true), StructField("xcoordinates", ArrayType(DoubleType, containsNull = false), nullable = true), StructField("ycoordinates", ArrayType(DoubleType, containsNull = false), nullable = true) )) override def serialize(polygon: Polygon): InternalRow = { polygon.serialize() } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Polygon]) override def userClass: Class[Polygon] = classOf[Polygon] override def deserialize(datum: Any): Polygon = { val row = datum.asInstanceOf[InternalRow] val polygon = new Polygon() polygon.init(row) polygon } override val geometryType = new Polygon().getType() }
Example 62
Source File: LineUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class LineUDT extends UserDefinedType[Line] with GeometricUDT { override def sqlType: DataType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("startX", DoubleType, nullable = false), StructField("startY", DoubleType, nullable = false), StructField("endX", DoubleType, nullable = false), StructField("endY", DoubleType, nullable = false) )) override def serialize(line: Line): InternalRow = { val row = new GenericInternalRow(9) row.setInt(0, 2) val BoundingBox(xmin, ymin, xmax, ymax) = line.boundingBox row.setDouble(1, xmin) row.setDouble(2, ymin) row.setDouble(3, xmax) row.setDouble(4, ymax) row.setDouble(5, line.getStart().getX()) row.setDouble(6, line.getStart().getY()) row.setDouble(7, line.getEnd().getX()) row.setDouble(8, line.getEnd().getY()) row } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Line]) override def userClass: Class[Line] = classOf[Line] override def deserialize(datum: Any): Line = { val row = datum.asInstanceOf[InternalRow] val startX = row.getDouble(5) val startY = row.getDouble(6) val endX = row.getDouble(7) val endY = row.getDouble(8) val line = new Line() val start = Point(startX, startY) val end = Point(endX, endY) line.setStart(start) line.setEnd(end) line } override def pyUDT: String = "magellan.types.LineUDT" override val geometryType = new Line().getType() }
Example 63
Source File: PointUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class PointUDT extends UserDefinedType[Point] with GeometricUDT { override val sqlType: StructType = StructType( Seq( StructField("type", IntegerType, nullable = false), StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("x", DoubleType, nullable = false), StructField("y", DoubleType, nullable = false) )) override def serialize(point: Point): InternalRow = { val row = new GenericInternalRow(7) row.setInt(0, point.getType()) row.setDouble(1, point.getX()) row.setDouble(2, point.getY()) row.setDouble(3, point.getX()) row.setDouble(4, point.getY()) row.setDouble(5, point.getX()) row.setDouble(6, point.getY()) row } override def serialize(shape: Shape) = serialize(shape.asInstanceOf[Point]) override def userClass: Class[Point] = classOf[Point] override def deserialize(datum: Any): Point = { val row = datum.asInstanceOf[InternalRow] require(row.numFields == 7) Point(row.getDouble(5), row.getDouble(6)) } override def pyUDT: String = "magellan.types.PointUDT" def serialize(x: Double, y: Double): InternalRow = { val row = new GenericInternalRow(7) row.setInt(0, 1) row.setDouble(1, x) row.setDouble(2, y) row.setDouble(3, x) row.setDouble(4, y) row.setDouble(5, x) row.setDouble(6, y) row } override val geometryType = new Point().getType() }
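The serialize(x, y) overload above builds a point row without allocating a Point first. A minimal sketch, assuming magellan's Point exposes getX()/getY() as in these snippets:

import org.apache.spark.sql.types.PointUDT

val udt = new PointUDT()
val row = udt.serialize(-122.3959313, 37.7912976)   // 7-field row; x and y double as the bounding box
val point = udt.deserialize(row)
assert(point.getX() == -122.3959313 && point.getY() == 37.7912976)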
Example 64
Source File: ZOrderCurveUDT.scala From magellan with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.types import magellan.BoundingBox import magellan.index.ZOrderCurve import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow class ZOrderCurveUDT extends UserDefinedType[ZOrderCurve] { override def sqlType: DataType = StructType( Seq( StructField("xmin", DoubleType, nullable = false), StructField("ymin", DoubleType, nullable = false), StructField("xmax", DoubleType, nullable = false), StructField("ymax", DoubleType, nullable = false), StructField("precision", IntegerType, nullable = false), StructField("bits", LongType, nullable = false) )) override def serialize(obj: ZOrderCurve): Any = { val row = new GenericInternalRow(6) val BoundingBox(xmin, ymin, xmax, ymax) = obj.boundingBox row.setDouble(0, xmin) row.setDouble(1, ymin) row.setDouble(2, xmax) row.setDouble(3, ymax) row.setInt(4, obj.precision) row.setLong(5, obj.bits) row } override def deserialize(datum: Any): ZOrderCurve = { val row = datum.asInstanceOf[InternalRow] val boundingBox = BoundingBox(row.getDouble(0), row.getDouble(1), row.getDouble(2), row.getDouble(3)) new ZOrderCurve(boundingBox, row.getInt(4), row.getLong(5)) } override def userClass: Class[ZOrderCurve] = classOf[ZOrderCurve] }
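A hypothetical round trip for the curve UDT, assuming the ZOrderCurve(boundingBox, precision, bits) constructor used in deserialize above is publicly accessible and that precision and bits are readable, as serialize reads them:

import magellan.BoundingBox
import magellan.index.ZOrderCurve
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.ZOrderCurveUDT

val udt = new ZOrderCurveUDT()
val curve = new ZOrderCurve(BoundingBox(-180.0, -90.0, 180.0, 90.0), 25, 0L)
val row = udt.serialize(curve).asInstanceOf[InternalRow]   // serialize is declared to return Any here
val back = udt.deserialize(row)
assert(back.precision == 25 && back.bits == 0L)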
Example 65
Source File: ColumnarTestUtils.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import scala.collection.immutable.HashSet import scala.util.Random import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.GenericInternalRow import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{AtomicType, Decimal} import org.apache.spark.unsafe.types.UTF8String object ColumnarTestUtils { def makeNullRow(length: Int): GenericInternalRow = { val row = new GenericInternalRow(length) (0 until length).foreach(row.setNullAt) row } def makeRandomValue[JvmType](columnType: ColumnType[JvmType]): JvmType = { def randomBytes(length: Int) = { val bytes = new Array[Byte](length) Random.nextBytes(bytes) bytes } (columnType match { case NULL => null case BOOLEAN => Random.nextBoolean() case BYTE => (Random.nextInt(Byte.MaxValue * 2) - Byte.MaxValue).toByte case SHORT => (Random.nextInt(Short.MaxValue * 2) - Short.MaxValue).toShort case INT => Random.nextInt() case LONG => Random.nextLong() case FLOAT => Random.nextFloat() case DOUBLE => Random.nextDouble() case STRING => UTF8String.fromString(Random.nextString(Random.nextInt(32))) case BINARY => randomBytes(Random.nextInt(32)) case COMPACT_DECIMAL(precision, scale) => Decimal(Random.nextLong() % 100, precision, scale) case LARGE_DECIMAL(precision, scale) => Decimal(Random.nextLong(), precision, scale) case STRUCT(_) => new GenericInternalRow(Array[Any](UTF8String.fromString(Random.nextString(10)))) case ARRAY(_) => new GenericArrayData(Array[Any](Random.nextInt(), Random.nextInt())) case MAP(_) => ArrayBasedMapData( Map(Random.nextInt() -> UTF8String.fromString(Random.nextString(Random.nextInt(32))))) case _ => throw new IllegalArgumentException(s"Unknown column type $columnType") }).asInstanceOf[JvmType] } def makeRandomValues( head: ColumnType[_], tail: ColumnType[_]*): Seq[Any] = makeRandomValues(Seq(head) ++ tail) def makeRandomValues(columnTypes: Seq[ColumnType[_]]): Seq[Any] = { columnTypes.map(makeRandomValue(_)) } def makeUniqueRandomValues[JvmType]( columnType: ColumnType[JvmType], count: Int): Seq[JvmType] = { Iterator.iterate(HashSet.empty[JvmType]) { set => set + Iterator.continually(makeRandomValue(columnType)).filterNot(set.contains).next() }.drop(count).next().toSeq } def makeRandomRow( head: ColumnType[_], tail: ColumnType[_]*): InternalRow = makeRandomRow(Seq(head) ++ tail) def makeRandomRow(columnTypes: Seq[ColumnType[_]]): InternalRow = { val row = new GenericInternalRow(columnTypes.length) makeRandomValues(columnTypes).zipWithIndex.foreach { case (value, index) => row(index) = value } row } def makeUniqueValuesAndSingleValueRows[T <: AtomicType]( columnType: NativeColumnType[T], count: Int): (Seq[T#InternalType], Seq[GenericInternalRow]) = { val values = makeUniqueRandomValues(columnType, count) val rows = values.map { value => val row = new GenericInternalRow(1) row(0) = value row } (values, rows) } }
Example 66
Source File: NullableColumnAccessorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 67
Source File: RowSuite.scala From drizzle-spark with Apache License 2.0 | 4 votes |
package org.apache.spark.sql import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, SpecificInternalRow} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String class RowSuite extends SparkFunSuite with SharedSQLContext { import testImplicits._ test("create row") { val expected = new GenericInternalRow(4) expected.setInt(0, 2147483647) expected.update(1, UTF8String.fromString("this is a string")) expected.setBoolean(2, false) expected.setNullAt(3) val actual1 = Row(2147483647, "this is a string", false, null) assert(expected.numFields === actual1.size) assert(expected.getInt(0) === actual1.getInt(0)) assert(expected.getString(1) === actual1.getString(1)) assert(expected.getBoolean(2) === actual1.getBoolean(2)) assert(expected.isNullAt(3) === actual1.isNullAt(3)) val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null)) assert(expected.numFields === actual2.size) assert(expected.getInt(0) === actual2.getInt(0)) assert(expected.getString(1) === actual2.getString(1)) assert(expected.getBoolean(2) === actual2.getBoolean(2)) assert(expected.isNullAt(3) === actual2.isNullAt(3)) } test("SpecificMutableRow.update with null") { val row = new SpecificInternalRow(Seq(IntegerType)) row(0) = null assert(row.isNullAt(0)) } test("get values by field name on Row created via .toDF") { val row = Seq((1, Seq(1))).toDF("a", "b").first() assert(row.getAs[Int]("a") === 1) assert(row.getAs[Seq[Int]]("b") === Seq(1)) intercept[IllegalArgumentException]{ row.getAs[Int]("c") } } test("float NaN == NaN") { val r1 = Row(Float.NaN) val r2 = Row(Float.NaN) assert(r1 === r2) } test("double NaN == NaN") { val r1 = Row(Double.NaN) val r2 = Row(Double.NaN) assert(r1 === r2) } test("equals and hashCode") { val r1 = Row("Hello") val r2 = Row("Hello") assert(r1 === r2) assert(r1.hashCode() === r2.hashCode()) val r3 = Row("World") assert(r3.hashCode() != r1.hashCode()) } }
Example 68
Source File: RowSuite.scala From XSQL with Apache License 2.0 | 4 votes |
package org.apache.spark.sql import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, SpecificInternalRow} import org.apache.spark.sql.test.SharedSQLContext import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String class RowSuite extends SparkFunSuite with SharedSQLContext { import testImplicits._ test("create row") { val expected = new GenericInternalRow(4) expected.setInt(0, 2147483647) expected.update(1, UTF8String.fromString("this is a string")) expected.setBoolean(2, false) expected.setNullAt(3) val actual1 = Row(2147483647, "this is a string", false, null) assert(expected.numFields === actual1.size) assert(expected.getInt(0) === actual1.getInt(0)) assert(expected.getString(1) === actual1.getString(1)) assert(expected.getBoolean(2) === actual1.getBoolean(2)) assert(expected.isNullAt(3) === actual1.isNullAt(3)) val actual2 = Row.fromSeq(Seq(2147483647, "this is a string", false, null)) assert(expected.numFields === actual2.size) assert(expected.getInt(0) === actual2.getInt(0)) assert(expected.getString(1) === actual2.getString(1)) assert(expected.getBoolean(2) === actual2.getBoolean(2)) assert(expected.isNullAt(3) === actual2.isNullAt(3)) } test("SpecificMutableRow.update with null") { val row = new SpecificInternalRow(Seq(IntegerType)) row(0) = null assert(row.isNullAt(0)) } test("get values by field name on Row created via .toDF") { val row = Seq((1, Seq(1))).toDF("a", "b").first() assert(row.getAs[Int]("a") === 1) assert(row.getAs[Seq[Int]]("b") === Seq(1)) intercept[IllegalArgumentException]{ row.getAs[Int]("c") } } test("float NaN == NaN") { val r1 = Row(Float.NaN) val r2 = Row(Float.NaN) assert(r1 === r2) } test("double NaN == NaN") { val r1 = Row(Double.NaN) val r2 = Row(Double.NaN) assert(r1 === r2) } test("equals and hashCode") { val r1 = Row("Hello") val r2 = Row("Hello") assert(r1 === r2) assert(r1.hashCode() === r2.hashCode()) val r3 = Row("World") assert(r3.hashCode() != r1.hashCode()) } }