org.apache.spark.sql.catalyst.CatalystTypeConverters Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.CatalystTypeConverters.
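All of the examples below use one of two entry points: createToCatalystConverter, which maps external Scala/Java values (Row, String, BigDecimal, ...) to Catalyst's internal representation (InternalRow, UTF8String, Decimal, ...), and createToScalaConverter, which maps back. A minimal stand-alone sketch of the round trip for a simple two-column schema (this snippet is illustrative and not taken from any of the listed projects):

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.types._

val schema = StructType(Seq(
  StructField("name", StringType),
  StructField("age", IntegerType)))

// External Row -> Catalyst InternalRow (the String becomes a UTF8String, etc.)
val toCatalyst = CatalystTypeConverters.createToCatalystConverter(schema)
val internal = toCatalyst(Row("alice", 42)).asInstanceOf[InternalRow]

// Catalyst InternalRow -> external Row
val toScala = CatalystTypeConverters.createToScalaConverter(schema)
val external = toScala(internal)  // an external Row again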
Example 1
Source File: LocalRelation.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(
    output: Seq[Attribute],
    data: Seq[InternalRow] = Nil,
    // Indicates whether this relation has data from a streaming source.
    override val isStreaming: Boolean = false)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def computeStats(): Statistics =
    Statistics(sizeInBytes = output.map(n => BigInt(n.dataType.defaultSize)).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}
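For context, fromExternalRows is the part of LocalRelation that touches CatalystTypeConverters: it builds a converter from the relation's schema and applies it to each external Row. A hypothetical driver-side sketch of calling it (the attribute and rows below are invented for illustration):

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.plans.logical.LocalRelation
import org.apache.spark.sql.types.IntegerType

// External Rows are converted to InternalRows by the converter built from the schema.
val a = AttributeReference("a", IntegerType)()
val relation = LocalRelation.fromExternalRows(Seq(a), Seq(Row(1), Row(2)))
// relation.data now holds InternalRows, not the original external Rows.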
Example 2
Source File: RandomDataGeneratorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.types._

def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = {
  val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType)
  val generator = RandomDataGenerator.forType(dataType, nullable, new Random(33)).getOrElse {
    fail(s"Random data generator was not defined for $dataType")
  }
  if (nullable) {
    assert(Iterator.fill(100)(generator()).contains(null))
  } else {
    assert(!Iterator.fill(100)(generator()).contains(null))
  }
  for (_ <- 1 to 10) {
    val generatedValue = generator()
    toCatalyst(generatedValue)
  }
}

// Basic types:
for (
  dataType <- DataTypeTestUtils.atomicTypes;
  nullable <- Seq(true, false)
  if !dataType.isInstanceOf[DecimalType]) {
  test(s"$dataType (nullable=$nullable)") {
    testRandomDataGeneration(dataType)
  }
}

for (
  arrayType <- DataTypeTestUtils.atomicArrayTypes
  if RandomDataGenerator.forType(arrayType.elementType, arrayType.containsNull).isDefined
) {
  test(s"$arrayType") {
    testRandomDataGeneration(arrayType)
  }
}

val atomicTypesWithDataGenerators =
  DataTypeTestUtils.atomicTypes.filter(RandomDataGenerator.forType(_).isDefined)

// Complex types:
for (
  keyType <- atomicTypesWithDataGenerators;
  valueType <- atomicTypesWithDataGenerators
  // Scala's BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) and
  // Spark can hit NumberFormatException errors when converting certain BigDecimals (SPARK-8802).
  // For these reasons, we don't support generation of maps with decimal keys.
  if !keyType.isInstanceOf[DecimalType]
) {
  val mapType = MapType(keyType, valueType)
  test(s"$mapType") {
    testRandomDataGeneration(mapType)
  }
}

for (
  colOneType <- atomicTypesWithDataGenerators;
  colTwoType <- atomicTypesWithDataGenerators
) {
  val structType = StructType(StructField("a", colOneType) :: StructField("b", colTwoType) :: Nil)
  test(s"$structType") {
    testRandomDataGeneration(structType)
  }
}

test("check size of generated map") {
  val mapType = MapType(IntegerType, IntegerType)
  for (seed <- 1 to 1000) {
    val generator = RandomDataGenerator.forType(
      mapType, nullable = false, rand = new Random(seed)).get
    val maps = Seq.fill(100)(generator().asInstanceOf[Map[Int, Int]])
    val expectedTotalElements = 100 / 2 * RandomDataGenerator.MAX_MAP_SIZE
    val deviation = math.abs(maps.map(_.size).sum - expectedTotalElements)
    assert(deviation.toDouble / expectedTotalElements < 2e-1)
  }
}
Example 3
Source File: GenerateUnsafeRowJoinerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

class GenerateUnsafeRowJoinerSuite extends SparkFunSuite {

  private val fixed = Seq(IntegerType)
  private val variable = Seq(IntegerType, StringType)

  test("simple fixed width types") {
    testConcat(0, 0, fixed)
    testConcat(0, 1, fixed)
    testConcat(1, 0, fixed)
    testConcat(64, 0, fixed)
    testConcat(0, 64, fixed)
    testConcat(64, 64, fixed)
  }

  test("randomized fix width types") {
    for (i <- 0 until 20) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed)
    }
  }

  test("simple variable width types") {
    testConcat(0, 0, variable)
    testConcat(0, 1, variable)
    testConcat(1, 0, variable)
    testConcat(64, 0, variable)
    testConcat(0, 64, variable)
    testConcat(64, 64, variable)
  }

  test("randomized variable width types") {
    for (i <- 0 until 10) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable)
    }
  }

  private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = {
    for (i <- 0 until 10) {
      testConcatOnce(numFields1, numFields2, candidateTypes)
    }
  }

  private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) {
    info(s"schema size $numFields1, $numFields2")
    val random = new Random()
    val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes)
    val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes)

    // Create the converters needed to convert from external row to internal row and to UnsafeRows.
    val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1)
    val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2)
    val converter1 = UnsafeProjection.create(schema1)
    val converter2 = UnsafeProjection.create(schema2)

    // Create the input rows, convert them into UnsafeRows.
    val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply()
    val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply()
    val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow])
    val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow])

    // Run the joiner.
    val mergedSchema = StructType(schema1 ++ schema2)
    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
    val output = concater.join(row1, row2)

    // Test everything equals ...
    for (i <- mergedSchema.indices) {
      if (i < schema1.size) {
        assert(output.isNullAt(i) === row1.isNullAt(i))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType))
        }
      } else {
        assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) ===
            row2.get(i - schema1.size, mergedSchema(i).dataType))
        }
      }
    }
  }
}
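The part of testConcatOnce relevant to this page is the two-step conversion: an external Row is first turned into an InternalRow with createToCatalystConverter, and only then into an UnsafeRow with UnsafeProjection. A condensed sketch of that pipeline for a fixed, hand-written schema (illustrative only, not part of the suite):

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

val schema = StructType(Seq(StructField("i", IntegerType), StructField("s", StringType)))

// Step 1: external Row -> InternalRow
val toInternal = CatalystTypeConverters.createToCatalystConverter(schema)
val internalRow = toInternal(Row(1, "x")).asInstanceOf[InternalRow]

// Step 2: InternalRow -> UnsafeRow (fixed-width region plus variable-length region)
val unsafeProj = UnsafeProjection.create(schema)
val unsafeRow = unsafeProj(internalRow)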
Example 4
Source File: commands.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.errors.TreeNodeException
import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference}
import org.apache.spark.sql.catalyst.plans.QueryPlan
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.execution.debug._
import org.apache.spark.sql.execution.streaming.IncrementalExecution
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types._

case class ExplainCommand(
    logicalPlan: LogicalPlan,
    override val output: Seq[Attribute] =
      Seq(AttributeReference("plan", StringType, nullable = true)()),
    extended: Boolean = false,
    codegen: Boolean = false)
  extends RunnableCommand {

  // Run through the optimizer to generate the physical plan.
  override def run(sparkSession: SparkSession): Seq[Row] = try {
    val queryExecution =
      if (logicalPlan.isStreaming) {
        // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the
        // output mode does not matter since there is no `Sink`.
        new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0)
      } else {
        sparkSession.sessionState.executePlan(logicalPlan)
      }
    val outputString =
      if (codegen) {
        codegenString(queryExecution.executedPlan)
      } else if (extended) {
        queryExecution.toString
      } else {
        queryExecution.simpleString
      }
    Seq(Row(outputString))
  } catch { case cause: TreeNodeException[_] =>
    ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_))
  }
}
Example 5
Source File: NullableColumnBuilderSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType])
  extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType)
  with NullableColumnBuilder

object TestNullableColumnBuilder {
  def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0)
    : TestNullableColumnBuilder[JvmType] = {
    val builder = new TestNullableColumnBuilder(columnType)
    builder.initialize(initialSize)
    builder
  }
}

class NullableColumnBuilderSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnBuilder(_)
    }

  def testNullableColumnBuilder[JvmType](
      columnType: ColumnType[JvmType]): Unit = {

    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val dataType = columnType.dataType
    val proj = UnsafeProjection.create(Array[DataType](dataType))
    val converter = CatalystTypeConverters.createToScalaConverter(dataType)

    test(s"$typeName column builder: empty column") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
      assert(!buffer.hasRemaining)
    }

    test(s"$typeName column builder: buffer size auto growth") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
    }

    test(s"$typeName column builder: null values") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val nullRow = makeNullRow(1)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
        columnBuilder.appendFrom(proj(nullRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(4, "Wrong null count")(buffer.getInt())

      // For null positions
      (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt()))

      // For non-null values
      val actual = new GenericInternalRow(new Array[Any](1))
      (0 until 4).foreach { _ =>
        columnType.extract(buffer, actual, 0)
        assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)),
          "Extracted value didn't equal to the original one")
      }

      assert(!buffer.hasRemaining)
    }
  }
}
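The suite compares extracted values through createToScalaConverter because the column builder stores Catalyst-internal objects, which are not equal to their external counterparts (UTF8String vs. String, Decimal vs. BigDecimal, and so on). A small illustrative sketch of that mismatch, separate from the suite:

import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.types.StringType
import org.apache.spark.unsafe.types.UTF8String

val toScala = CatalystTypeConverters.createToScalaConverter(StringType)
val internalValue = UTF8String.fromString("abc")  // what the column stores internally
assert(toScala(internalValue) == "abc")           // comparison works in the external domain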
Example 6
Source File: NullableColumnAccessorSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import java.nio.ByteBuffer

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection}
import org.apache.spark.sql.types._

class TestNullableColumnAccessor[JvmType](
    buffer: ByteBuffer,
    columnType: ColumnType[JvmType])
  extends BasicColumnAccessor(buffer, columnType)
  with NullableColumnAccessor

object TestNullableColumnAccessor {
  def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType])
    : TestNullableColumnAccessor[JvmType] = {
    new TestNullableColumnAccessor(buffer, columnType)
  }
}

class NullableColumnAccessorSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnAccessor(_)
    }

  def testNullableColumnAccessor[JvmType](
      columnType: ColumnType[JvmType]): Unit = {

    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val nullRow = makeNullRow(1)

    test(s"Nullable $typeName column accessor: empty column") {
      val builder = TestNullableColumnBuilder(columnType)
      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      assert(!accessor.hasNext)
    }

    test(s"Nullable $typeName column accessor: access null values") {
      val builder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val proj = UnsafeProjection.create(Array[DataType](columnType.dataType))

      (0 until 4).foreach { _ =>
        builder.appendFrom(proj(randomRow), 0)
        builder.appendFrom(proj(nullRow), 0)
      }

      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      val row = new GenericInternalRow(1)
      val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType)

      (0 until 4).foreach { _ =>
        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(converter(row.get(0, columnType.dataType))
          === converter(randomRow.get(0, columnType.dataType)))

        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.isNullAt(0))
      }

      assert(!accessor.hasNext)
    }
  }
}
Example 7
Source File: LocalRelation.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, analysis}
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.types.{StructType, StructField}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[Row]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[Row] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs = Iterator(output)

  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LocalRelation(otherOutput, otherData) =>
      otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
    case _ => false
  }

  override lazy val statistics =
    Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length)
}
Example 8
Source File: literals.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import java.sql.{Date, Timestamp}

import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.util.DateUtils
import org.apache.spark.sql.types._

object Literal {
  def apply(v: Any): Literal = v match {
    case i: Int => Literal(i, IntegerType)
    case l: Long => Literal(l, LongType)
    case d: Double => Literal(d, DoubleType)
    case f: Float => Literal(f, FloatType)
    case b: Byte => Literal(b, ByteType)
    case s: Short => Literal(s, ShortType)
    case s: String => Literal(UTF8String(s), StringType)
    case b: Boolean => Literal(b, BooleanType)
    case d: BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
    case d: java.math.BigDecimal => Literal(Decimal(d), DecimalType.Unlimited)
    case d: Decimal => Literal(d, DecimalType.Unlimited)
    case t: Timestamp => Literal(t, TimestampType)
    case d: Date => Literal(DateUtils.fromJavaDate(d), DateType)
    case a: Array[Byte] => Literal(a, BinaryType)
    case null => Literal(null, NullType)
    case _ =>
      throw new RuntimeException("Unsupported literal type " + v.getClass + " " + v)
  }

  def create(v: Any, dataType: DataType): Literal = {
    Literal(CatalystTypeConverters.convertToCatalyst(v), dataType)
  }
}

case class Literal protected (value: Any, dataType: DataType) extends LeafExpression {

  override def foldable: Boolean = true
  override def nullable: Boolean = value == null

  override def toString: String = if (value != null) value.toString else "null"

  type EvaluatedType = Any
  override def eval(input: Row): Any = value
}

// TODO: Specialize
case class MutableLiteral(var value: Any, dataType: DataType, nullable: Boolean = true)
  extends LeafExpression {

  type EvaluatedType = Any

  def update(expression: Expression, input: Row): Unit = {
    value = expression.eval(input)
  }

  override def eval(input: Row): Any = value
}
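The difference between the two constructors above: Literal(v) dispatches on the runtime class of the value, while Literal.create(v, dataType) first runs the value through CatalystTypeConverters.convertToCatalyst so the stored value is already in internal form. A short sketch with hypothetical values:

import org.apache.spark.sql.catalyst.expressions.Literal
import org.apache.spark.sql.types.StringType

// apply() pattern-matches on the runtime class; create() lets convertToCatalyst
// build the internal value (a UTF8String here) for the declared StringType.
val byApply = Literal("abc")
val byCreate = Literal.create("abc", StringType)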
Example 9
Source File: GeneratedMutableEvaluationSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.codegen._

class GeneratedMutableEvaluationSuite extends ExpressionEvaluationSuite {
  override def checkEvaluation(
      expression: Expression,
      expected: Any,
      inputRow: Row = EmptyRow): Unit = {
    lazy val evaluated = GenerateProjection.expressionEvaluator(expression)

    val plan = try {
      GenerateProjection.generate(Alias(expression, s"Optimized($expression)")() :: Nil)
    } catch {
      case e: Throwable =>
        fail(
          s"""
            |Code generation of $expression failed:
            |${evaluated.code.mkString("\n")}
            |$e
          """.stripMargin)
    }

    val actual = plan(inputRow)
    val expectedRow =
      new GenericRow(Array[Any](CatalystTypeConverters.convertToCatalyst(expected)))
    if (actual.hashCode() != expectedRow.hashCode()) {
      fail(
        s"""
          |Mismatched hashCodes for values: $actual, $expectedRow
          |Hash Codes: ${actual.hashCode()} != ${expectedRow.hashCode()}
          |${evaluated.code.mkString("\n")}
        """.stripMargin)
    }
    if (actual != expectedRow) {
      val input = if (inputRow == EmptyRow) "" else s", input: $inputRow"
      fail(s"Incorrect Evaluation: $expression, actual: $actual, expected: $expected$input")
    }
  }
}
Example 10
Source File: ExistingRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Row, SQLContext}

private[sql] case class LogicalLocalTable(output: Seq[Attribute], rows: Seq[Row])(sqlContext: SQLContext)
  extends LogicalPlan with MultiInstanceRelation {

  override def children: Seq[LogicalPlan] = Nil

  override def newInstance(): this.type =
    LogicalLocalTable(output.map(_.newInstance()), rows)(sqlContext).asInstanceOf[this.type]

  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LogicalRDD(_, otherRDD) => rows == rows
    case _ => false
  }

  @transient override lazy val statistics: Statistics = Statistics(
    // TODO: Improve the statistics estimation.
    // This is made small enough so it can be broadcasted.
    sizeInBytes = sqlContext.conf.autoBroadcastJoinThreshold - 1
  )
}
Example 11
Source File: LocalRelation.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow, analysis}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs = Iterator(output)

  override def sameResult(plan: LogicalPlan): Boolean = plan match {
    case LocalRelation(otherOutput, otherData) =>
      otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
    case _ => false
  }

  override lazy val statistics =
    Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length)
}
Example 12
Source File: RandomDataGeneratorSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.types._

def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = {
  val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType)
  val generator = RandomDataGenerator.forType(dataType, nullable, Some(33)).getOrElse {
    fail(s"Random data generator was not defined for $dataType")
  }
  if (nullable) {
    assert(Iterator.fill(100)(generator()).contains(null))
  } else {
    assert(Iterator.fill(100)(generator()).forall(_ != null))
  }
  for (_ <- 1 to 10) {
    val generatedValue = generator()
    toCatalyst(generatedValue)
  }
}

// Basic types:
for (
  dataType <- DataTypeTestUtils.atomicTypes;
  nullable <- Seq(true, false)
  if !dataType.isInstanceOf[DecimalType]) {
  test(s"$dataType (nullable=$nullable)") {
    testRandomDataGeneration(dataType)
  }
}

for (
  arrayType <- DataTypeTestUtils.atomicArrayTypes
  if RandomDataGenerator.forType(arrayType.elementType, arrayType.containsNull).isDefined
) {
  test(s"$arrayType") {
    testRandomDataGeneration(arrayType)
  }
}

val atomicTypesWithDataGenerators =
  DataTypeTestUtils.atomicTypes.filter(RandomDataGenerator.forType(_).isDefined)

// Complex types:
for (
  keyType <- atomicTypesWithDataGenerators;
  valueType <- atomicTypesWithDataGenerators
  // Scala's BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) and
  // Spark can hit NumberFormatException errors when converting certain BigDecimals (SPARK-8802).
  // For these reasons, we don't support generation of maps with decimal keys.
  if !keyType.isInstanceOf[DecimalType]
) {
  val mapType = MapType(keyType, valueType)
  test(s"$mapType") {
    testRandomDataGeneration(mapType)
  }
}

for (
  colOneType <- atomicTypesWithDataGenerators;
  colTwoType <- atomicTypesWithDataGenerators
) {
  val structType = StructType(StructField("a", colOneType) :: StructField("b", colTwoType) :: Nil)
  test(s"$structType") {
    testRandomDataGeneration(structType)
  }
}
Example 13
Source File: GenerateUnsafeRowJoinerSuite.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

class GenerateUnsafeRowJoinerSuite extends SparkFunSuite {

  private val fixed = Seq(IntegerType)
  private val variable = Seq(IntegerType, StringType)

  // Simple fixed-width types
  test("simple fixed width types") {
    testConcat(0, 0, fixed)
    testConcat(0, 1, fixed)
    testConcat(1, 0, fixed)
    testConcat(64, 0, fixed)
    testConcat(0, 64, fixed)
    testConcat(64, 64, fixed)
  }

  // Randomized fixed-width types
  test("randomized fix width types") {
    for (i <- 0 until 20) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed)
    }
  }

  // Simple variable-width types
  test("simple variable width types") {
    testConcat(0, 0, variable)
    testConcat(0, 1, variable)
    testConcat(1, 0, variable)
    testConcat(64, 0, variable)
    testConcat(0, 64, variable)
    testConcat(64, 64, variable)
  }

  // Randomized variable-width types
  test("randomized variable width types") {
    for (i <- 0 until 10) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable)
    }
  }

  private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = {
    for (i <- 0 until 10) {
      testConcatOnce(numFields1, numFields2, candidateTypes)
    }
  }

  private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) {
    info(s"schema size $numFields1, $numFields2")
    val schema1 = RandomDataGenerator.randomSchema(numFields1, candidateTypes)
    val schema2 = RandomDataGenerator.randomSchema(numFields2, candidateTypes)

    // Create the converters needed to convert from external row to internal row and to UnsafeRows.
    val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1)
    val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2)
    val converter1 = UnsafeProjection.create(schema1)
    val converter2 = UnsafeProjection.create(schema2)

    // Create the input rows, convert them into UnsafeRows.
    val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply()
    val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply()
    val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow])
    val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow])

    // Run the joiner.
    val mergedSchema = StructType(schema1 ++ schema2)
    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
    val output = concater.join(row1, row2)

    // Test everything equals ...
    for (i <- mergedSchema.indices) {
      if (i < schema1.size) {
        assert(output.isNullAt(i) === row1.isNullAt(i))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType))
        }
      } else {
        assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) ===
            row2.get(i - schema1.size, mergedSchema(i).dataType))
        }
      }
    }
  }
}
Example 14
Source File: LocalRelation.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis
import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal}
import org.apache.spark.sql.types.{StructField, StructType}

object LocalRelation {
  def apply(output: Attribute*): LocalRelation = new LocalRelation(output)

  def apply(output1: StructField, output: StructField*): LocalRelation = {
    new LocalRelation(StructType(output1 +: output).toAttributes)
  }

  def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }

  def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = {
    val schema = StructType.fromAttributes(output)
    val converter = CatalystTypeConverters.createToCatalystConverter(schema)
    LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow]))
  }
}

case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil)
  extends LeafNode with analysis.MultiInstanceRelation {

  // A local relation must have resolved output.
  require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.")

  override final def newInstance(): this.type = {
    LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type]
  }

  override protected def stringArgs: Iterator[Any] = {
    if (data.isEmpty) {
      Iterator("<empty>", output)
    } else {
      Iterator(output)
    }
  }

  override def sameResult(plan: LogicalPlan): Boolean = {
    plan.canonicalized match {
      case LocalRelation(otherOutput, otherData) =>
        otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data
      case _ => false
    }
  }

  override lazy val statistics =
    Statistics(sizeInBytes =
      (output.map(n => BigInt(n.dataType.defaultSize))).sum * data.length)

  def toSQL(inlineTableName: String): String = {
    require(data.nonEmpty)
    val types = output.map(_.dataType)
    val rows = data.map { row =>
      val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql }
      cells.mkString("(", ", ", ")")
    }
    "VALUES " + rows.mkString(", ") +
      " AS " + inlineTableName +
      output.map(_.name).mkString("(", ", ", ")")
  }
}
Example 15
Source File: SageMakerProtobufWriter.scala From sagemaker-spark with Apache License 2.0 | 5 votes |
package com.amazonaws.services.sagemaker.sparksdk.protobuf

import java.io.ByteArrayOutputStream

import org.apache.hadoop.fs.Path
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.hadoop.mapreduce.{RecordWriter, TaskAttemptContext}
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.execution.datasources.OutputWriter
import org.apache.spark.sql.types.StructType

def write(row: Row): Unit = {
  val labelColumnName = options.getOrElse("labelColumnName", "label")
  val featuresColumnName = options.getOrElse("featuresColumnName", "features")

  val record = ProtobufConverter.rowToProtobuf(row, featuresColumnName, Some(labelColumnName))
  record.writeTo(byteArrayOutputStream)

  recordWriter.write(NullWritable.get(), new BytesWritable(byteArrayOutputStream.toByteArray))
  byteArrayOutputStream.reset()
}

override def close(): Unit = {
  recordWriter.close(context)
}
Example 16
Source File: GenerateUnsafeRowJoinerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen

import scala.util.Random

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.RandomDataGenerator
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.expressions.UnsafeProjection
import org.apache.spark.sql.types._

class GenerateUnsafeRowJoinerSuite extends SparkFunSuite {

  private val fixed = Seq(IntegerType)
  private val variable = Seq(IntegerType, StringType)

  test("simple fixed width types") {
    testConcat(0, 0, fixed)
    testConcat(0, 1, fixed)
    testConcat(1, 0, fixed)
    testConcat(64, 0, fixed)
    testConcat(0, 64, fixed)
    testConcat(64, 64, fixed)
  }

  test("randomized fix width types") {
    for (i <- 0 until 20) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed)
    }
  }

  test("simple variable width types") {
    testConcat(0, 0, variable)
    testConcat(0, 1, variable)
    testConcat(1, 0, variable)
    testConcat(64, 0, variable)
    testConcat(0, 64, variable)
    testConcat(64, 64, variable)
  }

  test("randomized variable width types") {
    for (i <- 0 until 10) {
      testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable)
    }
  }

  private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = {
    for (i <- 0 until 10) {
      testConcatOnce(numFields1, numFields2, candidateTypes)
    }
  }

  private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) {
    info(s"schema size $numFields1, $numFields2")
    val schema1 = RandomDataGenerator.randomSchema(numFields1, candidateTypes)
    val schema2 = RandomDataGenerator.randomSchema(numFields2, candidateTypes)

    // Create the converters needed to convert from external row to internal row and to UnsafeRows.
    val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1)
    val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2)
    val converter1 = UnsafeProjection.create(schema1)
    val converter2 = UnsafeProjection.create(schema2)

    // Create the input rows, convert them into UnsafeRows.
    val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply()
    val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply()
    val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow])
    val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow])

    // Run the joiner.
    val mergedSchema = StructType(schema1 ++ schema2)
    val concater = GenerateUnsafeRowJoiner.create(schema1, schema2)
    val output = concater.join(row1, row2)

    // Test everything equals ...
    for (i <- mergedSchema.indices) {
      if (i < schema1.size) {
        assert(output.isNullAt(i) === row1.isNullAt(i))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType))
        }
      } else {
        assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size))
        if (!output.isNullAt(i)) {
          assert(output.get(i, mergedSchema(i).dataType) ===
            row2.get(i - schema1.size, mergedSchema(i).dataType))
        }
      }
    }
  }
}
Example 17
Source File: ExistingRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.{InternalRow, CatalystTypeConverters}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.{Attribute, GenericMutableRow}
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.sources.{HadoopFsRelation, BaseRelation}
import org.apache.spark.sql.types.DataType
import org.apache.spark.sql.{Row, SQLContext}

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericMutableRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }
        mutableRow
      }
    }
  }
}

//private[sql]
case class PhysicalRDD(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String,
    override val metadata: Map[String, String] = Map.empty,
    override val outputsUnsafeRows: Boolean = false)
  extends LeafNode {

  protected override def doExecute(): RDD[InternalRow] = rdd

  override def simpleString: String = {
    val metadataEntries = for ((key, value) <- metadata.toSeq.sorted) yield s"$key: $value"
    s"Scan $nodeName${output.mkString("[", ",", "]")}${metadataEntries.mkString(" ", ", ", "")}"
  }
}

private[sql] object PhysicalRDD {
  // Metadata keys
  val INPUT_PATHS = "InputPaths"
  val PUSHED_FILTERS = "PushedFilters"

  def createFromDataSource(
      output: Seq[Attribute],
      rdd: RDD[InternalRow],
      relation: BaseRelation,
      metadata: Map[String, String] = Map.empty): PhysicalRDD = {
    // All HadoopFsRelations output UnsafeRows
    val outputUnsafeRows = relation.isInstanceOf[HadoopFsRelation]
    PhysicalRDD(output, rdd, relation.toString, metadata, outputUnsafeRows)
  }
}
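A hypothetical caller of productToRowRdd, assuming the calling code can see the RDDConversions object (it lives in org.apache.spark.sql.execution); the case class and the output types below are invented for illustration, and each field is pushed through the per-column converter created by createToCatalystConverter:

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.execution.RDDConversions
import org.apache.spark.sql.types.{IntegerType, StringType}

// The output types must line up, in order, with the case class fields.
case class Person(name: String, age: Int)

def toInternalRows(people: RDD[Person]): RDD[InternalRow] =
  RDDConversions.productToRowRdd(people, Seq(StringType, IntegerType))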
Example 18
Source File: NullableColumnBuilderSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, GenericMutableRow}
import org.apache.spark.sql.types._

class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType])
  extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType)
  with NullableColumnBuilder

object TestNullableColumnBuilder {
  def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0)
    : TestNullableColumnBuilder[JvmType] = {
    val builder = new TestNullableColumnBuilder(columnType)
    builder.initialize(initialSize)
    builder
  }
}

class NullableColumnBuilderSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnBuilder(_)
    }

  def testNullableColumnBuilder[JvmType](
      columnType: ColumnType[JvmType]): Unit = {

    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val dataType = columnType.dataType
    val proj = UnsafeProjection.create(Array[DataType](dataType))
    val converter = CatalystTypeConverters.createToScalaConverter(dataType)

    test(s"$typeName column builder: empty column") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
      assert(!buffer.hasRemaining)
    }

    test(s"$typeName column builder: buffer size auto growth") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(0, "Wrong null count")(buffer.getInt())
    }

    test(s"$typeName column builder: null values") {
      val columnBuilder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val nullRow = makeNullRow(1)

      (0 until 4).foreach { _ =>
        columnBuilder.appendFrom(proj(randomRow), 0)
        columnBuilder.appendFrom(proj(nullRow), 0)
      }

      val buffer = columnBuilder.build()

      assertResult(4, "Wrong null count")(buffer.getInt())

      // For null positions
      (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt()))

      // For non-null values
      val actual = new GenericMutableRow(new Array[Any](1))
      (0 until 4).foreach { _ =>
        columnType.extract(buffer, actual, 0)
        assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)),
          "Extracted value didn't equal to the original one")
      }

      assert(!buffer.hasRemaining)
    }
  }
}
Example 19
Source File: NullableColumnAccessorSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar

import java.nio.ByteBuffer

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{UnsafeProjection, GenericMutableRow}
import org.apache.spark.sql.types._

class TestNullableColumnAccessor[JvmType](
    buffer: ByteBuffer,
    columnType: ColumnType[JvmType])
  extends BasicColumnAccessor(buffer, columnType)
  with NullableColumnAccessor

object TestNullableColumnAccessor {
  def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType])
    : TestNullableColumnAccessor[JvmType] = {
    new TestNullableColumnAccessor(buffer, columnType)
  }
}

class NullableColumnAccessorSuite extends SparkFunSuite {
  import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._

  Seq(
    NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE,
    STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10),
    STRUCT(StructType(StructField("a", StringType) :: Nil)),
    ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType)))
    .foreach {
      testNullableColumnAccessor(_)
    }

  def testNullableColumnAccessor[JvmType](
      columnType: ColumnType[JvmType]): Unit = {

    val typeName = columnType.getClass.getSimpleName.stripSuffix("$")
    val nullRow = makeNullRow(1)

    test(s"Nullable $typeName column accessor: empty column") {
      val builder = TestNullableColumnBuilder(columnType)
      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      assert(!accessor.hasNext)
    }

    test(s"Nullable $typeName column accessor: access null values") {
      val builder = TestNullableColumnBuilder(columnType)
      val randomRow = makeRandomRow(columnType)
      val proj = UnsafeProjection.create(Array[DataType](columnType.dataType))

      (0 until 4).foreach { _ =>
        builder.appendFrom(proj(randomRow), 0)
        builder.appendFrom(proj(nullRow), 0)
      }

      val accessor = TestNullableColumnAccessor(builder.build(), columnType)
      val row = new GenericMutableRow(1)
      val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType)

      (0 until 4).foreach { _ =>
        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(converter(row.get(0, columnType.dataType))
          === converter(randomRow.get(0, columnType.dataType)))

        assert(accessor.hasNext)
        accessor.extractTo(row, 0)
        assert(row.isNullAt(0))
      }

      assert(!accessor.hasNext)
    }
  }
}
Example 20
Source File: ObjectMapper.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark.sql

import java.beans.Introspector

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.{AttributeReference, GenericRowWithSchema}
import org.apache.spark.sql.types.StructType

object ObjectMapper {

  def forBean(schema: StructType, beanClass: Class[_]): (AnyRef, Array[String]) => Row = {
    val beanInfo = Introspector.getBeanInfo(beanClass)
    val attrs = schema.fields.map(f => AttributeReference(f.name, f.dataType, f.nullable)())
    val extractors = beanInfo.getPropertyDescriptors.filterNot(_.getName == "class").map(_.getReadMethod)
    val methodsToConverts = extractors.zip(attrs).map { case (e, attr) =>
      (e, CatalystTypeConverters.createToCatalystConverter(attr.dataType))
    }
    (from: Any, columns: Array[String]) => {
      if (columns.nonEmpty) {
        from match {
          case _: Array[_] => new GenericRowWithSchema(from.asInstanceOf[Array[Any]], schema)
          case f: Any =>
            val rowSchema = StructType(Array(schema(columns.head)))
            new GenericRowWithSchema(Array(f), rowSchema)
        }
      } else {
        new GenericRowWithSchema(methodsToConverts.map { case (e, convert) =>
          val invoke: AnyRef = e.invoke(from)
          convert(invoke)
        }, schema)
      }
    }
  }
}
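A hypothetical use of forBean with a single-property bean (the Label class and its schema are invented for illustration); with no projected columns, each bean getter's result is passed through the Catalyst converter created for its field's data type:

import org.apache.spark.sql.types.{StringType, StructField, StructType}
import org.infinispan.spark.sql.ObjectMapper

// Hypothetical Java-bean style class; Introspector discovers the "text" property via getText.
class Label(text: String) {
  def getText: String = text
}

val schema = StructType(Seq(StructField("text", StringType)))
val mapper = ObjectMapper.forBean(schema, classOf[Label])

// Empty column projection: every bean property is read and converted for its field's type.
val row = mapper(new Label("hello"), Array.empty[String])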
Example 27
Source File: RandomDataGeneratorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.types._ def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = { val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType) val generator = RandomDataGenerator.forType(dataType, nullable, new Random(33)).getOrElse { fail(s"Random data generator was not defined for $dataType") } if (nullable) { assert(Iterator.fill(100)(generator()).contains(null)) } else { assert(!Iterator.fill(100)(generator()).contains(null)) } for (_ <- 1 to 10) { val generatedValue = generator() toCatalyst(generatedValue) } } // Basic types: for ( dataType <- DataTypeTestUtils.atomicTypes; nullable <- Seq(true, false) if !dataType.isInstanceOf[DecimalType]) { test(s"$dataType (nullable=$nullable)") { testRandomDataGeneration(dataType) } } for ( arrayType <- DataTypeTestUtils.atomicArrayTypes if RandomDataGenerator.forType(arrayType.elementType, arrayType.containsNull).isDefined ) { test(s"$arrayType") { testRandomDataGeneration(arrayType) } } val atomicTypesWithDataGenerators = DataTypeTestUtils.atomicTypes.filter(RandomDataGenerator.forType(_).isDefined) // Complex types: for ( keyType <- atomicTypesWithDataGenerators; valueType <- atomicTypesWithDataGenerators // Scala's BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) and // Spark can hit NumberFormatException errors when converting certain BigDecimals (SPARK-8802). // For these reasons, we don't support generation of maps with decimal keys. if !keyType.isInstanceOf[DecimalType] ) { val mapType = MapType(keyType, valueType) test(s"$mapType") { testRandomDataGeneration(mapType) } } for ( colOneType <- atomicTypesWithDataGenerators; colTwoType <- atomicTypesWithDataGenerators ) { val structType = StructType(StructField("a", colOneType) :: StructField("b", colTwoType) :: Nil) test(s"$structType") { testRandomDataGeneration(structType) } } test("check size of generated map") { val mapType = MapType(IntegerType, IntegerType) for (seed <- 1 to 1000) { val generator = RandomDataGenerator.forType( mapType, nullable = false, rand = new Random(seed)).get val maps = Seq.fill(100)(generator().asInstanceOf[Map[Int, Int]]) val expectedTotalElements = 100 / 2 * RandomDataGenerator.MAX_MAP_SIZE val deviation = math.abs(maps.map(_.size).sum - expectedTotalElements) assert(deviation.toDouble / expectedTotalElements < 2e-1) } } }
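The suite above only checks that the converters accept generated data; a small standalone sketch (values invented) makes the conversions concrete: strings become UTF8String, and an external Row converted with a StructType converter comes back as an InternalRow that the matching to-Scala converter turns into a Row again.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.types._

val toCatalystString = CatalystTypeConverters.createToCatalystConverter(StringType)
toCatalystString("hello")                              // => UTF8String("hello")

val schema = StructType(Seq(StructField("a", IntegerType), StructField("b", StringType)))
val toCatalystRow = CatalystTypeConverters.createToCatalystConverter(schema)
val internal = toCatalystRow(Row(1, "x")).asInstanceOf[InternalRow]

val toScalaRow = CatalystTypeConverters.createToScalaConverter(schema)
toScalaRow(internal)                                   // => Row(1, "x") again (external types)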
Example 28
Source File: GenerateUnsafeRowJoinerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val random = new Random() val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
Example 29
Source File: ExistingRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Encoder, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.execution.datasources._ import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericInternalRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } case class RDDScanExec( output: Seq[Attribute], rdd: RDD[InternalRow], override val nodeName: String) extends LeafExecNode { override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.mapPartitionsInternal { iter => val proj = UnsafeProjection.create(schema) iter.map { r => numOutputRows += 1 proj(r) } } } override def simpleString: String = { s"Scan $nodeName${Utils.truncatedString(output, "[", ",", "]")}" } }
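A hedged usage sketch for RDDConversions.productToRowRdd (assumes a SparkContext named sc is in scope; the case class and values are invented): each case-class field is converted with the Catalyst converter created for its declared output type.

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.{DataType, IntegerType, StringType}

case class Click(user: String, count: Int)

val clicks: RDD[Click] = sc.parallelize(Seq(Click("u1", 3), Click("u2", 5)))
val internalRows: RDD[InternalRow] =
  RDDConversions.productToRowRdd(clicks, Seq[DataType](StringType, IntegerType))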
Example 30
Source File: commands.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 31
Source File: NullableColumnBuilderSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 32
Source File: NullableColumnAccessorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 33
Source File: CatalystTypeConvertersWrapper.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.DataType

object CatalystTypeConvertersWrapper {

  def toCatalystRowConverter(dataType: DataType): Row => InternalRow = {
    CatalystTypeConverters.createToCatalystConverter(dataType)(_).asInstanceOf[InternalRow]
  }

  def toScalaRowConverter(dataType: DataType): InternalRow => GenericRowWithSchema = {
    CatalystTypeConverters.createToScalaConverter(dataType)(_).asInstanceOf[GenericRowWithSchema]
  }

  def toCatalystConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToCatalystConverter(dataType)

  def toScalaConverter(dataType: DataType): Any => Any =
    CatalystTypeConverters.createToScalaConverter(dataType)
}
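A short sketch of the wrapper in use (schema and values invented): the row converters round-trip between external Rows and Catalyst InternalRows for a given struct type.

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{DoubleType, StringType, StructField, StructType}

val schema = StructType(Seq(StructField("symbol", StringType), StructField("price", DoubleType)))

val toInternal = CatalystTypeConvertersWrapper.toCatalystRowConverter(schema)
val toExternal = CatalystTypeConvertersWrapper.toScalaRowConverter(schema)

val internal = toInternal(Row("ABC", 101.5))   // Row -> InternalRow
val external = toExternal(internal)            // InternalRow -> GenericRowWithSchema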
Example 34
Source File: TiHandleRDD.scala From tispark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.tispark import com.pingcap.tikv.meta.TiDAGRequest import com.pingcap.tikv.util.RangeSplitter import com.pingcap.tikv.{TiConfiguration, TiSession} import com.pingcap.tispark.utils.TiUtil import com.pingcap.tispark.{TiPartition, TiTableReference} import gnu.trove.list.array.TLongArrayList import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.{Partition, TaskContext, TaskKilledException} import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ class TiHandleRDD( override val dagRequest: TiDAGRequest, override val physicalId: Long, val output: Seq[Attribute], override val tiConf: TiConfiguration, override val tableRef: TiTableReference, @transient private val session: TiSession, @transient private val sparkSession: SparkSession) extends TiRDD(dagRequest, physicalId, tiConf, tableRef, session, sparkSession) { private val outputTypes = output.map(_.dataType) private val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = new Iterator[InternalRow] { checkTimezone() private val tiPartition = split.asInstanceOf[TiPartition] private val session = TiSession.getInstance(tiConf) private val snapshot = session.createSnapshot(dagRequest.getStartTs) private[this] val tasks = tiPartition.tasks private val handleIterator = snapshot.indexHandleRead(dagRequest, tasks) private val regionManager = session.getRegionManager private lazy val handleList = { val lst = new TLongArrayList() handleIterator.asScala.foreach { // Kill the task in case it has been marked as killed. This logic is from // InterruptedIterator, but we inline it here instead of wrapping the iterator in order // to avoid performance overhead. if (context.isInterrupted()) { throw new TaskKilledException } lst.add(_) } lst } // Fetch all handles and group by region id private val regionHandleMap = RangeSplitter .newSplitter(regionManager) .groupByAndSortHandlesByRegionId(physicalId, handleList) .map(x => (x._1.first.getId, x._2)) private val iterator = regionHandleMap.iterator override def hasNext: Boolean = { // Kill the task in case it has been marked as killed. if (context.isInterrupted()) { throw new TaskKilledException } iterator.hasNext } override def next(): InternalRow = { val next = iterator.next val regionId = next._1 val handleList = next._2 // Returns RegionId:[handle1, handle2, handle3...] K-V pair val sparkRow = Row.apply(regionId, handleList.toArray()) TiUtil.rowToInternalRow(sparkRow, outputTypes, converters) } } }
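TiUtil.rowToInternalRow belongs to TiSpark and is not shown in this example. Assuming it applies the per-column converters created above, a minimal equivalent could look like the following sketch (not TiSpark's actual implementation):

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.types.DataType

def rowToInternalRowSketch(
    row: Row,
    outputTypes: Seq[DataType],
    converters: Seq[Any => Any]): InternalRow = {
  // Convert each external column value with the Catalyst converter built for its data type.
  val values = new Array[Any](outputTypes.length)
  var i = 0
  while (i < outputTypes.length) {
    values(i) = converters(i)(row(i))
    i += 1
  }
  InternalRow.fromSeq(values)
}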
Example 35
Source File: XmlDataToCatalyst.scala From spark-xml with Apache License 2.0 | 5 votes |
package com.databricks.spark.xml

import org.apache.spark.sql.catalyst.CatalystTypeConverters
import org.apache.spark.sql.catalyst.expressions.codegen.CodegenFallback
import org.apache.spark.sql.catalyst.expressions.{ExpectsInputTypes, Expression, UnaryExpression}
import org.apache.spark.sql.catalyst.util.GenericArrayData
import org.apache.spark.sql.types._
import org.apache.spark.unsafe.types.UTF8String

import com.databricks.spark.xml.parsers.StaxXmlParser

case class XmlDataToCatalyst(
    child: Expression,
    schema: DataType,
    options: XmlOptions)
  extends UnaryExpression with CodegenFallback with ExpectsInputTypes {

  override lazy val dataType: DataType = schema

  @transient lazy val rowSchema: StructType = schema match {
    case st: StructType => st
    case ArrayType(st: StructType, _) => st
  }

  override def nullSafeEval(xml: Any): Any = xml match {
    case string: UTF8String =>
      CatalystTypeConverters.convertToCatalyst(
        StaxXmlParser.parseColumn(string.toString, rowSchema, options))
    case string: String =>
      StaxXmlParser.parseColumn(string, rowSchema, options)
    case arr: GenericArrayData =>
      CatalystTypeConverters.convertToCatalyst(
        arr.array.map(s => StaxXmlParser.parseColumn(s.toString, rowSchema, options)))
    case arr: Array[_] =>
      arr.map(s => StaxXmlParser.parseColumn(s.toString, rowSchema, options))
    case _ => null
  }

  override def inputTypes: Seq[DataType] = schema match {
    case _: StructType => Seq(StringType)
    case ArrayType(_: StructType, _) => Seq(ArrayType(StringType))
  }
}
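The convertToCatalyst calls above are what bridge the parser's external output to the internal values a Catalyst expression must return. A standalone sketch (values invented) of what that conversion does:

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.CatalystTypeConverters

CatalystTypeConverters.convertToCatalyst("a")            // String        -> UTF8String
CatalystTypeConverters.convertToCatalyst(Row("a", 1))    // external Row  -> InternalRow
CatalystTypeConverters.convertToCatalyst(Seq(1, 2, 3))   // Scala Seq     -> Catalyst ArrayData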
Example 36
Source File: LocalRelation.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.catalyst.plans.logical.statsEstimation.EstimationUtils import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data, isStreaming).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def computeStats(): Statistics = Statistics(sizeInBytes = EstimationUtils.getSizePerRow(output) * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
Example 37
Source File: ExistingRDD.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Encoder, Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning} import org.apache.spark.sql.execution.metric.SQLMetrics import org.apache.spark.sql.types.DataType import org.apache.spark.util.Utils object RDDConversions { def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = { data.mapPartitions { iterator => val numColumns = outputTypes.length val mutableRow = new GenericInternalRow(numColumns) val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter) iterator.map { r => var i = 0 while (i < numColumns) { mutableRow(i) = converters(i)(r.productElement(i)) i += 1 } mutableRow } } } case class RDDScanExec( output: Seq[Attribute], rdd: RDD[InternalRow], name: String, override val outputPartitioning: Partitioning = UnknownPartitioning(0), override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode { private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("") override val nodeName: String = s"Scan $name$rddName" override lazy val metrics = Map( "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows")) protected override def doExecute(): RDD[InternalRow] = { val numOutputRows = longMetric("numOutputRows") rdd.mapPartitionsWithIndexInternal { (index, iter) => val proj = UnsafeProjection.create(schema) proj.initialize(index) iter.map { r => numOutputRows += 1 proj(r) } } } override def simpleString: String = { s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}" } }
Example 38
Source File: LocalRelation.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) extends LeafNode with analysis.MultiInstanceRelation { // A local relation must have resolved output. require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.") override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def sameResult(plan: LogicalPlan): Boolean = { plan.canonicalized match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } } override lazy val statistics = Statistics(sizeInBytes = output.map(_.dataType.defaultSize).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
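A minimal sketch (attribute names and values invented) of building a LocalRelation from external Rows: fromExternalRows creates one Catalyst converter for the whole schema and maps each Row to an InternalRow with it.

import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val schema = StructType(Seq(StructField("id", IntegerType), StructField("name", StringType)))
val attributes = schema.toAttributes   // Seq[AttributeReference]

val relation = LocalRelation.fromExternalRows(
  attributes,
  Seq(Row(1, "a"), Row(2, "b")))       // each Row becomes an InternalRow via the converter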
Example 39
Source File: NullableColumnAccessorSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 40
Source File: HierarchyRowFunctions.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hierarchy import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import org.apache.spark.sql.types.Node private[hierarchy] case class HierarchyRowFunctions(inputTypes: Seq[DataType]) { private[hierarchy] def rowGet[K](i: Int): Row => K = (row: Row) => row.getAs[K](i) private[hierarchy] def rowInit[K](pk: Row => K, pathDataType: DataType): (Row, Option[Long]) => Row = { (row, myOrdKey) => myOrdKey match { case Some(x) => Row(row.toSeq ++ Seq(Node(List(pk(row)), pathDataType, ordPath = List(x))): _*) case None => Row(row.toSeq ++ Seq(Node(List(pk(row)), pathDataType)): _*) } } private[hierarchy] def rowModifyAndOrder[K](pk: Row => K, pathDataType: DataType): (Row, Row, Option[Long]) => Row = { (left, right, myord) => { val pathComponent: K = pk(right) // TODO(weidner): is myNode a ref/ptr or a copy of node?: val myNode: Node = left.getAs[Node](left.length - 1) val path: Seq[Any] = myNode.path ++ List(pathComponent) var node: Node = null // Node(path, ordPath = myOrdPath) myord match { case Some(ord) => val parentOrdPath = myNode.ordPath match { case x: Seq[Long] => x case _ => List() } node = Node(path, pathDataType, ordPath = parentOrdPath ++ List(ord)) case None => node = Node(path, pathDataType) } Row(right.toSeq :+ node: _*) } } private[hierarchy] def rowModify[K](pk: Row => K, pathDataType: DataType): (Row, Row) => Row = { (left, right) => val pathComponent: K = pk(right) val path: Seq[Any] = left.getAs[Node](left.length - 1).path ++ List(pathComponent) val node: Node = Node(path, pathDataType) Row(right.toSeq :+ node: _*) } private[hierarchy] def rowAppend[K](row: Row, node: Node): Row = { Row(row.toSeq :+ node: _*) } private[hierarchy] def rowStartWhere[K](exp: Expression): Row => Boolean = { row => val numColumns = inputTypes.length val converters = inputTypes.map(CatalystTypeConverters.createToCatalystConverter) val values = Stream.from(0).takeWhile(_ < numColumns).map({ i => converters(i)(row(i)) }) val newRow = InternalRow.fromSeq(values) exp.eval(newRow).asInstanceOf[Boolean] } private[hierarchy] def bindExpression(exp: Expression, attributes: Seq[Attribute]) : Expression = exp.transform { case a: AttributeReference => val index = attributes.indexWhere(_.name == a.name) BoundReference(index, a.dataType, a.nullable) } }
Example 41
Source File: ExpressionEvalHelper.scala From HANAVora-Extensions with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions // // Partially backported from Spark 1.5.2. // import org.apache.spark.sql.extension.OptimizerFactoryForTests import org.apache.spark.sql.catalyst.plans.logical.{OneRowRelation, Project} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.scalactic.TripleEqualsSupport.Spread import org.scalatest.FunSuite import org.scalatest.prop.GeneratorDrivenPropertyChecks // scalastyle:off case _ => } expression.eval(inputRow) } protected def generateProject( generator: => Projection, expression: Expression): Projection = { try { generator } catch { case e: Throwable => fail( s""" |Code generation of $expression failed: |$e |${e.getStackTraceString} """.stripMargin) } } protected def checkEvaluationWithoutCodegen( expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { val actual = try evaluate(expression, inputRow) catch { case e: Exception => fail(s"Exception evaluating $expression", e) } if (!checkResult(actual, expected)) { val input = if (inputRow == EmptyRow) "" else s", input: $inputRow" fail(s"Incorrect evaluation (codegen off): $expression, " + s"actual: $actual, " + s"expected: $expected$input") } } protected def checkEvaluationWithOptimization( expression: Expression, expected: Any, inputRow: InternalRow = EmptyRow): Unit = { val plan = Project(Alias(expression, s"Optimized($expression)")() :: Nil, OneRowRelation) val optimizedPlan = OptimizerFactoryForTests.default().execute(plan) checkEvaluationWithoutCodegen(optimizedPlan.expressions.head, expected, inputRow) } }
Example 42
Source File: ExcelOutputWriter.scala From spark-hadoopoffice-ds with Apache License 2.0 | 5 votes |
package org.zuinnote.spark.office.excel import java.math.BigDecimal import java.sql.Date import java.sql.Timestamp import java.text.DateFormat import java.text.SimpleDateFormat import java.util.Calendar import org.apache.hadoop.conf.Configuration import org.apache.hadoop.io.NullWritable import org.apache.hadoop.io.ArrayWritable import org.apache.hadoop.mapreduce.RecordWriter import org.apache.hadoop.mapreduce.TaskAttemptContext import org.apache.hadoop.fs.Path import org.apache.spark.sql.catalyst.{ CatalystTypeConverters, InternalRow } import org.apache.spark.sql.Row import org.apache.spark.sql.execution.datasources.OutputWriter import org.apache.spark.sql.types._ import org.zuinnote.hadoop.office.format.common.dao.SpreadSheetCellDAO import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import org.zuinnote.hadoop.office.format.common.util.msexcel.MSExcelUtil import org.zuinnote.hadoop.office.format.mapreduce._ import org.apache.commons.logging.LogFactory import org.apache.commons.logging.Log import org.zuinnote.hadoop.office.format.common.HadoopOfficeWriteConfiguration import java.util.Locale import java.text.DecimalFormat import org.zuinnote.hadoop.office.format.common.converter.ExcelConverterSimpleSpreadSheetCellDAO import java.text.NumberFormat // NOTE: This class is instantiated and used on executor side only, no need to be serializable. private[excel] class ExcelOutputWriter( path: String, dataSchema: StructType, context: TaskAttemptContext, options: Map[String, String]) extends OutputWriter { def write(row: Row): Unit = { // check useHeader if (useHeader) { val headers = row.schema.fieldNames var i = 0 for (x <- headers) { val headerColumnSCD = new SpreadSheetCellDAO(x, "", "", MSExcelUtil.getCellAddressA1Format(currentRowNum, i), defaultSheetName) recordWriter.write(NullWritable.get(), headerColumnSCD) i += 1 } currentRowNum += 1 useHeader = false } // for each value in the row if (row.size>0) { var currentColumnNum = 0; val simpleObject = new Array[AnyRef](row.size) for (i <- 0 to row.size - 1) { // for each element of the row val obj = row.get(i) if ((obj.isInstanceOf[Seq[String]]) && (obj.asInstanceOf[Seq[String]].length==5)) { val formattedValue = obj.asInstanceOf[Seq[String]](0) val comment = obj.asInstanceOf[Seq[String]](1) val formula = obj.asInstanceOf[Seq[String]](2) val address = obj.asInstanceOf[Seq[String]](3) val sheetName = obj.asInstanceOf[Seq[String]](4) simpleObject(i) = new SpreadSheetCellDAO(formattedValue,comment,formula,address,sheetName) } else { simpleObject(i)=obj.asInstanceOf[AnyRef] } } // convert row to spreadsheetcellDAO val spreadSheetCellDAORow = simpleConverter.getSpreadSheetCellDAOfromSimpleDataType(simpleObject, defaultSheetName, currentRowNum) // write it for (x<- spreadSheetCellDAORow) { recordWriter.write(NullWritable.get(), x) } } currentRowNum += 1 } override def close(): Unit = { recordWriter.close(context) currentRowNum = 0; } }
Example 43
Source File: LocalRelation.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.analysis import org.apache.spark.sql.catalyst.expressions.{Attribute, Literal} import org.apache.spark.sql.types.{StructField, StructType} object LocalRelation { def apply(output: Attribute*): LocalRelation = new LocalRelation(output) def apply(output1: StructField, output: StructField*): LocalRelation = { new LocalRelation(StructType(output1 +: output).toAttributes) } def fromExternalRows(output: Seq[Attribute], data: Seq[Row]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } def fromProduct(output: Seq[Attribute], data: Seq[Product]): LocalRelation = { val schema = StructType.fromAttributes(output) val converter = CatalystTypeConverters.createToCatalystConverter(schema) LocalRelation(output, data.map(converter(_).asInstanceOf[InternalRow])) } } case class LocalRelation(output: Seq[Attribute], data: Seq[InternalRow] = Nil) extends LeafNode with analysis.MultiInstanceRelation { // A local relation must have resolved output. require(output.forall(_.resolved), "Unresolved attributes found when constructing LocalRelation.") override final def newInstance(): this.type = { LocalRelation(output.map(_.newInstance()), data).asInstanceOf[this.type] } override protected def stringArgs: Iterator[Any] = { if (data.isEmpty) { Iterator("<empty>", output) } else { Iterator(output) } } override def sameResult(plan: LogicalPlan): Boolean = { plan.canonicalized match { case LocalRelation(otherOutput, otherData) => otherOutput.map(_.dataType) == output.map(_.dataType) && otherData == data case _ => false } } override lazy val statistics = Statistics(sizeInBytes = (output.map(n => BigInt(n.dataType.defaultSize))).sum * data.length) def toSQL(inlineTableName: String): String = { require(data.nonEmpty) val types = output.map(_.dataType) val rows = data.map { row => val cells = row.toSeq(types).zip(types).map { case (v, tpe) => Literal(v, tpe).sql } cells.mkString("(", ", ", ")") } "VALUES " + rows.mkString(", ") + " AS " + inlineTableName + output.map(_.name).mkString("(", ", ", ")") } }
Example 44
Source File: RandomDataGeneratorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.types._ def testRandomDataGeneration(dataType: DataType, nullable: Boolean = true): Unit = { val toCatalyst = CatalystTypeConverters.createToCatalystConverter(dataType) val generator = RandomDataGenerator.forType(dataType, nullable, new Random(33)).getOrElse { fail(s"Random data generator was not defined for $dataType") } if (nullable) { assert(Iterator.fill(100)(generator()).contains(null)) } else { assert(!Iterator.fill(100)(generator()).contains(null)) } for (_ <- 1 to 10) { val generatedValue = generator() toCatalyst(generatedValue) } } // Basic types: for ( dataType <- DataTypeTestUtils.atomicTypes; nullable <- Seq(true, false) if !dataType.isInstanceOf[DecimalType]) { test(s"$dataType (nullable=$nullable)") { testRandomDataGeneration(dataType) } } for ( arrayType <- DataTypeTestUtils.atomicArrayTypes if RandomDataGenerator.forType(arrayType.elementType, arrayType.containsNull).isDefined ) { test(s"$arrayType") { testRandomDataGeneration(arrayType) } } val atomicTypesWithDataGenerators = DataTypeTestUtils.atomicTypes.filter(RandomDataGenerator.forType(_).isDefined) // Complex types: for ( keyType <- atomicTypesWithDataGenerators; valueType <- atomicTypesWithDataGenerators // Scala's BigDecimal.hashCode can lead to OutOfMemoryError on Scala 2.10 (see SI-6173) and // Spark can hit NumberFormatException errors when converting certain BigDecimals (SPARK-8802). // For these reasons, we don't support generation of maps with decimal keys. if !keyType.isInstanceOf[DecimalType] ) { val mapType = MapType(keyType, valueType) test(s"$mapType") { testRandomDataGeneration(mapType) } } for ( colOneType <- atomicTypesWithDataGenerators; colTwoType <- atomicTypesWithDataGenerators ) { val structType = StructType(StructField("a", colOneType) :: StructField("b", colTwoType) :: Nil) test(s"$structType") { testRandomDataGeneration(structType) } } test("check size of generated map") { val mapType = MapType(IntegerType, IntegerType) for (seed <- 1 to 1000) { val generator = RandomDataGenerator.forType( mapType, nullable = false, rand = new Random(seed)).get val maps = Seq.fill(100)(generator().asInstanceOf[Map[Int, Int]]) val expectedTotalElements = 100 / 2 * RandomDataGenerator.MAX_MAP_SIZE val deviation = math.abs(maps.map(_.size).sum - expectedTotalElements) assert(deviation.toDouble / expectedTotalElements < 2e-1) } } }
Example 45
Source File: GenerateUnsafeRowJoinerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions.codegen import scala.util.Random import org.apache.spark.SparkFunSuite import org.apache.spark.sql.RandomDataGenerator import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.expressions.UnsafeProjection import org.apache.spark.sql.types._ class GenerateUnsafeRowJoinerSuite extends SparkFunSuite { private val fixed = Seq(IntegerType) private val variable = Seq(IntegerType, StringType) test("simple fixed width types") { testConcat(0, 0, fixed) testConcat(0, 1, fixed) testConcat(1, 0, fixed) testConcat(64, 0, fixed) testConcat(0, 64, fixed) testConcat(64, 64, fixed) } test("randomized fix width types") { for (i <- 0 until 20) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), fixed) } } test("simple variable width types") { testConcat(0, 0, variable) testConcat(0, 1, variable) testConcat(1, 0, variable) testConcat(64, 0, variable) testConcat(0, 64, variable) testConcat(64, 64, variable) } test("randomized variable width types") { for (i <- 0 until 10) { testConcatOnce(Random.nextInt(100), Random.nextInt(100), variable) } } private def testConcat(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]): Unit = { for (i <- 0 until 10) { testConcatOnce(numFields1, numFields2, candidateTypes) } } private def testConcatOnce(numFields1: Int, numFields2: Int, candidateTypes: Seq[DataType]) { info(s"schema size $numFields1, $numFields2") val random = new Random() val schema1 = RandomDataGenerator.randomSchema(random, numFields1, candidateTypes) val schema2 = RandomDataGenerator.randomSchema(random, numFields2, candidateTypes) // Create the converters needed to convert from external row to internal row and to UnsafeRows. val internalConverter1 = CatalystTypeConverters.createToCatalystConverter(schema1) val internalConverter2 = CatalystTypeConverters.createToCatalystConverter(schema2) val converter1 = UnsafeProjection.create(schema1) val converter2 = UnsafeProjection.create(schema2) // Create the input rows, convert them into UnsafeRows. val extRow1 = RandomDataGenerator.forType(schema1, nullable = false).get.apply() val extRow2 = RandomDataGenerator.forType(schema2, nullable = false).get.apply() val row1 = converter1.apply(internalConverter1.apply(extRow1).asInstanceOf[InternalRow]) val row2 = converter2.apply(internalConverter2.apply(extRow2).asInstanceOf[InternalRow]) // Run the joiner. val mergedSchema = StructType(schema1 ++ schema2) val concater = GenerateUnsafeRowJoiner.create(schema1, schema2) val output = concater.join(row1, row2) // Test everything equals ... for (i <- mergedSchema.indices) { if (i < schema1.size) { assert(output.isNullAt(i) === row1.isNullAt(i)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row1.get(i, mergedSchema(i).dataType)) } } else { assert(output.isNullAt(i) === row2.isNullAt(i - schema1.size)) if (!output.isNullAt(i)) { assert(output.get(i, mergedSchema(i).dataType) === row2.get(i - schema1.size, mergedSchema(i).dataType)) } } } } }
Example 46
Source File: commands.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.command import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, SparkSession} import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.catalyst.errors.TreeNodeException import org.apache.spark.sql.catalyst.expressions.{Attribute, AttributeReference} import org.apache.spark.sql.catalyst.plans.QueryPlan import org.apache.spark.sql.catalyst.plans.logical import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import org.apache.spark.sql.execution.SparkPlan import org.apache.spark.sql.execution.debug._ import org.apache.spark.sql.execution.streaming.IncrementalExecution import org.apache.spark.sql.streaming.OutputMode import org.apache.spark.sql.types._ case class ExplainCommand( logicalPlan: LogicalPlan, override val output: Seq[Attribute] = Seq(AttributeReference("plan", StringType, nullable = true)()), extended: Boolean = false, codegen: Boolean = false) extends RunnableCommand { // Run through the optimizer to generate the physical plan. override def run(sparkSession: SparkSession): Seq[Row] = try { val queryExecution = if (logicalPlan.isStreaming) { // This is used only by explaining `Dataset/DataFrame` created by `spark.readStream`, so the // output mode does not matter since there is no `Sink`. new IncrementalExecution(sparkSession, logicalPlan, OutputMode.Append(), "<unknown>", 0, 0) } else { sparkSession.sessionState.executePlan(logicalPlan) } val outputString = if (codegen) { codegenString(queryExecution.executedPlan) } else if (extended) { queryExecution.toString } else { queryExecution.simpleString } Seq(Row(outputString)) } catch { case cause: TreeNodeException[_] => ("Error occurred during query planning: \n" + cause.getMessage).split("\n").map(Row(_)) } }
Example 47
Source File: NullableColumnBuilderSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnBuilder[JvmType](columnType: ColumnType[JvmType]) extends BasicColumnBuilder[JvmType](new NoopColumnStats, columnType) with NullableColumnBuilder object TestNullableColumnBuilder { def apply[JvmType](columnType: ColumnType[JvmType], initialSize: Int = 0) : TestNullableColumnBuilder[JvmType] = { val builder = new TestNullableColumnBuilder(columnType) builder.initialize(initialSize) builder } } class NullableColumnBuilderSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnBuilder(_) } def testNullableColumnBuilder[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val dataType = columnType.dataType val proj = UnsafeProjection.create(Array[DataType](dataType)) val converter = CatalystTypeConverters.createToScalaConverter(dataType) test(s"$typeName column builder: empty column") { val columnBuilder = TestNullableColumnBuilder(columnType) val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) assert(!buffer.hasRemaining) } test(s"$typeName column builder: buffer size auto growth") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) } val buffer = columnBuilder.build() assertResult(0, "Wrong null count")(buffer.getInt()) } test(s"$typeName column builder: null values") { val columnBuilder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val nullRow = makeNullRow(1) (0 until 4).foreach { _ => columnBuilder.appendFrom(proj(randomRow), 0) columnBuilder.appendFrom(proj(nullRow), 0) } val buffer = columnBuilder.build() assertResult(4, "Wrong null count")(buffer.getInt()) // For null positions (1 to 7 by 2).foreach(assertResult(_, "Wrong null position")(buffer.getInt())) // For non-null values val actual = new GenericInternalRow(new Array[Any](1)) (0 until 4).foreach { _ => columnType.extract(buffer, actual, 0) assert(converter(actual.get(0, dataType)) === converter(randomRow.get(0, dataType)), "Extracted value didn't equal to the original one") } assert(!buffer.hasRemaining) } } }
Example 48
Source File: NullableColumnAccessorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.columnar import java.nio.ByteBuffer import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.CatalystTypeConverters import org.apache.spark.sql.catalyst.expressions.{GenericInternalRow, UnsafeProjection} import org.apache.spark.sql.types._ class TestNullableColumnAccessor[JvmType]( buffer: ByteBuffer, columnType: ColumnType[JvmType]) extends BasicColumnAccessor(buffer, columnType) with NullableColumnAccessor object TestNullableColumnAccessor { def apply[JvmType](buffer: ByteBuffer, columnType: ColumnType[JvmType]) : TestNullableColumnAccessor[JvmType] = { new TestNullableColumnAccessor(buffer, columnType) } } class NullableColumnAccessorSuite extends SparkFunSuite { import org.apache.spark.sql.execution.columnar.ColumnarTestUtils._ Seq( NULL, BOOLEAN, BYTE, SHORT, INT, LONG, FLOAT, DOUBLE, STRING, BINARY, COMPACT_DECIMAL(15, 10), LARGE_DECIMAL(20, 10), STRUCT(StructType(StructField("a", StringType) :: Nil)), ARRAY(ArrayType(IntegerType)), MAP(MapType(IntegerType, StringType))) .foreach { testNullableColumnAccessor(_) } def testNullableColumnAccessor[JvmType]( columnType: ColumnType[JvmType]): Unit = { val typeName = columnType.getClass.getSimpleName.stripSuffix("$") val nullRow = makeNullRow(1) test(s"Nullable $typeName column accessor: empty column") { val builder = TestNullableColumnBuilder(columnType) val accessor = TestNullableColumnAccessor(builder.build(), columnType) assert(!accessor.hasNext) } test(s"Nullable $typeName column accessor: access null values") { val builder = TestNullableColumnBuilder(columnType) val randomRow = makeRandomRow(columnType) val proj = UnsafeProjection.create(Array[DataType](columnType.dataType)) (0 until 4).foreach { _ => builder.appendFrom(proj(randomRow), 0) builder.appendFrom(proj(nullRow), 0) } val accessor = TestNullableColumnAccessor(builder.build(), columnType) val row = new GenericInternalRow(1) val converter = CatalystTypeConverters.createToScalaConverter(columnType.dataType) (0 until 4).foreach { _ => assert(accessor.hasNext) accessor.extractTo(row, 0) assert(converter(row.get(0, columnType.dataType)) === converter(randomRow.get(0, columnType.dataType))) assert(accessor.hasNext) accessor.extractTo(row, 0) assert(row.isNullAt(0)) } assert(!accessor.hasNext) } } }
Example 49
Source File: SparkDateTime.scala From spark-datetime with Apache License 2.0 | 5 votes |
package org.apache.spark.sparklinedata.datetime import org.apache.spark.sql.catalyst.expressions.GenericMutableRow import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow} import org.apache.spark.sql.types._ import org.apache.spark.unsafe.types.UTF8String @SQLUserDefinedType(udt = classOf[SparkDateTimeUDT]) case class SparkDateTime(millis : Long, tzId : String) class SparkDateTimeUDT extends UserDefinedType[SparkDateTime] { override def sqlType: DataType = StructType(Seq(StructField("millis", LongType), StructField("tz", StringType))) override def serialize(obj: SparkDateTime): InternalRow = { obj match { case dt: SparkDateTime => val row = new GenericMutableRow(2) row.setLong(0, dt.millis) row.update(1, CatalystTypeConverters.convertToCatalyst(dt.tzId)) row } } override def deserialize(datum: Any): SparkDateTime = { datum match { case row: InternalRow => require(row.numFields == 2, s"SparkDateTimeUDT.deserialize given row with length ${row.numFields} " + s"but requires length == 2") SparkDateTime(row.getLong(0), row.getString(1)) } } override def userClass: Class[SparkDateTime] = classOf[SparkDateTime] override def asNullable: SparkDateTimeUDT = this } @SQLUserDefinedType(udt = classOf[SparkPeriodUDT]) case class SparkPeriod(periodIsoStr : String) class SparkPeriodUDT extends UserDefinedType[SparkPeriod] { override def sqlType: DataType = StringType override def serialize(obj: SparkPeriod): Any = { obj match { case p: SparkPeriod => CatalystTypeConverters.convertToCatalyst(p.periodIsoStr) } } override def deserialize(datum: Any): SparkPeriod = { datum match { case s : UTF8String => SparkPeriod(s.toString()) } } override def userClass: Class[SparkPeriod] = classOf[SparkPeriod] override def asNullable: SparkPeriodUDT = this } @SQLUserDefinedType(udt = classOf[SparkIntervalUDT]) case class SparkInterval(intervalIsoStr : String) class SparkIntervalUDT extends UserDefinedType[SparkInterval] { override def sqlType: DataType = StringType override def serialize(obj: SparkInterval): Any = { obj match { case i: SparkInterval => CatalystTypeConverters.convertToCatalyst(i.intervalIsoStr) } } override def deserialize(datum: Any): SparkInterval = { datum match { case s : UTF8String => SparkInterval(s.toString()) } } override def userClass: Class[SparkInterval] = classOf[SparkInterval] override def asNullable: SparkIntervalUDT = this }
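A quick round-trip sketch for the date-time UDT above (the millis value and timezone are invented): serialize stores the timezone through convertToCatalyst, so it ends up as a UTF8String inside the InternalRow, and deserialize reads it back with getString.

val udt = new SparkDateTimeUDT
val internal = udt.serialize(SparkDateTime(1457544000000L, "UTC"))   // InternalRow(millis, UTF8String tz)
val back = udt.deserialize(internal)                                 // => SparkDateTime(1457544000000L, "UTC")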