org.apache.spark.sql.catalyst.encoders.ExpressionEncoder Scala Examples
The following examples show how to use org.apache.spark.sql.catalyst.encoders.ExpressionEncoder.
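Before the project examples, here is a minimal, self-contained sketch of what an ExpressionEncoder does: it converts JVM objects of type T to and from Spark's internal row format. The sketch is illustrative rather than taken from any project below, and it assumes the Spark 2.x-era API in which a resolved-and-bound ExpressionEncoder exposes toRow/fromRow (newer Spark versions replace these with createSerializer/createDeserializer); the Person case class is hypothetical.

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

// Hypothetical case class used only for this illustration.
case class Person(name: String, age: Int)

object EncoderSketch {
  def main(args: Array[String]): Unit = {
    // Derive an encoder for the case class from its TypeTag.
    val enc: ExpressionEncoder[Person] = ExpressionEncoder[Person]()

    // Resolve and bind the encoder before converting objects (Spark 2.x API).
    val bound = enc.resolveAndBind()

    // Object -> InternalRow -> object round trip.
    val row: InternalRow = bound.toRow(Person("Ada", 36))
    val back: Person = bound.fromRow(row.copy())

    println(back)        // Person(Ada,36)
    println(enc.schema)  // struct with fields name: String and age: Int
  }
}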
Example 1
Source File: SparkPFASuiteBase.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf = new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }
}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] => a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector => a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ => false
      }
    }
  }
}
Example 2
Source File: DescribeDeltaHistoryCommand.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.commands

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier}
import org.apache.spark.sql.delta.actions.CommitInfo
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.execution.command.RunnableCommand

case class DescribeDeltaHistoryCommand(
    path: Option[String],
    tableIdentifier: Option[TableIdentifier],
    limit: Option[Int],
    override val output: Seq[Attribute] = ExpressionEncoder[CommitInfo]().schema.toAttributes)
  extends RunnableCommand with DeltaLogging {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val basePath =
      if (path.nonEmpty) {
        new Path(path.get)
      } else if (tableIdentifier.nonEmpty) {
        val sessionCatalog = sparkSession.sessionState.catalog
        lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get)

        DeltaTableIdentifier(sparkSession, tableIdentifier.get) match {
          case Some(id) if id.path.nonEmpty => new Path(id.path.get)
          case Some(id) if id.table.nonEmpty => new Path(metadata.location)
          case _ =>
            if (metadata.tableType == CatalogTableType.VIEW) {
              throw DeltaErrors.describeViewHistory
            }
            throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
        }
      } else {
        throw DeltaErrors.missingTableIdentifierException("DESCRIBE HISTORY")
      }

    // Max array size
    if (limit.exists(_ > Int.MaxValue - 8)) {
      throw new IllegalArgumentException("Please use a limit less than Int.MaxValue - 8.")
    }

    val deltaLog = DeltaLog.forTable(sparkSession, basePath)
    recordDeltaOperation(deltaLog, "delta.ddl.describeHistory") {
      if (deltaLog.snapshot.version == -1) {
        throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
      }

      import sparkSession.implicits._
      deltaLog.history.getHistory(limit).toDF().collect().toSeq
    }
  }
}
Example 3
Source File: implicits.scala From spark-dynamodb with Apache License 2.0
package com.audienceproject.spark.dynamodb

import com.audienceproject.spark.dynamodb.reflect.SchemaAnalysis
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructField

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

object implicits {

  implicit class DynamoDBDataFrameReader(reader: DataFrameReader) {

    def dynamodb(tableName: String): DataFrame = getDynamoDBSource(tableName).load()

    def dynamodb(tableName: String, indexName: String): DataFrame =
      getDynamoDBSource(tableName).option("indexName", indexName).load()

    def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String): Dataset[T] = {
      implicit val encoder: Encoder[T] = ExpressionEncoder()
      getColumnsAlias(getDynamoDBSource(tableName)
        .schema(SchemaAnalysis[T]).load()).as
    }

    def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String, indexName: String): Dataset[T] = {
      implicit val encoder: Encoder[T] = ExpressionEncoder()
      getColumnsAlias(getDynamoDBSource(tableName)
        .option("indexName", indexName)
        .schema(SchemaAnalysis[T]).load()).as
    }

    private def getDynamoDBSource(tableName: String): DataFrameReader =
      reader.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName)

    private def getColumnsAlias(dataFrame: DataFrame): DataFrame = {
      val columnsAlias = dataFrame.schema.collect({
        case StructField(name, _, _, metadata) if metadata.contains("alias") =>
          col(name).as(metadata.getString("alias"))
        case StructField(name, _, _, _) =>
          col(name)
      })
      dataFrame.select(columnsAlias: _*)
    }

  }

  implicit class DynamoDBDataFrameWriter[T](writer: DataFrameWriter[T]) {

    def dynamodb(tableName: String): Unit =
      writer.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName).save()

  }

}
Example 4
Source File: SparkBindings.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType

import scala.reflect.runtime.universe.TypeTag

abstract class SparkBindings[T: TypeTag] extends Serializable {

  lazy val schema: StructType = enc.schema
  private lazy val enc: ExpressionEncoder[T] = ExpressionEncoder[T]().resolveAndBind()
  private lazy val rowEnc: ExpressionEncoder[Row] = RowEncoder(enc.schema).resolveAndBind()

  // WARNING: each time you use this function on a dataframe, you should make a new converter.
  // Spark does some magic that makes this leak memory if re-used on a
  // different symbolic node of the parallel computation. That being said,
  // you should make a single converter before using it in a udf so
  // that the slow resolving and binding is not in the hotpath
  def makeFromRowConverter: Row => T = {
    val enc1 = enc.resolveAndBind()
    val rowEnc1 = rowEnc.resolveAndBind();
    { r: Row => enc1.fromRow(rowEnc1.toRow(r)) }
  }

  def makeFromInternalRowConverter: InternalRow => T = {
    val enc1 = enc.resolveAndBind();
    { r: InternalRow => enc1.fromRow(r) }
  }

  def makeToRowConverter: T => Row = {
    val enc1 = enc.resolveAndBind()
    val rowEnc1 = rowEnc.resolveAndBind();
    { v: T => rowEnc1.fromRow(enc1.toRow(v)) }
  }

  def makeToInternalRowConverter: T => InternalRow = {
    val enc1 = enc.resolveAndBind();
    { v: T => enc1.toRow(v) }
  }

}
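The WARNING comment above describes the intended usage pattern: build a converter once, outside the per-row hot path, and reuse it within a single operation. A hedged sketch of that pattern follows; the Person case class and PersonBindings object are hypothetical and assume the SparkBindings class above is in scope.

import org.apache.spark.sql.Row

// Hypothetical domain type and companion binding, for illustration only;
// neither Person nor PersonBindings is part of mmlspark.
case class Person(name: String, age: Int)
object PersonBindings extends SparkBindings[Person]

object SparkBindingsUsage {
  def processRows(rows: Iterator[Row]): Iterator[Person] = {
    // Build the converter once per partition, not once per row, so the
    // costly resolve-and-bind work stays out of the per-row hot path.
    val fromRow = PersonBindings.makeFromRowConverter
    rows.map(fromRow)
  }
}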
Example 5
Source File: ObjectExpressionsSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.objects.Invoke
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{IntegerType, ObjectType}

class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("SPARK-16622: The returned value of the called method in Invoke can be null") {
    val inputRow = InternalRow.fromSeq(Seq((false, null)))
    val cls = classOf[Tuple2[Boolean, java.lang.Integer]]
    val inputObject = BoundReference(0, ObjectType(cls), nullable = true)
    val invoke = Invoke(inputObject, "_2", IntegerType)
    checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow)
  }

  test("MapObjects should make copies of unsafe-backed data") {
    // test UnsafeRow-backed data
    val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]]
    val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4))))
    val structExpected = new GenericArrayData(
      Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4))))
    checkEvalutionWithUnsafeProjection(
      structEncoder.serializer.head, structExpected, structInputRow)

    // test UnsafeArray-backed data
    val arrayEncoder = ExpressionEncoder[Array[Array[Int]]]
    val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4))))
    val arrayExpected = new GenericArrayData(
      Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4))))
    checkEvalutionWithUnsafeProjection(
      arrayEncoder.serializer.head, arrayExpected, arrayInputRow)

    // test UnsafeMap-backed data
    val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]]
    val mapInputRow = InternalRow.fromSeq(Seq(Array(
      Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400))))
    val mapExpected = new GenericArrayData(Seq(
      new ArrayBasedMapData(
        new GenericArrayData(Array(1, 2)),
        new GenericArrayData(Array(100, 200))),
      new ArrayBasedMapData(
        new GenericArrayData(Array(3, 4)),
        new GenericArrayData(Array(300, 400)))))
    checkEvalutionWithUnsafeProjection(
      mapEncoder.serializer.head, mapExpected, mapInputRow)
  }
}
Example 6
Source File: EliminateSerializationSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.rules.RuleExecutor

case class OtherTuple(_1: Int, _2: Int)

class EliminateSerializationSuite extends PlanTest {
  private object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Serialization", FixedPoint(100),
        EliminateSerialization) :: Nil
  }

  implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]()
  implicit private def intEncoder = ExpressionEncoder[Int]()

  test("back to back serialization") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    val expected = input.select('obj.as("obj")).analyze
    comparePlans(optimized, expected)
  }

  test("back to back serialization with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }

  test("back to back serialization in AppendColumns") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze
    val optimized = Optimize.execute(plan)
    val expected = AppendColumnsWithObject(
      func.asInstanceOf[Any => Any],
      productEncoder[(Int, Int)].namedExpressions,
      intEncoder.namedExpressions,
      input).analyze
    comparePlans(optimized, expected)
  }

  test("back to back serialization in AppendColumns with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze
    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }
}
Example 7
Source File: ReduceAggregator.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
}
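ReduceAggregator's buffer encoder illustrates the most common reason ExpressionEncoder shows up directly in user code: ExpressionEncoder.tuple composes several encoders into a single encoder for a tuple. Below is a small standalone sketch of that composition, assuming the Spark 2.x API; the names are illustrative only.

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

object TupleEncoderSketch {
  // Compose a (Boolean, Long) encoder out of two primitive encoders,
  // mirroring what ReduceAggregator.bufferEncoder does for (Boolean, T).
  val flag: ExpressionEncoder[Boolean] = ExpressionEncoder[Boolean]()
  val count: ExpressionEncoder[Long] = ExpressionEncoder[Long]()

  val buffer: Encoder[(Boolean, Long)] = ExpressionEncoder.tuple(flag, count)

  def main(args: Array[String]): Unit =
    println(buffer.schema)  // StructType with fields _1: Boolean and _2: Long
}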
Example 8
Source File: typedaggregators.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////

class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}

class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}

class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}

class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}
Example 9
Source File: ReduceAggregatorSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
}
Example 10
Source File: TypedExpressionEncoder.scala From frameless with Apache License 2.0
package frameless

import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.{BoundReference, CreateNamedStruct, If, Literal}
import org.apache.spark.sql.types.StructType

object TypedExpressionEncoder {

  def targetStructType[A](encoder: TypedEncoder[A]): StructType = {
    encoder.catalystRepr match {
      case x: StructType =>
        if (encoder.nullable) StructType(x.fields.map(_.copy(nullable = true)))
        else x
      case dt => new StructType().add("_1", dt, nullable = encoder.nullable)
    }
  }

  def apply[T: TypedEncoder]: ExpressionEncoder[T] = {
    val encoder = TypedEncoder[T]
    val schema = targetStructType(encoder)
    val in = BoundReference(0, encoder.jvmRepr, encoder.nullable)

    val (out, toRowExpressions) = encoder.toCatalyst(in) match {
      case If(_, _, x: CreateNamedStruct) =>
        val out = BoundReference(0, encoder.catalystRepr, encoder.nullable)
        (out, x.flatten)
      case other =>
        val out = GetColumnByOrdinal(0, encoder.catalystRepr)
        (out, CreateNamedStruct(Literal("_1") :: other :: Nil).flatten)
    }

    new ExpressionEncoder[T](
      schema = schema,
      flat = false,
      serializer = toRowExpressions,
      deserializer = encoder.fromCatalyst(out),
      clsTag = encoder.classTag
    )
  }
}
Example 11
Source File: TopByKeyAggregator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.recommendation

import scala.language.implicitConversions
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.util.BoundedPriorityQueue

private[recommendation] class TopByKeyAggregator[K1: TypeTag, K2: TypeTag, V: TypeTag]
    (num: Int, ord: Ordering[(K2, V)])
  extends Aggregator[(K1, K2, V), BoundedPriorityQueue[(K2, V)], Array[(K2, V)]] {

  override def zero: BoundedPriorityQueue[(K2, V)] = new BoundedPriorityQueue[(K2, V)](num)(ord)

  override def reduce(
      q: BoundedPriorityQueue[(K2, V)],
      a: (K1, K2, V)): BoundedPriorityQueue[(K2, V)] = {
    q += {(a._2, a._3)}
  }

  override def merge(
      q1: BoundedPriorityQueue[(K2, V)],
      q2: BoundedPriorityQueue[(K2, V)]): BoundedPriorityQueue[(K2, V)] = {
    q1 ++= q2
  }

  override def finish(r: BoundedPriorityQueue[(K2, V)]): Array[(K2, V)] = {
    r.toArray.sorted(ord.reverse)
  }

  override def bufferEncoder: Encoder[BoundedPriorityQueue[(K2, V)]] = {
    Encoders.kryo[BoundedPriorityQueue[(K2, V)]]
  }

  override def outputEncoder: Encoder[Array[(K2, V)]] = ExpressionEncoder[Array[(K2, V)]]()
}
Example 12
Source File: ObjectExpressionsSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{IntegerType, ObjectType} class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-16622: The returned value of the called method in Invoke can be null") { val inputRow = InternalRow.fromSeq(Seq((false, null))) val cls = classOf[Tuple2[Boolean, java.lang.Integer]] val inputObject = BoundReference(0, ObjectType(cls), nullable = true) val invoke = Invoke(inputObject, "_2", IntegerType) checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow) } test("MapObjects should make copies of unsafe-backed data") { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) val structExpected = new GenericArrayData( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) val arrayExpected = new GenericArrayData( Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) // test UnsafeMap-backed data val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) val mapExpected = new GenericArrayData(Seq( new ArrayBasedMapData( new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(100, 200))), new ArrayBasedMapData( new GenericArrayData(Array(3, 4)), new GenericArrayData(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } }
Example 13
Source File: EliminateMapObjectsSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{DeserializeToObject, LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.types._ class EliminateMapObjectsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = { Batch("EliminateMapObjects", FixedPoint(50), NullPropagation, SimplifyCasts, EliminateMapObjects) :: Nil } } implicit private def intArrayEncoder = ExpressionEncoder[Array[Int]]() implicit private def doubleArrayEncoder = ExpressionEncoder[Array[Double]]() test("SPARK-20254: Remove unnecessary data conversion for primitive array") { val intObjType = ObjectType(classOf[Array[Int]]) val intInput = LocalRelation('a.array(ArrayType(IntegerType, false))) val intQuery = intInput.deserialize[Array[Int]].analyze val intOptimized = Optimize.execute(intQuery) val intExpected = DeserializeToObject( Invoke(intInput.output(0), "toIntArray", intObjType, Nil, true, false), AttributeReference("obj", intObjType, true)(), intInput) comparePlans(intOptimized, intExpected) val doubleObjType = ObjectType(classOf[Array[Double]]) val doubleInput = LocalRelation('a.array(ArrayType(DoubleType, false))) val doubleQuery = doubleInput.deserialize[Array[Double]].analyze val doubleOptimized = Optimize.execute(doubleQuery) val doubleExpected = DeserializeToObject( Invoke(doubleInput.output(0), "toDoubleArray", doubleObjType, Nil, true, false), AttributeReference("obj", doubleObjType, true)(), doubleInput) comparePlans(doubleOptimized, doubleExpected) } }
Example 14
Source File: EliminateSerializationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor case class OtherTuple(_1: Int, _2: Int) class EliminateSerializationSuite extends PlanTest { private object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Serialization", FixedPoint(100), EliminateSerialization) :: Nil } implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]() implicit private def intEncoder = ExpressionEncoder[Int]() test("back to back serialization") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) val expected = input.select('obj.as("obj")).analyze comparePlans(optimized, expected) } test("back to back serialization with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } test("back to back serialization in AppendColumns") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze val optimized = Optimize.execute(plan) val expected = AppendColumnsWithObject( func.asInstanceOf[Any => Any], productEncoder[(Int, Int)].namedExpressions, intEncoder.namedExpressions, input).analyze comparePlans(optimized, expected) } test("back to back serialization in AppendColumns with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } }
Example 15
Source File: ReduceAggregator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.expressions import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T) extends Aggregator[T, (Boolean, T), T] { @transient private val encoder = implicitly[Encoder[T]] override def zero: (Boolean, T) = (false, null.asInstanceOf[T]) override def bufferEncoder: Encoder[(Boolean, T)] = ExpressionEncoder.tuple( ExpressionEncoder[Boolean](), encoder.asInstanceOf[ExpressionEncoder[T]]) override def outputEncoder: Encoder[T] = encoder override def reduce(b: (Boolean, T), a: T): (Boolean, T) = { if (b._1) { (true, func(b._2, a)) } else { (true, a) } } override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = { if (!b1._1) { b2 } else if (!b2._1) { b1 } else { (true, func(b1._2, b2._2)) } } override def finish(reduction: (Boolean, T)): T = { if (!reduction._1) { throw new IllegalStateException("ReduceAggregator requires at least one input row") } reduction._2 } }
Example 16
Source File: typedaggregators.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.aggregate import org.apache.spark.api.java.function.MapFunction import org.apache.spark.sql.{Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.expressions.Aggregator //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines internal implementations for aggregators. //////////////////////////////////////////////////////////////////////////////////////////////////// class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] { override def zero: Double = 0.0 override def reduce(b: Double, a: IN): Double = b + f(a) override def merge(b1: Double, b2: Double): Double = b1 + b2 override def finish(reduction: Double): Double = reduction override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } } class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] { override def zero: Long = 0L override def reduce(b: Long, a: IN): Long = b + f(a) override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long]) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] { override def zero: Long = 0 override def reduce(b: Long, a: IN): Long = { if (f(a) == null) b else b + 1 } override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, Object]) = this((x: IN) => f.call(x).asInstanceOf[Any]) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] { override def zero: (Double, Long) = (0.0, 0L) override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2) override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2 override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = { (b1._1 + b2._1, b1._2 + b2._2) } override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } }
Example 17
Source File: DataSourceV2ScanExec.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2 import scala.collection.JavaConverters._ import org.apache.spark.rdd.RDD import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.catalyst.plans.physical import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec} import org.apache.spark.sql.execution.streaming.continuous._ import org.apache.spark.sql.sources.v2.reader._ import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader import org.apache.spark.sql.types.StructType case class DataSourceV2ScanExec( output: Seq[AttributeReference], @transient reader: DataSourceReader) extends LeafExecNode with DataSourceReaderHolder with ColumnarBatchScan { override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2ScanExec] override def outputPartitioning: physical.Partitioning = reader match { case s: SupportsReportPartitioning => new DataSourcePartitioning( s.outputPartitioning(), AttributeMap(output.map(a => a -> a.name))) case _ => super.outputPartitioning } private lazy val readerFactories: java.util.List[DataReaderFactory[UnsafeRow]] = reader match { case r: SupportsScanUnsafeRow => r.createUnsafeRowReaderFactories() case _ => reader.createDataReaderFactories().asScala.map { new RowToUnsafeRowDataReaderFactory(_, reader.readSchema()): DataReaderFactory[UnsafeRow] }.asJava } private lazy val inputRDD: RDD[InternalRow] = reader match { case r: SupportsScanColumnarBatch if r.enableBatchRead() => assert(!reader.isInstanceOf[ContinuousReader], "continuous stream reader does not support columnar read yet.") new DataSourceRDD(sparkContext, r.createBatchDataReaderFactories()) .asInstanceOf[RDD[InternalRow]] case _: ContinuousReader => EpochCoordinatorRef.get( sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY), sparkContext.env) .askSync[Unit](SetReaderPartitions(readerFactories.size())) new ContinuousDataSourceRDD(sparkContext, sqlContext, readerFactories) .asInstanceOf[RDD[InternalRow]] case _ => new DataSourceRDD(sparkContext, readerFactories).asInstanceOf[RDD[InternalRow]] } override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(inputRDD) override val supportsBatch: Boolean = reader match { case r: SupportsScanColumnarBatch if r.enableBatchRead() => true case _ => false } override protected def needsUnsafeRowConversion: Boolean = false override protected def doExecute(): RDD[InternalRow] = { if (supportsBatch) { WholeStageCodegenExec(this)(codegenStageId = 0).execute() } else { val numOutputRows = longMetric("numOutputRows") inputRDD.map { r => numOutputRows += 1 r } } } } class RowToUnsafeRowDataReaderFactory(rowReaderFactory: DataReaderFactory[Row], schema: StructType) extends DataReaderFactory[UnsafeRow] { override def preferredLocations: Array[String] = rowReaderFactory.preferredLocations override def createDataReader: DataReader[UnsafeRow] = { new RowToUnsafeDataReader( rowReaderFactory.createDataReader, RowEncoder.apply(schema).resolveAndBind()) } } class RowToUnsafeDataReader(val rowReader: DataReader[Row], encoder: ExpressionEncoder[Row]) extends DataReader[UnsafeRow] { override def next: Boolean = rowReader.next override def get: UnsafeRow = encoder.toRow(rowReader.get).asInstanceOf[UnsafeRow] override def close(): Unit = rowReader.close() }
Example 18
Source File: ReduceAggregatorSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class ReduceAggregatorSuite extends SparkFunSuite { test("zero value") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)]) } test("reduce, merge and finish") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) val firstReduce = aggregator.reduce(aggregator.zero, 1) assert(firstReduce == ((true, 1))) val secondReduce = aggregator.reduce(firstReduce, 2) assert(secondReduce == ((true, 3))) val thirdReduce = aggregator.reduce(secondReduce, 3) assert(thirdReduce == ((true, 6))) val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce) assert(mergeWithZero1 == ((true, 1))) val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero) assert(mergeWithZero2 == ((true, 3))) val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce) assert(mergeTwoReduced == ((true, 4))) assert(aggregator.finish(firstReduce)== 1) assert(aggregator.finish(secondReduce) == 3) assert(aggregator.finish(thirdReduce) == 6) assert(aggregator.finish(mergeWithZero1) == 1) assert(aggregator.finish(mergeWithZero2) == 3) assert(aggregator.finish(mergeTwoReduced) == 4) } test("requires at least one input row") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) intercept[IllegalStateException] { aggregator.finish(aggregator.zero) } } }
Example 19
Source File: SparkAvroDecoder.scala From cloudflow with Apache License 2.0
package cloudflow.spark.avro import org.apache.log4j.Logger import java.io.ByteArrayOutputStream import scala.reflect.runtime.universe._ import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord } import org.apache.avro.io.{ DecoderFactory, EncoderFactory } import org.apache.spark.sql.{ Dataset, Encoder, Row } import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder } import org.apache.spark.sql.catalyst.expressions.GenericRow import org.apache.spark.sql.types.StructType import org.apache.avro.Schema import cloudflow.spark.sql.SQLImplicits._ case class EncodedKV(key: String, value: Array[Byte]) case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) { val encoder: Encoder[T] = implicitly[Encoder[T]] val sqlSchema: StructType = encoder.schema val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema) @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema) @transient lazy val rowConverter = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema) @transient lazy val datumReader = new GenericDatumReader[GenericRecord](_avroSchema) @transient lazy val decoder = DecoderFactory.get def decode(bytes: Array[Byte]): Row = { val binaryDecoder = decoder.binaryDecoder(bytes, null) val record = datumReader.read(null, binaryDecoder) rowConverter(record).asInstanceOf[GenericRow] } } case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) { @transient lazy val log = Logger.getLogger(getClass.getName) val BufferSize = 5 * 1024 // 5 Kb val encoder = implicitly[Encoder[T]] val sqlSchema = encoder.schema @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema) val recordName = "topLevelRecord" // ??? val recordNamespace = "recordNamespace" // ??? @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace) // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage def rowToBytes(row: Row): Array[Byte] = { val genRecord = converter(row).asInstanceOf[GenericRecord] if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord") val datumWriter = new GenericDatumWriter[GenericRecord](_avroSchema) val avroEncoder = EncoderFactory.get val byteArrOS = new ByteArrayOutputStream(BufferSize) val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null) datumWriter.write(genRecord, binaryEncoder) binaryEncoder.flush() byteArrOS.toByteArray } def encode(dataset: Dataset[T]): Dataset[Array[Byte]] = dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]] // Note to self: I'm not sure how heavy this chain of transformations is def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = { val encoder = encoderFor[T] implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind() dataset.map { value ⇒ val key = keyFun(value) val internalRow = encoder.toRow(value) val row = rowEncoder.fromRow(internalRow) val bytes = rowToBytes(row) EncodedKV(key, bytes) } } }
Example 20
Source File: SparkStreamletContextImpl.scala From cloudflow with Apache License 2.0
package cloudflow.spark.kafka import java.io.File import com.typesafe.config.Config import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.encoders.{ ExpressionEncoder, RowEncoder } import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery } import cloudflow.spark.SparkStreamletContext import cloudflow.spark.avro.{ SparkAvroDecoder, SparkAvroEncoder } import cloudflow.spark.sql.SQLImplicits._ import cloudflow.streamlets._ import scala.reflect.runtime.universe._ class SparkStreamletContextImpl( private[cloudflow] override val streamletDefinition: StreamletDefinition, session: SparkSession, override val config: Config ) extends SparkStreamletContext(streamletDefinition, session) { val storageDir = config.getString("storage.mountPath") val maxOffsetsPerTrigger = config.getLong("cloudflow.spark.read.options.max-offsets-per-trigger") def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] = { implicit val inRowEncoder: ExpressionEncoder[Row] = RowEncoder(encoder.schema) val schema = inPort.schemaAsString val topic = findTopicForPort(inPort) val srcTopic = topic.name val brokers = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers) val src: DataFrame = session.readStream .format("kafka") .option("kafka.bootstrap.servers", brokers) .options(kafkaConsumerMap(topic)) .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger) .option("subscribe", srcTopic) // Allow restart of stateful streamlets that may have been offline for longer than the kafka retention period. // This setting may result in data loss in some cases but allows for continuity of the runtime .option("failOnDataLoss", false) .option("startingOffsets", "earliest") .load() val rawDataset = src.select($"value").as[Array[Byte]] val dataframe: Dataset[Row] = rawDataset.mapPartitions { iter ⇒ val avroDecoder = new SparkAvroDecoder[In](schema) iter.map(avroDecoder.decode) }(inRowEncoder) dataframe.as[In] } def kafkaConsumerMap(topic: Topic) = topic.kafkaConsumerProperties.map { case (key, value) => s"kafka.$key" -> value } def kafkaProducerMap(topic: Topic) = topic.kafkaProducerProperties.map { case (key, value) => s"kafka.$key" -> value } def writeStream[Out](stream: Dataset[Out], outPort: CodecOutlet[Out], outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = { val avroEncoder = new SparkAvroEncoder[Out](outPort.schemaAsString) val encodedStream = avroEncoder.encodeWithKey(stream, outPort.partitioner) val topic = findTopicForPort(outPort) val destTopic = topic.name val brokers = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers) // metadata checkpoint directory on mount val checkpointLocation = checkpointDir(outPort.name) val queryName = s"$streamletRef.$outPort" encodedStream.writeStream .outputMode(outputMode) .format("kafka") .queryName(queryName) .option("kafka.bootstrap.servers", brokers) .options(kafkaProducerMap(topic)) .option("topic", destTopic) .option("checkpointLocation", checkpointLocation) .start() } def checkpointDir(dirName: String): String = { val baseCheckpointDir = new File(storageDir, streamletRef) val dir = new File(baseCheckpointDir, dirName) if (!dir.exists()) { val created = dir.mkdirs() require(created, s"Could not create checkpoint directory: $dir") } dir.getAbsolutePath } }
Example 21
Source File: StandardScalerSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature import com.ibm.aardpfark.pfa.{ScalerResult, SparkFeaturePFASuiteBase} import org.apache.spark.ml.feature.StandardScaler import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class StandardScalerSuite extends SparkFeaturePFASuiteBase[ScalerResult] { implicit val enc = ExpressionEncoder[Vector]() val inputPath = "data/sample_lda_libsvm_data.txt" val dataset = spark.read.format("libsvm").load(inputPath) val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(true) .setWithStd(true) override val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect() override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() test("StandardScaler w/o Mean and Std") { val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(false) .setWithStd(false) val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("StandardScaler w/o Mean") { val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(false) .setWithStd(true) val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("StandardScaler w/o Std") { val scaler = new StandardScaler() .setInputCol("features") .setOutputCol("scaled") .setWithMean(true) .setWithStd(false) val sparkTransformer = scaler.fit(dataset) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } }
Example 22
Source File: PCAModelSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase} import org.apache.spark.ml.feature.PCA import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class PCAModelSuite extends SparkFeaturePFASuiteBase[PCAModelResult] { implicit val enc = ExpressionEncoder[Vector]() val inputPath = "data/sample_lda_libsvm_data.txt" val dataset = spark.read.format("libsvm").load(inputPath) val pca = new PCA() .setInputCol("features") .setOutputCol("pcaFeatures") .setK(3) override val sparkTransformer = pca.fit(dataset) val result = sparkTransformer.transform(dataset) override val input = withColumnAsArray(result, pca.getInputCol).toJSON.collect() override val expectedOutput = withColumnAsArray(result, pca.getOutputCol).toJSON.collect() } case class PCAModelResult(pcaFeatures: Seq[Double]) extends Result
Example 23
Source File: NormalizerSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature import com.ibm.aardpfark.pfa.{ScalerResult, Result, SparkFeaturePFASuiteBase} import org.apache.spark.ml.feature.Normalizer import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class NormalizerSuite extends SparkFeaturePFASuiteBase[ScalerResult] { implicit val enc = ExpressionEncoder[Vector]() val inputPath = "data/sample_lda_libsvm_data.txt" val dataset = spark.read.format("libsvm").load(inputPath) val scaler = new Normalizer() .setInputCol("features") .setOutputCol("scaled") override val sparkTransformer = scaler val result = scaler.transform(dataset) override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect() override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() test("Normalizer with P = 1") { val sparkTransformer = scaler.setP(1.0) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("Normalizer with P = positive infinity"){ val sparkTransformer = scaler.setP(Double.PositiveInfinity) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } test("Normalizer with P = 3") { val sparkTransformer = scaler.setP(3.0) val result = sparkTransformer.transform(dataset) val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } }
Example 24
Source File: LinearSVCSuite.scala From aardpfark with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification import com.ibm.aardpfark.pfa.ClassifierResult import org.apache.spark.ml.classification.LinearSVC import org.apache.spark.ml.linalg.Vector import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class LinearSVCSuite extends SparkClassifierPFASuiteBase[ClassifierResult] { val inputPath = "data/sample_libsvm_data.txt" val dataset = spark.read.format("libsvm").load(inputPath) val clf = new LinearSVC() override val sparkTransformer = clf.fit(dataset) import spark.implicits._ implicit val mapEncoder = ExpressionEncoder[Map[String, Double]]() val result = sparkTransformer.transform(dataset) override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect() override val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol).map { case Row(p: Double, raw: Vector) => (p, raw.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol).toJSON.collect() // Additional tests test("LinearSVC w/o fitIntercept") { val sparkTransformer = clf.setFitIntercept(false).fit(dataset) val result = sparkTransformer.transform(dataset) val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol).map { case Row(p: Double, raw: Vector) => (p, raw.toArray) }.toDF(clf.getPredictionCol, clf.getRawPredictionCol).toJSON.collect() parityTest(sparkTransformer, input, expectedOutput) } }
Example 25
Source File: ReduceAggregatorSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class ReduceAggregatorSuite extends SparkFunSuite { test("zero value") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)]) } test("reduce, merge and finish") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) val firstReduce = aggregator.reduce(aggregator.zero, 1) assert(firstReduce == ((true, 1))) val secondReduce = aggregator.reduce(firstReduce, 2) assert(secondReduce == ((true, 3))) val thirdReduce = aggregator.reduce(secondReduce, 3) assert(thirdReduce == ((true, 6))) val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce) assert(mergeWithZero1 == ((true, 1))) val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero) assert(mergeWithZero2 == ((true, 3))) val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce) assert(mergeTwoReduced == ((true, 4))) assert(aggregator.finish(firstReduce)== 1) assert(aggregator.finish(secondReduce) == 3) assert(aggregator.finish(thirdReduce) == 6) assert(aggregator.finish(mergeWithZero1) == 1) assert(aggregator.finish(mergeWithZero2) == 3) assert(aggregator.finish(mergeTwoReduced) == 4) } test("requires at least one input row") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) intercept[IllegalStateException] { aggregator.finish(aggregator.zero) } } }
Example 26
Source File: EliminateSerializationSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.rules.RuleExecutor case class OtherTuple(_1: Int, _2: Int) class EliminateSerializationSuite extends PlanTest { private object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Serialization", FixedPoint(100), EliminateSerialization) :: Nil } implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]() implicit private def intEncoder = ExpressionEncoder[Int]() test("back to back serialization") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) val expected = input.select('obj.as("obj")).analyze comparePlans(optimized, expected) } test("back to back serialization with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } test("back to back serialization in AppendColumns") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze val optimized = Optimize.execute(plan) val expected = AppendColumnsWithObject( func.asInstanceOf[Any => Any], productEncoder[(Int, Int)].namedExpressions, intEncoder.namedExpressions, input).analyze comparePlans(optimized, expected) } test("back to back serialization in AppendColumns with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } }
Example 27
Source File: ReduceAggregator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.expressions import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T) extends Aggregator[T, (Boolean, T), T] { private val encoder = implicitly[Encoder[T]] override def zero: (Boolean, T) = (false, null.asInstanceOf[T]) override def bufferEncoder: Encoder[(Boolean, T)] = ExpressionEncoder.tuple( ExpressionEncoder[Boolean](), encoder.asInstanceOf[ExpressionEncoder[T]]) override def outputEncoder: Encoder[T] = encoder override def reduce(b: (Boolean, T), a: T): (Boolean, T) = { if (b._1) { (true, func(b._2, a)) } else { (true, a) } } override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = { if (!b1._1) { b2 } else if (!b2._1) { b1 } else { (true, func(b1._2, b2._2)) } } override def finish(reduction: (Boolean, T)): T = { if (!reduction._1) { throw new IllegalStateException("ReduceAggregator requires at least one input row") } reduction._2 } }
Example 28
Source File: typedaggregators.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import org.apache.spark.api.java.function.MapFunction import org.apache.spark.sql.{Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.expressions.Aggregator //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines internal implementations for aggregators. //////////////////////////////////////////////////////////////////////////////////////////////////// class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] { override def zero: Double = 0.0 override def reduce(b: Double, a: IN): Double = b + f(a) override def merge(b1: Double, b2: Double): Double = b1 + b2 override def finish(reduction: Double): Double = reduction override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } } class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] { override def zero: Long = 0L override def reduce(b: Long, a: IN): Long = b + f(a) override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long]) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] { override def zero: Long = 0 override def reduce(b: Long, a: IN): Long = { if (f(a) == null) b else b + 1 } override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, Object]) = this(x => f.call(x)) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] { override def zero: (Double, Long) = (0.0, 0L) override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2) override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2 override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = { (b1._1 + b2._1, b1._2 + b2._2) } override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } }
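These classes are the implementations behind the org.apache.spark.sql.expressions.scalalang.typed helpers in Spark 2.x (the scalalang.typed API was deprecated in later releases). A hedged usage sketch follows, with an illustrative Sale case class that is not part of the original file.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.scalalang.typed

case class Sale(store: String, amount: Double)

val spark = SparkSession.builder().master("local[*]").appName("typed-aggregators-demo").getOrCreate()
import spark.implicits._

val sales = Seq(Sale("a", 1.0), Sale("a", 2.5), Sale("b", 4.0)).toDS()

// typed.sum -> TypedSumDouble, typed.count -> TypedCount, typed.avg -> TypedAverage
sales.groupByKey(_.store)
  .agg(typed.sum[Sale](_.amount), typed.count[Sale](_.store), typed.avg[Sale](_.amount))
  .show()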
Example 29
Source File: ReduceAggregatorSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class ReduceAggregatorSuite extends SparkFunSuite { test("zero value") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) assert(aggregator.zero == (false, null)) } test("reduce, merge and finish") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) val firstReduce = aggregator.reduce(aggregator.zero, 1) assert(firstReduce == (true, 1)) val secondReduce = aggregator.reduce(firstReduce, 2) assert(secondReduce == (true, 3)) val thirdReduce = aggregator.reduce(secondReduce, 3) assert(thirdReduce == (true, 6)) val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce) assert(mergeWithZero1 == (true, 1)) val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero) assert(mergeWithZero2 == (true, 3)) val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce) assert(mergeTwoReduced == (true, 4)) assert(aggregator.finish(firstReduce)== 1) assert(aggregator.finish(secondReduce) == 3) assert(aggregator.finish(thirdReduce) == 6) assert(aggregator.finish(mergeWithZero1) == 1) assert(aggregator.finish(mergeWithZero2) == 3) assert(aggregator.finish(mergeTwoReduced) == 4) } test("requires at least one input row") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) intercept[IllegalStateException] { aggregator.finish(aggregator.zero) } } }
Example 30
Source File: S2StreamQueryWriter.scala From incubator-s2graph with Apache License 2.0 | 5 votes |
package org.apache.s2graph.spark.sql.streaming import com.typesafe.config.ConfigFactory import org.apache.s2graph.core.{GraphElement, JSONParser} import org.apache.s2graph.s2jobs.S2GraphHelper import org.apache.s2graph.spark.sql.streaming.S2SinkConfigs._ import org.apache.spark.TaskContext import org.apache.spark.sql.Row import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.types.StructType import play.api.libs.json.{JsObject, Json} import scala.collection.mutable.ListBuffer import scala.concurrent.Await import scala.concurrent.duration.Duration import scala.util.Try private [sql] class S2StreamQueryWriter( serializedConf:String, schema: StructType , commitProtocol: S2CommitProtocol ) extends Serializable with Logger { private val config = ConfigFactory.parseString(serializedConf) private val s2Graph = S2GraphHelper.getS2Graph(config) private val encoder: ExpressionEncoder[Row] = RowEncoder(schema).resolveAndBind() private val RESERVED_COLUMN = Set("timestamp", "from", "to", "label", "operation", "elem", "direction") def run(taskContext: TaskContext, iters: Iterator[InternalRow]): TaskCommit = { val taskId = s"stage-${taskContext.stageId()}, partition-${taskContext.partitionId()}, attempt-${taskContext.taskAttemptId()}" val partitionId= taskContext.partitionId() val groupedSize = getConfigString(config, S2_SINK_GROUPED_SIZE, DEFAULT_GROUPED_SIZE).toInt val waitTime = getConfigString(config, S2_SINK_WAIT_TIME, DEFAULT_WAIT_TIME_SECONDS).toInt commitProtocol.initTask() try { var list = new ListBuffer[(String, Int)]() val rst = iters.flatMap(rowToEdge).grouped(groupedSize).flatMap{ elements => logger.debug(s"[$taskId][elements] ${elements.size} (${elements.map(e => e.toLogString).mkString(",\n")})") elements.groupBy(_.serviceName).foreach{ case (service, elems) => list += ((service, elems.size)) } val mutateF = s2Graph.mutateElements(elements, true) Await.result(mutateF, Duration(waitTime, "seconds")) } val (success, fail) = rst.toSeq.partition(r => r.isSuccess) val counter = list.groupBy(_._1).map{ case (service, t) => val sum = t.toList.map(_._2).sum (service, sum) } logger.info(s"[$taskId] success : ${success.size}, fail : ${fail.size} ($counter)") commitProtocol.commitTask(TaskState(partitionId, success.size, fail.size, counter)) } catch { case t: Throwable => commitProtocol.abortTask(TaskState(partitionId)) throw t } } private def rowToEdge(internalRow:InternalRow): Option[GraphElement] = S2GraphHelper.sparkSqlRowToGraphElement(s2Graph, encoder.fromRow(internalRow), schema, RESERVED_COLUMN) }
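The writer relies on a RowEncoder that has been resolved and bound to the schema, so each incoming InternalRow can be turned back into an external Row before it is parsed into a graph element. Below is a minimal sketch of that conversion using the Spark 2.x encoder API (toRow/fromRow; Spark 3 replaces these with createSerializer/createDeserializer) and an illustrative schema.

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}

val schema = StructType(Seq(
  StructField("from", StringType),
  StructField("to", StringType),
  StructField("timestamp", IntegerType)))

val encoder: ExpressionEncoder[Row] = RowEncoder(schema).resolveAndBind()

val internal = encoder.toRow(Row("u1", "u2", 1))   // external Row -> InternalRow
val external = encoder.fromRow(internal)           // InternalRow -> external Row, as rowToEdge does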
Example 31
Source File: GroupSortedDataset.scala From spark-sorted with Apache License 2.0 | 5 votes |
package com.tresata.spark.sorted.sql import scala.reflect.ClassTag import org.apache.spark.sql.{ Column, Dataset, Encoder } import org.apache.spark.sql.functions.col import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder } import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate } object GroupSortedDataset { private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = { val key = col(dataset.columns.head) val valueSort = { val sort = sortBy(col(dataset.columns.last)) if (reverse) sort.desc else sort.asc } new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort)) } } class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable { def toDS: Dataset[(K, V)] = dataset def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f)) } def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = { implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W]) dataset.mapPartitions(mapStreamIterator(_)(f)) } def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f))) } def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] = mapStreamByKey(iter => Iterator(iter.reduceLeft(f))) def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = { val wCreate = newWCreate(w) mapStreamByKey(_.scanLeft(wCreate())(f)) } }
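The part of this example that touches ExpressionEncoder is the construction of an Encoder[(K, W)] from the two component encoders via ExpressionEncoder.tuple and encoderFor. A standalone sketch of that derivation (spark and pairEncoder are illustrative names):

import org.apache.spark.sql.{Encoder, SparkSession}
import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder}

val spark = SparkSession.builder().master("local[*]").appName("tuple-encoder-demo").getOrCreate()
import spark.implicits._

// encoderFor[T] upgrades an implicit Encoder[T] to an ExpressionEncoder[T];
// ExpressionEncoder.tuple combines the two into an encoder for the pair.
val pairEncoder: Encoder[(String, Int)] =
  ExpressionEncoder.tuple(encoderFor[String], encoderFor[Int])

val ds = spark.createDataset(Seq(("a", 1), ("b", 2)))(pairEncoder)
ds.show()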
Example 32
Source File: EliminateMapObjectsSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.AttributeReference import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical.{DeserializeToObject, LocalRelation, LogicalPlan} import org.apache.spark.sql.catalyst.rules.RuleExecutor import org.apache.spark.sql.types._ class EliminateMapObjectsSuite extends PlanTest { object Optimize extends RuleExecutor[LogicalPlan] { val batches = { Batch("EliminateMapObjects", FixedPoint(50), NullPropagation, SimplifyCasts, EliminateMapObjects) :: Nil } } implicit private def intArrayEncoder = ExpressionEncoder[Array[Int]]() implicit private def doubleArrayEncoder = ExpressionEncoder[Array[Double]]() test("SPARK-20254: Remove unnecessary data conversion for primitive array") { val intObjType = ObjectType(classOf[Array[Int]]) val intInput = LocalRelation('a.array(ArrayType(IntegerType, false))) val intQuery = intInput.deserialize[Array[Int]].analyze val intOptimized = Optimize.execute(intQuery) val intExpected = DeserializeToObject( Invoke(intInput.output(0), "toIntArray", intObjType, Nil, true, false), AttributeReference("obj", intObjType, true)(), intInput) comparePlans(intOptimized, intExpected) val doubleObjType = ObjectType(classOf[Array[Double]]) val doubleInput = LocalRelation('a.array(ArrayType(DoubleType, false))) val doubleQuery = doubleInput.deserialize[Array[Double]].analyze val doubleOptimized = Optimize.execute(doubleQuery) val doubleExpected = DeserializeToObject( Invoke(doubleInput.output(0), "toDoubleArray", doubleObjType, Nil, true, false), AttributeReference("obj", doubleObjType, true)(), doubleInput) comparePlans(doubleOptimized, doubleExpected) } }
Example 33
Source File: EliminateSerializationSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.rules.RuleExecutor case class OtherTuple(_1: Int, _2: Int) class EliminateSerializationSuite extends PlanTest { private object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Serialization", FixedPoint(100), EliminateSerialization) :: Nil } implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]() implicit private def intEncoder = ExpressionEncoder[Int]() test("back to back serialization") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) val expected = input.select('obj.as("obj")).analyze comparePlans(optimized, expected) } test("back to back serialization with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } test("back to back serialization in AppendColumns") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze val optimized = Optimize.execute(plan) val expected = AppendColumnsWithObject( func.asInstanceOf[Any => Any], productEncoder[(Int, Int)].namedExpressions, intEncoder.namedExpressions, input).analyze comparePlans(optimized, expected) } test("back to back serialization in AppendColumns with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } }
Example 34
Source File: ReduceAggregator.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T) extends Aggregator[T, (Boolean, T), T] { @transient private val encoder = implicitly[Encoder[T]] override def zero: (Boolean, T) = (false, null.asInstanceOf[T]) override def bufferEncoder: Encoder[(Boolean, T)] = ExpressionEncoder.tuple( ExpressionEncoder[Boolean](), encoder.asInstanceOf[ExpressionEncoder[T]]) override def outputEncoder: Encoder[T] = encoder override def reduce(b: (Boolean, T), a: T): (Boolean, T) = { if (b._1) { (true, func(b._2, a)) } else { (true, a) } } override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = { if (!b1._1) { b2 } else if (!b2._1) { b1 } else { (true, func(b1._2, b2._2)) } } override def finish(reduction: (Boolean, T)): T = { if (!reduction._1) { throw new IllegalStateException("ReduceAggregator requires at least one input row") } reduction._2 } }
Example 35
Source File: typedaggregators.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import org.apache.spark.api.java.function.MapFunction import org.apache.spark.sql.{Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.expressions.Aggregator //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines internal implementations for aggregators. //////////////////////////////////////////////////////////////////////////////////////////////////// class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] { override def zero: Double = 0.0 override def reduce(b: Double, a: IN): Double = b + f(a) override def merge(b1: Double, b2: Double): Double = b1 + b2 override def finish(reduction: Double): Double = reduction override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } } class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] { override def zero: Long = 0L override def reduce(b: Long, a: IN): Long = b + f(a) override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long]) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] { override def zero: Long = 0 override def reduce(b: Long, a: IN): Long = { if (f(a) == null) b else b + 1 } override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, Object]) = this((x: IN) => f.call(x).asInstanceOf[Any]) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] { override def zero: (Double, Long) = (0.0, 0L) override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2) override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2 override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = { (b1._1 + b2._1, b1._2 + b2._2) } override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } }
Example 36
Source File: ForeachBatchSink.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming.sources import org.apache.spark.api.python.PythonException import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.execution.streaming.Sink import org.apache.spark.sql.streaming.DataStreamWriter class ForeachBatchSink[T](batchWriter: (Dataset[T], Long) => Unit, encoder: ExpressionEncoder[T]) extends Sink { override def addBatch(batchId: Long, data: DataFrame): Unit = { val resolvedEncoder = encoder.resolveAndBind( data.logicalPlan.output, data.sparkSession.sessionState.analyzer) val rdd = data.queryExecution.toRdd.map[T](resolvedEncoder.fromRow)(encoder.clsTag) val ds = data.sparkSession.createDataset(rdd)(encoder) batchWriter(ds, batchId) } override def toString(): String = "ForeachBatchSink" } trait PythonForeachBatchFunction { def call(batchDF: DataFrame, batchId: Long): Unit } object PythonForeachBatchHelper { def callForeachBatch(dsw: DataStreamWriter[Row], pythonFunc: PythonForeachBatchFunction): Unit = { dsw.foreachBatch(pythonFunc.call _) } }
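ForeachBatchSink is the internal sink behind DataStreamWriter.foreachBatch (Spark 2.4+). The sketch below shows the public API it implements; the rate source, trigger interval and output path are illustrative, and the batch function is bound to a val so the call compiles the same way under Scala 2.11 and 2.12.

import org.apache.spark.sql.{DataFrame, SparkSession}
import org.apache.spark.sql.streaming.Trigger

val spark = SparkSession.builder().master("local[*]").appName("foreach-batch-demo").getOrCreate()

val stream = spark.readStream.format("rate").option("rowsPerSecond", "5").load()

// Each micro-batch arrives as an ordinary DataFrame, so any batch writer can be used here.
val writeBatch: (DataFrame, Long) => Unit = (batchDF, batchId) =>
  batchDF.write.mode("append").parquet(s"/tmp/foreach-batch-demo/batch_$batchId")

val query = stream.writeStream
  .trigger(Trigger.ProcessingTime("10 seconds"))
  .foreachBatch(writeBatch)
  .start()

query.awaitTermination(30000)   // run for ~30s in this sketch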
Example 37
Source File: ObjectExpressionsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{IntegerType, ObjectType} class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-16622: The returned value of the called method in Invoke can be null") { val inputRow = InternalRow.fromSeq(Seq((false, null))) val cls = classOf[Tuple2[Boolean, java.lang.Integer]] val inputObject = BoundReference(0, ObjectType(cls), nullable = true) val invoke = Invoke(inputObject, "_2", IntegerType) checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow) } test("MapObjects should make copies of unsafe-backed data") { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) val structExpected = new GenericArrayData( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) val arrayExpected = new GenericArrayData( Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) // test UnsafeMap-backed data val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) val mapExpected = new GenericArrayData(Seq( new ArrayBasedMapData( new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(100, 200))), new ArrayBasedMapData( new GenericArrayData(Array(3, 4)), new GenericArrayData(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } }
Example 38
Source File: Encoders.scala From magellan with Apache License 2.0 | 5 votes |
package magellan.encoders import magellan._ import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions._ import org.apache.spark.sql.types._ import scala.reflect._ object Encoders { implicit def encoderForPoint: Encoder[Point] = { val sqlType = new PointUDT().sqlType ExpressionEncoder[Point]( schema = sqlType, flat = true, serializer = Seq( MagellanSerializer( BoundReference(0, ObjectType(classOf[Point]), nullable = true), sqlType)), deserializer = MagellanDeserializer( GetColumnByOrdinal(0, sqlType), classOf[Point]), clsTag = classTag[Point]) } implicit def encoderForPolygon: Encoder[Polygon] = { val sqlType = new PolygonUDT().sqlType ExpressionEncoder[Polygon]( schema = sqlType, flat = true, serializer = Seq( MagellanSerializer( BoundReference(0, ObjectType(classOf[Polygon]), nullable = true), sqlType)), deserializer = MagellanDeserializer( GetColumnByOrdinal(0, sqlType), classOf[Polygon]), clsTag = classTag[Polygon]) } implicit def encoderForPolyLine: Encoder[PolyLine] = { val sqlType = new PolyLineUDT().sqlType ExpressionEncoder[PolyLine]( schema = sqlType, flat = true, serializer = Seq( MagellanSerializer( BoundReference(0, ObjectType(classOf[PolyLine]), nullable = true), sqlType)), deserializer = MagellanDeserializer( GetColumnByOrdinal(0, sqlType), classOf[PolyLine]), clsTag = classTag[PolyLine]) } }
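With these implicit encoders in scope, magellan's UDT-backed geometry types can be used as first-class Dataset element types. A hedged sketch, assuming magellan's Point(x, y) factory; the coordinates and names are illustrative.

import magellan.Point
import magellan.encoders.Encoders._
import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("magellan-encoders-demo").getOrCreate()

// encoderForPoint is picked up implicitly from the imported object above.
val points = spark.createDataset(Seq(Point(-122.4194, 37.7749), Point(-73.9857, 40.7484)))
points.show()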
Example 39
Source File: ArangoSparkSSLReadTest.scala From arangodb-spark-connector with Apache License 2.0 | 5 votes |
package com.arangodb.spark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.scalatest.BeforeAndAfterAll import org.scalatest.BeforeAndAfterEach import org.scalatest.FunSuite import org.scalatest.Matchers import collection.JavaConverters._ import com.arangodb.ArangoDB import com.arangodb.ArangoDBException import com.arangodb.velocypack.VPackBuilder import com.arangodb.velocypack.ValueType import scala.reflect.ClassTag import com.arangodb.spark.rdd.partition.ArangoPartitionerSinglePartition import org.scalatest.Ignore @Ignore class ArangoSparkSSLReadTest extends FunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach with SharedSparkContextSSL { val DB = "spark_test_db" val COLLECTION = "spark_test_col" val arangoDB = new ArangoDB.Builder().build() override def beforeAll() { super.beforeAll() try { arangoDB.db(DB).drop() } catch { case e: ArangoDBException => } arangoDB.createDatabase(DB) arangoDB.db(DB).createCollection(COLLECTION) val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) }) ArangoSpark.save(documents, COLLECTION, WriteOptions(DB)) } override def afterAll() { try { arangoDB.db(DB).drop() arangoDB.shutdown() } finally { super.afterAll() } } test("load all documents from collection") { val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB)) rdd.count() should be(100) } test("load with custom partionier") { val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB, partitioner = new ArangoPartitionerSinglePartition())) rdd.count() should be(100) } test("load documents from collection with filter statement") { val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB)) val rdd2 = rdd.filter("doc.test <= 50") rdd2.count() should be(50) } }
Example 40
Source File: ArangoSparkSSLWriteTest.scala From arangodb-spark-connector with Apache License 2.0 | 5 votes |
package com.arangodb.spark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.scalatest.BeforeAndAfterAll import org.scalatest.BeforeAndAfterEach import org.scalatest.FunSuite import org.scalatest.Matchers import com.arangodb.ArangoDB import com.arangodb.ArangoDBException import com.arangodb.velocypack.VPackBuilder import com.arangodb.velocypack.ValueType import org.scalatest.Ignore @Ignore class ArangoSparkSSLWriteTest extends FunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach with SharedSparkContextSSL { val DB = "spark_test_db" val COLLECTION = "spark_test_col" val arangoDB = new ArangoDB.Builder().build() override def beforeAll() { super.beforeAll() try { arangoDB.db(DB).drop() } catch { case e: ArangoDBException => } arangoDB.createDatabase(DB) arangoDB.db(DB).createCollection(COLLECTION) } override def afterAll() { try { arangoDB.db(DB).drop() arangoDB.shutdown() } finally { super.afterAll() } } override def afterEach() { arangoDB.db(DB).collection(COLLECTION).truncate() } private def checkDocumentCount(count: Int) { arangoDB.db(DB).collection(COLLECTION).count().getCount should be(count) } test("save RDD to ArangoDB") { checkDocumentCount(0) val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) }) ArangoSpark.save(documents, COLLECTION, WriteOptions(DB)) checkDocumentCount(100) } test("save RDD[VPackSlice] to ArangoDB") { checkDocumentCount(0) val documents = sc.parallelize((1 to 100).map { i => new VPackBuilder().add(ValueType.OBJECT).add("test", Integer.valueOf(i)).close().slice() }) ArangoSpark.save(documents, COLLECTION, WriteOptions(DB)) checkDocumentCount(100) } test("save DataFrame to ArangoDB") { checkDocumentCount(0) val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) }) val sql: SQLContext = SQLContext.getOrCreate(sc); val df = sql.createDataFrame(documents, classOf[TestEntity]) ArangoSpark.saveDF(df, COLLECTION, WriteOptions(DB)) checkDocumentCount(100) } test("save Dataset to ArangoDB") { checkDocumentCount(0) val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) }) val sql: SQLContext = SQLContext.getOrCreate(sc); val encoder = ExpressionEncoder.javaBean(classOf[TestEntity]) val ds = sql.createDataset(documents)(encoder); ArangoSpark.save(ds, COLLECTION, WriteOptions(DB)) checkDocumentCount(100) } }
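The Dataset test above builds its encoder with ExpressionEncoder.javaBean; the supported public entry point for the same thing is Encoders.bean. A sketch with an illustrative JavaBean-style class (Person is not part of the original project):

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

class Person extends Serializable {
  private var name: String = _
  private var age: Int = _
  def getName: String = name
  def setName(n: String): Unit = { name = n }
  def getAge: Int = age
  def setAge(a: Int): Unit = { age = a }
}

val spark = SparkSession.builder().master("local[*]").appName("bean-encoder-demo").getOrCreate()

// Encoders.bean derives the schema from the getter/setter pairs of the class.
val beanEncoder: Encoder[Person] = Encoders.bean(classOf[Person])

val alice = new Person(); alice.setName("alice"); alice.setAge(29)
val people = spark.createDataset(Seq(alice))(beanEncoder)
people.show()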
Example 41
Source File: ArangoSparkReadTest.scala From arangodb-spark-connector with Apache License 2.0 | 5 votes |
package com.arangodb.spark import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.sql.SQLContext import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.scalatest.BeforeAndAfterAll import org.scalatest.BeforeAndAfterEach import org.scalatest.FunSuite import org.scalatest.Matchers import collection.JavaConverters._ import com.arangodb.ArangoDB import com.arangodb.ArangoDBException import com.arangodb.velocypack.VPackBuilder import com.arangodb.velocypack.ValueType import scala.reflect.ClassTag import com.arangodb.spark.rdd.partition.ArangoPartitionerSinglePartition import org.scalatest.Ignore import com.arangodb.entity.LoadBalancingStrategy class ArangoSparkReadTest extends FunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach with SharedSparkContext { val DB = "spark_test_db" val COLLECTION = "spark_test_col" val arangoDB = new ArangoDB.Builder().build() override def beforeAll() { super.beforeAll() try { arangoDB.db(DB).drop() } catch { case e: ArangoDBException => } arangoDB.createDatabase(DB) arangoDB.db(DB).createCollection(COLLECTION) val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) }) ArangoSpark.save(documents, COLLECTION, WriteOptions(DB)) } override def afterAll() { try { arangoDB.db(DB).drop() arangoDB.shutdown() } finally { super.afterAll() } } test("load all documents from collection") { val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB)) rdd.count() should be(100) } test("load with custom partionier") { val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB, partitioner = new ArangoPartitionerSinglePartition())) rdd.count() should be(100) } test("load documents from collection with filter statement") { val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB)) val rdd2 = rdd.filter("doc.test <= 50") rdd2.count() should be(50) } test("load all documents from collection with load balancing") { val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB).acquireHostList(false).loadBalancingStrategy(LoadBalancingStrategy.ROUND_ROBIN)) rdd.count() should be(100) } }
Example 42
Source File: ObjectExpressionsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.expressions.objects.Invoke import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData} import org.apache.spark.sql.types.{IntegerType, ObjectType} class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper { test("SPARK-16622: The returned value of the called method in Invoke can be null") { val inputRow = InternalRow.fromSeq(Seq((false, null))) val cls = classOf[Tuple2[Boolean, java.lang.Integer]] val inputObject = BoundReference(0, ObjectType(cls), nullable = true) val invoke = Invoke(inputObject, "_2", IntegerType) checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow) } test("MapObjects should make copies of unsafe-backed data") { // test UnsafeRow-backed data val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]] val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4)))) val structExpected = new GenericArrayData( Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4)))) checkEvalutionWithUnsafeProjection( structEncoder.serializer.head, structExpected, structInputRow) // test UnsafeArray-backed data val arrayEncoder = ExpressionEncoder[Array[Array[Int]]] val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4)))) val arrayExpected = new GenericArrayData( Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4)))) checkEvalutionWithUnsafeProjection( arrayEncoder.serializer.head, arrayExpected, arrayInputRow) // test UnsafeMap-backed data val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]] val mapInputRow = InternalRow.fromSeq(Seq(Array( Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400)))) val mapExpected = new GenericArrayData(Seq( new ArrayBasedMapData( new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(100, 200))), new ArrayBasedMapData( new GenericArrayData(Array(3, 4)), new GenericArrayData(Array(300, 400))))) checkEvalutionWithUnsafeProjection( mapEncoder.serializer.head, mapExpected, mapInputRow) } }
Example 43
Source File: EliminateSerializationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.catalyst.optimizer import scala.reflect.runtime.universe.TypeTag import org.apache.spark.sql.catalyst.dsl.expressions._ import org.apache.spark.sql.catalyst.dsl.plans._ import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.catalyst.plans.logical._ import org.apache.spark.sql.catalyst.plans.PlanTest import org.apache.spark.sql.catalyst.rules.RuleExecutor case class OtherTuple(_1: Int, _2: Int) class EliminateSerializationSuite extends PlanTest { private object Optimize extends RuleExecutor[LogicalPlan] { val batches = Batch("Serialization", FixedPoint(100), EliminateSerialization) :: Nil } implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]() implicit private def intEncoder = ExpressionEncoder[Int]() test("back to back serialization") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) val expected = input.select('obj.as("obj")).analyze comparePlans(optimized, expected) } test("back to back serialization with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } test("back to back serialization in AppendColumns") { val input = LocalRelation('obj.obj(classOf[(Int, Int)])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze val optimized = Optimize.execute(plan) val expected = AppendColumnsWithObject( func.asInstanceOf[Any => Any], productEncoder[(Int, Int)].namedExpressions, intEncoder.namedExpressions, input).analyze comparePlans(optimized, expected) } test("back to back serialization in AppendColumns with object change") { val input = LocalRelation('obj.obj(classOf[OtherTuple])) val func = (item: (Int, Int)) => item._1 val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze val optimized = Optimize.execute(plan) comparePlans(optimized, plan) } }
Example 44
Source File: ReduceAggregator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.sql.Encoder import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T) extends Aggregator[T, (Boolean, T), T] { private val encoder = implicitly[Encoder[T]] override def zero: (Boolean, T) = (false, null.asInstanceOf[T]) override def bufferEncoder: Encoder[(Boolean, T)] = ExpressionEncoder.tuple( ExpressionEncoder[Boolean](), encoder.asInstanceOf[ExpressionEncoder[T]]) override def outputEncoder: Encoder[T] = encoder override def reduce(b: (Boolean, T), a: T): (Boolean, T) = { if (b._1) { (true, func(b._2, a)) } else { (true, a) } } override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = { if (!b1._1) { b2 } else if (!b2._1) { b1 } else { (true, func(b1._2, b2._2)) } } override def finish(reduction: (Boolean, T)): T = { if (!reduction._1) { throw new IllegalStateException("ReduceAggregator requires at least one input row") } reduction._2 } }
Example 45
Source File: typedaggregators.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.aggregate import org.apache.spark.api.java.function.MapFunction import org.apache.spark.sql.{Encoder, TypedColumn} import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder import org.apache.spark.sql.expressions.Aggregator //////////////////////////////////////////////////////////////////////////////////////////////////// // This file defines internal implementations for aggregators. //////////////////////////////////////////////////////////////////////////////////////////////////// class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] { override def zero: Double = 0.0 override def reduce(b: Double, a: IN): Double = b + f(a) override def merge(b1: Double, b2: Double): Double = b1 + b2 override def finish(reduction: Double): Double = reduction override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } } class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] { override def zero: Long = 0L override def reduce(b: Long, a: IN): Long = b + f(a) override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long]) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] { override def zero: Long = 0 override def reduce(b: Long, a: IN): Long = { if (f(a) == null) b else b + 1 } override def merge(b1: Long, b2: Long): Long = b1 + b2 override def finish(reduction: Long): Long = reduction override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]() override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]() // Java api support def this(f: MapFunction[IN, Object]) = this(x => f.call(x)) def toColumnJava: TypedColumn[IN, java.lang.Long] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]] } } class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] { override def zero: (Double, Long) = (0.0, 0L) override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2) override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2 override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = { (b1._1 + b2._1, b1._2 + b2._2) } override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]() override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]() // Java api support def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double]) def toColumnJava: TypedColumn[IN, java.lang.Double] = { toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]] } }
Example 46
Source File: ReduceAggregatorSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.expressions import org.apache.spark.SparkFunSuite import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder class ReduceAggregatorSuite extends SparkFunSuite { test("zero value") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) assert(aggregator.zero == (false, null)) } test("reduce, merge and finish") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) val firstReduce = aggregator.reduce(aggregator.zero, 1) assert(firstReduce == (true, 1)) val secondReduce = aggregator.reduce(firstReduce, 2) assert(secondReduce == (true, 3)) val thirdReduce = aggregator.reduce(secondReduce, 3) assert(thirdReduce == (true, 6)) val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce) assert(mergeWithZero1 == (true, 1)) val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero) assert(mergeWithZero2 == (true, 3)) val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce) assert(mergeTwoReduced == (true, 4)) assert(aggregator.finish(firstReduce)== 1) assert(aggregator.finish(secondReduce) == 3) assert(aggregator.finish(thirdReduce) == 6) assert(aggregator.finish(mergeWithZero1) == 1) assert(aggregator.finish(mergeWithZero2) == 3) assert(aggregator.finish(mergeTwoReduced) == 4) } test("requires at least one input row") { val encoder: ExpressionEncoder[Int] = ExpressionEncoder() val func = (v1: Int, v2: Int) => v1 + v2 val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt) intercept[IllegalStateException] { aggregator.finish(aggregator.zero) } } }
Example 47
Source File: RowStreamParserImp.scala From carbondata with Apache License 2.0 | 5 votes |
package org.apache.carbondata.streaming.parser import java.text.SimpleDateFormat import java.util import org.apache.hadoop.conf.Configuration import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder} import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType import org.apache.carbondata.core.constants.CarbonCommonConstants import org.apache.carbondata.processing.loading.ComplexDelimitersEnum import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants class RowStreamParserImp extends CarbonStreamParser { var configuration: Configuration = null var isVarcharTypeMapping: Array[Boolean] = null var structType: StructType = null var encoder: ExpressionEncoder[Row] = null var timeStampFormat: SimpleDateFormat = null var dateFormat: SimpleDateFormat = null var complexDelimiters: util.ArrayList[String] = new util.ArrayList[String]() var serializationNullFormat: String = null override def initialize(configuration: Configuration, structType: StructType, isVarcharTypeMapping: Array[Boolean]): Unit = { this.configuration = configuration this.structType = structType this.encoder = RowEncoder.apply(this.structType).resolveAndBind() this.isVarcharTypeMapping = isVarcharTypeMapping this.timeStampFormat = new SimpleDateFormat( this.configuration.get(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT)) this.dateFormat = new SimpleDateFormat( this.configuration.get(CarbonCommonConstants.CARBON_DATE_FORMAT)) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_1")) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_2")) this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_3")) this.complexDelimiters.add(ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_4.value()) this.serializationNullFormat = this.configuration.get(DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT) } override def parserRow(value: InternalRow): Array[Object] = { this.encoder.fromRow(value).toSeq.zipWithIndex.map { case (x, i) => FieldConverter.objectToString( x, serializationNullFormat, complexDelimiters, timeStampFormat, dateFormat, isVarcharType = i < this.isVarcharTypeMapping.length && this.isVarcharTypeMapping(i), binaryCodec = null) } }.toArray override def close(): Unit = { } }