org.apache.spark.sql.catalyst.encoders.ExpressionEncoder Scala Examples

The following examples show how to use org.apache.spark.sql.catalyst.encoders.ExpressionEncoder. Each example is taken from an open-source project; the source file, project, and license are noted in the heading above each listing.
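Before the examples, here is a minimal, self-contained sketch of deriving an encoder for a case class and round-tripping a value through Catalyst's internal row format. It assumes the Spark 2.x ExpressionEncoder API used throughout the listings below (toRow/fromRow); the Person case class and object names are purely illustrative.

import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

// Hypothetical case class, used only to illustrate encoder derivation.
case class Person(name: String, age: Int)

object ExpressionEncoderDemo extends App {
  // Derive an encoder from the case class's TypeTag.
  val encoder = ExpressionEncoder[Person]()

  // Resolve and bind once; reuse the bound copy for conversions.
  val bound = encoder.resolveAndBind()

  // Object -> InternalRow -> object round trip (Spark 2.x toRow/fromRow API).
  val internalRow = bound.toRow(Person("Ada", 36))
  val person = bound.fromRow(internalRow)
  println(person) // Person(Ada,36)
}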
Example 1
Source File: SparkPFASuiteBase.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.pfa

import com.holdenkarau.spark.testing.DataFrameSuiteBase
import org.apache.spark.SparkConf
import org.apache.spark.ml.Transformer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.udf
import org.apache.spark.sql.{DataFrame, Row, SparkSession}
import org.scalactic.Equality
import org.scalatest.FunSuite

abstract class SparkPFASuiteBase extends FunSuite with DataFrameSuiteBase with PFATestUtils {

  val sparkTransformer: Transformer
  val input: Array[String]
  val expectedOutput: Array[String]

  val sparkConf = new SparkConf().
    setMaster("local[*]").
    setAppName("test").
    set("spark.ui.enabled", "false").
    set("spark.app.id", appID).
    set("spark.driver.host", "localhost")
  override lazy val spark = SparkSession.builder().config(sparkConf).getOrCreate()
  override val reuseContextIfPossible = true

  // Converts column containing a vector to an array
  def withColumnAsArray(df: DataFrame, colName: String) = {
    val vecToArray = udf { v: Vector => v.toArray }
    df.withColumn(colName, vecToArray(df(colName)))
  }

  def withColumnAsArray(df: DataFrame, first: String, others: String*) = {
    val vecToArray = udf { v: Vector => v.toArray }
    var result = df.withColumn(first, vecToArray(df(first)))
    others.foreach(c => result = result.withColumn(c, vecToArray(df(c))))
    result
  }

  // Converts column containing a vector to a sparse vector represented as a map
  def getColumnAsSparseVectorMap(df: DataFrame, colName: String) = {
    val vecToMap = udf { v: Vector => v.toSparse.indices.map(i => (i.toString, v(i))).toMap }
    df.withColumn(colName, vecToMap(df(colName)))
  }

}

abstract class Result

object ApproxEquality extends ApproxEquality

trait ApproxEquality {

  import org.scalactic.Tolerance._
  import org.scalactic.TripleEquals._

  implicit val seqApproxEq: Equality[Seq[Double]] = new Equality[Seq[Double]] {
    override def areEqual(a: Seq[Double], b: Any): Boolean = {
      b match {
        case d: Seq[Double] =>
          a.zip(d).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }

  implicit val vectorApproxEq: Equality[Vector] = new Equality[Vector] {
    override def areEqual(a: Vector, b: Any): Boolean = {
      b match {
        case v: Vector =>
          a.toArray.zip(v.toArray).forall { case (l, r) => l === r +- 0.001 }
        case _ =>
          false
      }
    }
  }
} 
Example 2
Source File: DescribeDeltaHistoryCommand.scala    From delta   with Apache License 2.0
package org.apache.spark.sql.delta.commands

// scalastyle:off import.ordering.noEmptyLine
import org.apache.spark.sql.delta.{DeltaErrors, DeltaLog, DeltaTableIdentifier}
import org.apache.spark.sql.delta.actions.CommitInfo
import org.apache.spark.sql.delta.metering.DeltaLogging
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.catalog.CatalogTableType
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LeafNode, LogicalPlan, Statistics}
import org.apache.spark.sql.execution.command.RunnableCommand


case class DescribeDeltaHistoryCommand(
    path: Option[String],
    tableIdentifier: Option[TableIdentifier],
    limit: Option[Int],
    override val output: Seq[Attribute] = ExpressionEncoder[CommitInfo]().schema.toAttributes)
  extends RunnableCommand with DeltaLogging {

  override def run(sparkSession: SparkSession): Seq[Row] = {
    val basePath =
      if (path.nonEmpty) {
        new Path(path.get)
      } else if (tableIdentifier.nonEmpty) {
        val sessionCatalog = sparkSession.sessionState.catalog
        lazy val metadata = sessionCatalog.getTableMetadata(tableIdentifier.get)

        DeltaTableIdentifier(sparkSession, tableIdentifier.get) match {
          case Some(id) if id.path.nonEmpty =>
            new Path(id.path.get)
          case Some(id) if id.table.nonEmpty =>
            new Path(metadata.location)
          case _ =>
            if (metadata.tableType == CatalogTableType.VIEW) {
              throw DeltaErrors.describeViewHistory
            }
            throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
        }
      } else {
        throw DeltaErrors.missingTableIdentifierException("DESCRIBE HISTORY")
      }

    // Max array size
    if (limit.exists(_ > Int.MaxValue - 8)) {
      throw new IllegalArgumentException("Please use a limit less than Int.MaxValue - 8.")
    }

    val deltaLog = DeltaLog.forTable(sparkSession, basePath)
    recordDeltaOperation(deltaLog, "delta.ddl.describeHistory") {
      if (deltaLog.snapshot.version == -1) {
        throw DeltaErrors.notADeltaTableException("DESCRIBE HISTORY")
      }

      import sparkSession.implicits._
      deltaLog.history.getHistory(limit).toDF().collect().toSeq
    }
  }
} 
Example 3
Source File: implicits.scala    From spark-dynamodb   with Apache License 2.0
package com.audienceproject.spark.dynamodb

import com.audienceproject.spark.dynamodb.reflect.SchemaAnalysis
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.types.StructField

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

object implicits {

    implicit class DynamoDBDataFrameReader(reader: DataFrameReader) {

        def dynamodb(tableName: String): DataFrame =
            getDynamoDBSource(tableName).load()

        def dynamodb(tableName: String, indexName: String): DataFrame =
            getDynamoDBSource(tableName).option("indexName", indexName).load()

        def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String): Dataset[T] = {
            implicit val encoder: Encoder[T] = ExpressionEncoder()
            getColumnsAlias(getDynamoDBSource(tableName)
                .schema(SchemaAnalysis[T]).load()).as
        }

        def dynamodbAs[T <: Product : ClassTag : TypeTag](tableName: String, indexName: String): Dataset[T] = {
            implicit val encoder: Encoder[T] = ExpressionEncoder()
            getColumnsAlias(getDynamoDBSource(tableName)
                .option("indexName", indexName)
                .schema(SchemaAnalysis[T]).load()).as
        }

        private def getDynamoDBSource(tableName: String): DataFrameReader =
            reader.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName)

        private def getColumnsAlias(dataFrame: DataFrame): DataFrame = {
            val columnsAlias = dataFrame.schema.collect({
                case StructField(name, _, _, metadata) if metadata.contains("alias") =>
                    col(name).as(metadata.getString("alias"))
                case StructField(name, _, _, _) =>
                    col(name)
            })
            dataFrame.select(columnsAlias: _*)
        }

    }

    implicit class DynamoDBDataFrameWriter[T](writer: DataFrameWriter[T]) {

        def dynamodb(tableName: String): Unit =
            writer.format("com.audienceproject.spark.dynamodb.datasource").option("tableName", tableName).save()

    }

} 
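For context, the implicit classes above extend DataFrameReader and DataFrameWriter, so a caller only needs the implicits import. The sketch below is illustrative, not from the project: the "users" table name and the User case class are hypothetical, and it assumes the spark-dynamodb data source and credentials are available on the classpath.

import com.audienceproject.spark.dynamodb.implicits._
import org.apache.spark.sql.SparkSession

// Hypothetical item shape; field names would have to match the table's attributes.
case class User(userId: String, age: Int)

object DynamoDbReadDemo extends App {
  val spark = SparkSession.builder().master("local[*]").appName("dynamodb-demo").getOrCreate()

  // Untyped read of a hypothetical "users" table via the implicit DataFrameReader extension.
  val usersDf = spark.read.dynamodb("users")

  // Typed read: dynamodbAs derives an ExpressionEncoder for User and aligns aliased columns.
  val users = spark.read.dynamodbAs[User]("users")

  usersDf.printSchema()
  users.show()

  spark.stop()
}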
Example 4
Source File: SparkBindings.scala    From mmlspark   with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.core.schema

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType

import scala.reflect.runtime.universe.TypeTag

abstract class SparkBindings[T: TypeTag] extends Serializable {

  lazy val schema: StructType = enc.schema
  private lazy val enc: ExpressionEncoder[T] = ExpressionEncoder[T]().resolveAndBind()
  private lazy val rowEnc: ExpressionEncoder[Row] = RowEncoder(enc.schema).resolveAndBind()

  // WARNING: each time you use this function on a dataframe, you should make a new converter.
  // Spark does some magic that makes this leak memory if re-used on a
  // different symbolic node of the parallel computation. That being said,
  // you should make a single converter before using it in a udf so
  // that the slow resolving and binding is not in the hotpath
  def makeFromRowConverter: Row => T = {
    val enc1 = enc.resolveAndBind()
    val rowEnc1 = rowEnc.resolveAndBind();
    { r: Row => enc1.fromRow(rowEnc1.toRow(r)) }
  }

  def makeFromInternalRowConverter: InternalRow => T = {
    val enc1 = enc.resolveAndBind();
    { r: InternalRow => enc1.fromRow(r) }
  }

  def makeToRowConverter: T => Row = {
    val enc1 = enc.resolveAndBind()
    val rowEnc1 = rowEnc.resolveAndBind();
    { v: T => rowEnc1.fromRow(enc1.toRow(v)) }
  }

  def makeToInternalRowConverter: T => InternalRow = {
    val enc1 = enc.resolveAndBind();
    { v: T => enc1.toRow(v) }
  }

} 
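To make the intended usage of SparkBindings concrete, here is a hedged sketch under the assumption that a companion object extends SparkBindings for its case class. The Detection type and the demo object are hypothetical; converters are built once up front, as the WARNING comment in the source advises.

import com.microsoft.ml.spark.core.schema.SparkBindings
import org.apache.spark.sql.Row

// Hypothetical payload type; SparkBindings derives its schema and converters from the case class.
case class Detection(label: String, score: Double)
object Detection extends SparkBindings[Detection]

object SparkBindingsDemo extends App {
  // Schema derived from the case class through the underlying ExpressionEncoder.
  println(Detection.schema)

  // Build converters once (resolving and binding is the slow part), then reuse them per row.
  val toRow: Detection => Row = Detection.makeToRowConverter
  val fromRow: Row => Detection = Detection.makeFromRowConverter

  val row = toRow(Detection("cat", 0.87))
  println(fromRow(row)) // Detection(cat,0.87)
}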
Example 5
Source File: ObjectExpressionsSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.objects.Invoke
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{IntegerType, ObjectType}


class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("SPARK-16622: The returned value of the called method in Invoke can be null") {
    val inputRow = InternalRow.fromSeq(Seq((false, null)))
    val cls = classOf[Tuple2[Boolean, java.lang.Integer]]
    val inputObject = BoundReference(0, ObjectType(cls), nullable = true)
    val invoke = Invoke(inputObject, "_2", IntegerType)
    checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow)
  }

  test("MapObjects should make copies of unsafe-backed data") {
    // test UnsafeRow-backed data
    val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]]
    val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4))))
    val structExpected = new GenericArrayData(
      Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4))))
    checkEvalutionWithUnsafeProjection(
      structEncoder.serializer.head, structExpected, structInputRow)

    // test UnsafeArray-backed data
    val arrayEncoder = ExpressionEncoder[Array[Array[Int]]]
    val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4))))
    val arrayExpected = new GenericArrayData(
      Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4))))
    checkEvalutionWithUnsafeProjection(
      arrayEncoder.serializer.head, arrayExpected, arrayInputRow)

    // test UnsafeMap-backed data
    val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]]
    val mapInputRow = InternalRow.fromSeq(Seq(Array(
      Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400))))
    val mapExpected = new GenericArrayData(Seq(
      new ArrayBasedMapData(
        new GenericArrayData(Array(1, 2)),
        new GenericArrayData(Array(100, 200))),
      new ArrayBasedMapData(
        new GenericArrayData(Array(3, 4)),
        new GenericArrayData(Array(300, 400)))))
    checkEvalutionWithUnsafeProjection(
      mapEncoder.serializer.head, mapExpected, mapInputRow)
  }
} 
Example 6
Source File: EliminateSerializationSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.rules.RuleExecutor

case class OtherTuple(_1: Int, _2: Int)

class EliminateSerializationSuite extends PlanTest {
  private object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Serialization", FixedPoint(100),
        EliminateSerialization) :: Nil
  }

  implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]()
  implicit private def intEncoder = ExpressionEncoder[Int]()

  test("back to back serialization") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    val expected = input.select('obj.as("obj")).analyze
    comparePlans(optimized, expected)
  }

  test("back to back serialization with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }

  test("back to back serialization in AppendColumns") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze

    val optimized = Optimize.execute(plan)

    val expected = AppendColumnsWithObject(
      func.asInstanceOf[Any => Any],
      productEncoder[(Int, Int)].namedExpressions,
      intEncoder.namedExpressions,
      input).analyze

    comparePlans(optimized, expected)
  }

  test("back to back serialization in AppendColumns with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze

    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }
} 
Example 7
Source File: ReduceAggregator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
Example 8
Source File: typedaggregators.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
Example 9
Source File: ReduceAggregatorSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 10
Source File: TypedExpressionEncoder.scala    From frameless   with Apache License 2.0
package frameless

import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.{BoundReference, CreateNamedStruct, If, Literal}
import org.apache.spark.sql.types.StructType

object TypedExpressionEncoder {

  
  def targetStructType[A](encoder: TypedEncoder[A]): StructType = {
    encoder.catalystRepr match {
      case x: StructType =>
        if (encoder.nullable) StructType(x.fields.map(_.copy(nullable = true)))
        else x
      case dt => new StructType().add("_1", dt, nullable = encoder.nullable)
    }
  }

  def apply[T: TypedEncoder]: ExpressionEncoder[T] = {
    val encoder = TypedEncoder[T]
    val schema = targetStructType(encoder)

    val in = BoundReference(0, encoder.jvmRepr, encoder.nullable)

    val (out, toRowExpressions) = encoder.toCatalyst(in) match {
      case If(_, _, x: CreateNamedStruct) =>
        val out = BoundReference(0, encoder.catalystRepr, encoder.nullable)

        (out, x.flatten)
      case other =>
        val out = GetColumnByOrdinal(0, encoder.catalystRepr)

        (out, CreateNamedStruct(Literal("_1") :: other :: Nil).flatten)
    }

    new ExpressionEncoder[T](
      schema = schema,
      flat = false,
      serializer = toRowExpressions,
      deserializer = encoder.fromCatalyst(out),
      clsTag = encoder.classTag
    )
  }
} 
Example 11
Source File: TopByKeyAggregator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.ml.recommendation

import scala.language.implicitConversions
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.util.BoundedPriorityQueue



private[recommendation] class TopByKeyAggregator[K1: TypeTag, K2: TypeTag, V: TypeTag]
  (num: Int, ord: Ordering[(K2, V)])
  extends Aggregator[(K1, K2, V), BoundedPriorityQueue[(K2, V)], Array[(K2, V)]] {

  override def zero: BoundedPriorityQueue[(K2, V)] = new BoundedPriorityQueue[(K2, V)](num)(ord)

  override def reduce(
      q: BoundedPriorityQueue[(K2, V)],
      a: (K1, K2, V)): BoundedPriorityQueue[(K2, V)] = {
    q += {(a._2, a._3)}
  }

  override def merge(
      q1: BoundedPriorityQueue[(K2, V)],
      q2: BoundedPriorityQueue[(K2, V)]): BoundedPriorityQueue[(K2, V)] = {
    q1 ++= q2
  }

  override def finish(r: BoundedPriorityQueue[(K2, V)]): Array[(K2, V)] = {
    r.toArray.sorted(ord.reverse)
  }

  override def bufferEncoder: Encoder[BoundedPriorityQueue[(K2, V)]] = {
    Encoders.kryo[BoundedPriorityQueue[(K2, V)]]
  }

  override def outputEncoder: Encoder[Array[(K2, V)]] = ExpressionEncoder[Array[(K2, V)]]()
} 
Example 12
Source File: ObjectExpressionsSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.objects.Invoke
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{IntegerType, ObjectType}


class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("SPARK-16622: The returned value of the called method in Invoke can be null") {
    val inputRow = InternalRow.fromSeq(Seq((false, null)))
    val cls = classOf[Tuple2[Boolean, java.lang.Integer]]
    val inputObject = BoundReference(0, ObjectType(cls), nullable = true)
    val invoke = Invoke(inputObject, "_2", IntegerType)
    checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow)
  }

  test("MapObjects should make copies of unsafe-backed data") {
    // test UnsafeRow-backed data
    val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]]
    val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4))))
    val structExpected = new GenericArrayData(
      Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4))))
    checkEvalutionWithUnsafeProjection(
      structEncoder.serializer.head, structExpected, structInputRow)

    // test UnsafeArray-backed data
    val arrayEncoder = ExpressionEncoder[Array[Array[Int]]]
    val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4))))
    val arrayExpected = new GenericArrayData(
      Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4))))
    checkEvalutionWithUnsafeProjection(
      arrayEncoder.serializer.head, arrayExpected, arrayInputRow)

    // test UnsafeMap-backed data
    val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]]
    val mapInputRow = InternalRow.fromSeq(Seq(Array(
      Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400))))
    val mapExpected = new GenericArrayData(Seq(
      new ArrayBasedMapData(
        new GenericArrayData(Array(1, 2)),
        new GenericArrayData(Array(100, 200))),
      new ArrayBasedMapData(
        new GenericArrayData(Array(3, 4)),
        new GenericArrayData(Array(300, 400)))))
    checkEvalutionWithUnsafeProjection(
      mapEncoder.serializer.head, mapExpected, mapInputRow)
  }
} 
Example 13
Source File: EliminateMapObjectsSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.objects.Invoke
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{DeserializeToObject, LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.types._

class EliminateMapObjectsSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = {
      Batch("EliminateMapObjects", FixedPoint(50),
        NullPropagation,
        SimplifyCasts,
        EliminateMapObjects) :: Nil
    }
  }

  implicit private def intArrayEncoder = ExpressionEncoder[Array[Int]]()
  implicit private def doubleArrayEncoder = ExpressionEncoder[Array[Double]]()

  test("SPARK-20254: Remove unnecessary data conversion for primitive array") {
    val intObjType = ObjectType(classOf[Array[Int]])
    val intInput = LocalRelation('a.array(ArrayType(IntegerType, false)))
    val intQuery = intInput.deserialize[Array[Int]].analyze
    val intOptimized = Optimize.execute(intQuery)
    val intExpected = DeserializeToObject(
      Invoke(intInput.output(0), "toIntArray", intObjType, Nil, true, false),
      AttributeReference("obj", intObjType, true)(), intInput)
    comparePlans(intOptimized, intExpected)

    val doubleObjType = ObjectType(classOf[Array[Double]])
    val doubleInput = LocalRelation('a.array(ArrayType(DoubleType, false)))
    val doubleQuery = doubleInput.deserialize[Array[Double]].analyze
    val doubleOptimized = Optimize.execute(doubleQuery)
    val doubleExpected = DeserializeToObject(
      Invoke(doubleInput.output(0), "toDoubleArray", doubleObjType, Nil, true, false),
      AttributeReference("obj", doubleObjType, true)(), doubleInput)
    comparePlans(doubleOptimized, doubleExpected)
  }
} 
Example 14
Source File: EliminateSerializationSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.catalyst.optimizer

import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.RuleExecutor

case class OtherTuple(_1: Int, _2: Int)

class EliminateSerializationSuite extends PlanTest {
  private object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Serialization", FixedPoint(100),
        EliminateSerialization) :: Nil
  }

  implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]()
  implicit private def intEncoder = ExpressionEncoder[Int]()

  test("back to back serialization") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    val expected = input.select('obj.as("obj")).analyze
    comparePlans(optimized, expected)
  }

  test("back to back serialization with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }

  test("back to back serialization in AppendColumns") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze

    val optimized = Optimize.execute(plan)

    val expected = AppendColumnsWithObject(
      func.asInstanceOf[Any => Any],
      productEncoder[(Int, Int)].namedExpressions,
      intEncoder.namedExpressions,
      input).analyze

    comparePlans(optimized, expected)
  }

  test("back to back serialization in AppendColumns with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze

    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }
} 
Example 15
Source File: ReduceAggregator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  @transient private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
Example 16
Source File: typedaggregators.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this((x: IN) => f.call(x).asInstanceOf[Any])
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
Example 17
Source File: DataSourceV2ScanExec.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.execution.datasources.v2

import scala.collection.JavaConverters._

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.physical
import org.apache.spark.sql.execution.{ColumnarBatchScan, LeafExecNode, WholeStageCodegenExec}
import org.apache.spark.sql.execution.streaming.continuous._
import org.apache.spark.sql.sources.v2.reader._
import org.apache.spark.sql.sources.v2.reader.streaming.ContinuousReader
import org.apache.spark.sql.types.StructType


case class DataSourceV2ScanExec(
    output: Seq[AttributeReference],
    @transient reader: DataSourceReader)
  extends LeafExecNode with DataSourceReaderHolder with ColumnarBatchScan {

  override def canEqual(other: Any): Boolean = other.isInstanceOf[DataSourceV2ScanExec]

  override def outputPartitioning: physical.Partitioning = reader match {
    case s: SupportsReportPartitioning =>
      new DataSourcePartitioning(
        s.outputPartitioning(), AttributeMap(output.map(a => a -> a.name)))

    case _ => super.outputPartitioning
  }

  private lazy val readerFactories: java.util.List[DataReaderFactory[UnsafeRow]] = reader match {
    case r: SupportsScanUnsafeRow => r.createUnsafeRowReaderFactories()
    case _ =>
      reader.createDataReaderFactories().asScala.map {
        new RowToUnsafeRowDataReaderFactory(_, reader.readSchema()): DataReaderFactory[UnsafeRow]
      }.asJava
  }

  private lazy val inputRDD: RDD[InternalRow] = reader match {
    case r: SupportsScanColumnarBatch if r.enableBatchRead() =>
      assert(!reader.isInstanceOf[ContinuousReader],
        "continuous stream reader does not support columnar read yet.")
      new DataSourceRDD(sparkContext, r.createBatchDataReaderFactories())
        .asInstanceOf[RDD[InternalRow]]

    case _: ContinuousReader =>
      EpochCoordinatorRef.get(
          sparkContext.getLocalProperty(ContinuousExecution.EPOCH_COORDINATOR_ID_KEY),
          sparkContext.env)
        .askSync[Unit](SetReaderPartitions(readerFactories.size()))
      new ContinuousDataSourceRDD(sparkContext, sqlContext, readerFactories)
        .asInstanceOf[RDD[InternalRow]]

    case _ =>
      new DataSourceRDD(sparkContext, readerFactories).asInstanceOf[RDD[InternalRow]]
  }

  override def inputRDDs(): Seq[RDD[InternalRow]] = Seq(inputRDD)

  override val supportsBatch: Boolean = reader match {
    case r: SupportsScanColumnarBatch if r.enableBatchRead() => true
    case _ => false
  }

  override protected def needsUnsafeRowConversion: Boolean = false

  override protected def doExecute(): RDD[InternalRow] = {
    if (supportsBatch) {
      WholeStageCodegenExec(this)(codegenStageId = 0).execute()
    } else {
      val numOutputRows = longMetric("numOutputRows")
      inputRDD.map { r =>
        numOutputRows += 1
        r
      }
    }
  }
}

class RowToUnsafeRowDataReaderFactory(rowReaderFactory: DataReaderFactory[Row], schema: StructType)
  extends DataReaderFactory[UnsafeRow] {

  override def preferredLocations: Array[String] = rowReaderFactory.preferredLocations

  override def createDataReader: DataReader[UnsafeRow] = {
    new RowToUnsafeDataReader(
      rowReaderFactory.createDataReader, RowEncoder.apply(schema).resolveAndBind())
  }
}

class RowToUnsafeDataReader(val rowReader: DataReader[Row], encoder: ExpressionEncoder[Row])
  extends DataReader[UnsafeRow] {

  override def next: Boolean = rowReader.next

  override def get: UnsafeRow = encoder.toRow(rowReader.get).asInstanceOf[UnsafeRow]

  override def close(): Unit = rowReader.close()
} 
Example 18
Source File: ReduceAggregatorSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)])
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == ((true, 1)))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == ((true, 3)))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == ((true, 6)))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == ((true, 1)))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == ((true, 3)))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == ((true, 4)))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 19
Source File: SparkAvroDecoder.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T]                           = implicitly[Encoder[T]]
  val sqlSchema: StructType                         = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema                   = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter                  = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader                   = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder                       = DecoderFactory.get
  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record        = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }

}


case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder                     = implicitly[Encoder[T]]
  val sqlSchema                   = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName                = "topLevelRecord" // ???
  val recordNamespace           = "recordNamespace" // ???
  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter   = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder   = EncoderFactory.get
    val byteArrOS     = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder             = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key         = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row         = rowEncoder.fromRow(internalRow)
      val bytes       = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }

} 
Example 20
Source File: SparkStreamletContextImpl.scala    From cloudflow   with Apache License 2.0
package cloudflow.spark.kafka

import java.io.File

import com.typesafe.config.Config
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.{ ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery }
import cloudflow.spark.SparkStreamletContext
import cloudflow.spark.avro.{ SparkAvroDecoder, SparkAvroEncoder }
import cloudflow.spark.sql.SQLImplicits._
import cloudflow.streamlets._

import scala.reflect.runtime.universe._

class SparkStreamletContextImpl(
    private[cloudflow] override val streamletDefinition: StreamletDefinition,
    session: SparkSession,
    override val config: Config
) extends SparkStreamletContext(streamletDefinition, session) {

  val storageDir           = config.getString("storage.mountPath")
  val maxOffsetsPerTrigger = config.getLong("cloudflow.spark.read.options.max-offsets-per-trigger")
  def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] = {

    implicit val inRowEncoder: ExpressionEncoder[Row] = RowEncoder(encoder.schema)
    val schema                                        = inPort.schemaAsString
    val topic                                         = findTopicForPort(inPort)
    val srcTopic                                      = topic.name
    val brokers                                       = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers)
    val src: DataFrame = session.readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", brokers)
      .options(kafkaConsumerMap(topic))
      .option("maxOffsetsPerTrigger", maxOffsetsPerTrigger)
      .option("subscribe", srcTopic)
      // Allow restart of stateful streamlets that may have been offline for longer than the kafka retention period.
      // This setting may result in data loss in some cases but allows for continuity of the runtime
      .option("failOnDataLoss", false)
      .option("startingOffsets", "earliest")
      .load()

    val rawDataset = src.select($"value").as[Array[Byte]]

    val dataframe: Dataset[Row] = rawDataset.mapPartitions { iter ⇒
      val avroDecoder = new SparkAvroDecoder[In](schema)
      iter.map(avroDecoder.decode)
    }(inRowEncoder)

    dataframe.as[In]
  }

  def kafkaConsumerMap(topic: Topic) = topic.kafkaConsumerProperties.map {
    case (key, value) => s"kafka.$key" -> value
  }
  def kafkaProducerMap(topic: Topic) = topic.kafkaProducerProperties.map {
    case (key, value) => s"kafka.$key" -> value
  }

  def writeStream[Out](stream: Dataset[Out], outPort: CodecOutlet[Out], outputMode: OutputMode)(implicit encoder: Encoder[Out],
                                                                                                typeTag: TypeTag[Out]): StreamingQuery = {

    val avroEncoder   = new SparkAvroEncoder[Out](outPort.schemaAsString)
    val encodedStream = avroEncoder.encodeWithKey(stream, outPort.partitioner)

    val topic     = findTopicForPort(outPort)
    val destTopic = topic.name
    val brokers   = topic.bootstrapServers.getOrElse(internalKafkaBootstrapServers)

    // metadata checkpoint directory on mount
    val checkpointLocation = checkpointDir(outPort.name)
    val queryName          = s"$streamletRef.$outPort"

    encodedStream.writeStream
      .outputMode(outputMode)
      .format("kafka")
      .queryName(queryName)
      .option("kafka.bootstrap.servers", brokers)
      .options(kafkaProducerMap(topic))
      .option("topic", destTopic)
      .option("checkpointLocation", checkpointLocation)
      .start()
  }

  def checkpointDir(dirName: String): String = {
    val baseCheckpointDir = new File(storageDir, streamletRef)
    val dir               = new File(baseCheckpointDir, dirName)
    if (!dir.exists()) {
      val created = dir.mkdirs()
      require(created, s"Could not create checkpoint directory: $dir")
    }
    dir.getAbsolutePath
  }
} 
Example 21
Source File: StandardScalerSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{ScalerResult, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.StandardScaler
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class StandardScalerSuite extends SparkFeaturePFASuiteBase[ScalerResult] {

  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)

  val scaler = new StandardScaler()
    .setInputCol("features")
    .setOutputCol("scaled")
    .setWithMean(true)
    .setWithStd(true)

  override val sparkTransformer = scaler.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()

  test("StandardScaler w/o Mean and Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Mean") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(false)
      .setWithStd(true)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("StandardScaler w/o Std") {
    val scaler = new StandardScaler()
      .setInputCol("features")
      .setOutputCol("scaled")
      .setWithMean(true)
      .setWithStd(false)
    val sparkTransformer = scaler.fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

} 
Example 22
Source File: PCAModelSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.PCA
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class PCAModelSuite extends SparkFeaturePFASuiteBase[PCAModelResult] {

  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)

  val pca = new PCA()
    .setInputCol("features")
    .setOutputCol("pcaFeatures")
    .setK(3)
  override val sparkTransformer = pca.fit(dataset)

  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, pca.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, pca.getOutputCol).toJSON.collect()
}

case class PCAModelResult(pcaFeatures: Seq[Double]) extends Result 
Example 23
Source File: NormalizerSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.feature

import com.ibm.aardpfark.pfa.{ScalerResult, Result, SparkFeaturePFASuiteBase}
import org.apache.spark.ml.feature.Normalizer
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class NormalizerSuite extends SparkFeaturePFASuiteBase[ScalerResult] {

  implicit val enc = ExpressionEncoder[Vector]()

  val inputPath = "data/sample_lda_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)

  val scaler = new Normalizer()
    .setInputCol("features")
    .setOutputCol("scaled")

  override val sparkTransformer = scaler

  val result = scaler.transform(dataset)
  override val input = withColumnAsArray(result, scaler.getInputCol).toJSON.collect()
  override val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()

  test("Normalizer with P = 1") {
    val sparkTransformer = scaler.setP(1.0)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("Normalizer with P = positive infinity"){
    val sparkTransformer = scaler.setP(Double.PositiveInfinity)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

  test("Normalizer with P = 3") {
    val sparkTransformer = scaler.setP(3.0)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = withColumnAsArray(result, scaler.getOutputCol).toJSON.collect()
    parityTest(sparkTransformer, input, expectedOutput)
  }

} 
Example 24
Source File: LinearSVCSuite.scala    From aardpfark   with Apache License 2.0
package com.ibm.aardpfark.spark.ml.classification

import com.ibm.aardpfark.pfa.ClassifierResult
import org.apache.spark.ml.classification.LinearSVC
import org.apache.spark.ml.linalg.Vector
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class LinearSVCSuite extends SparkClassifierPFASuiteBase[ClassifierResult] {

  val inputPath = "data/sample_libsvm_data.txt"
  val dataset = spark.read.format("libsvm").load(inputPath)
  val clf = new LinearSVC()
  override val sparkTransformer = clf.fit(dataset)
  import spark.implicits._

  implicit val mapEncoder = ExpressionEncoder[Map[String, Double]]()
  val result = sparkTransformer.transform(dataset)
  override val input = withColumnAsArray(result, clf.getFeaturesCol).toJSON.collect()
  override val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol).map {
    case Row(p: Double, raw: Vector) => (p, raw.toArray)
  }.toDF(clf.getPredictionCol, clf.getRawPredictionCol).toJSON.collect()

  // Additional tests
  test("LinearSVC w/o fitIntercept") {
    val sparkTransformer = clf.setFitIntercept(false).fit(dataset)
    val result = sparkTransformer.transform(dataset)
    val expectedOutput = result.select(clf.getPredictionCol, clf.getRawPredictionCol).map {
      case Row(p: Double, raw: Vector) => (p, raw.toArray)
    }.toDF(clf.getPredictionCol, clf.getRawPredictionCol).toJSON.collect()

    parityTest(sparkTransformer, input, expectedOutput)
  }
} 
Example 25
Source File: ReduceAggregatorSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null).asInstanceOf[(Boolean, Int)])
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == ((true, 1)))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == ((true, 3)))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == ((true, 6)))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == ((true, 1)))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == ((true, 3)))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == ((true, 4)))

    assert(aggregator.finish(firstReduce) == 1)

    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 26
Source File: EliminateSerializationSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.optimizer

import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.rules.RuleExecutor

case class OtherTuple(_1: Int, _2: Int)

class EliminateSerializationSuite extends PlanTest {
  private object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Serialization", FixedPoint(100),
        EliminateSerialization) :: Nil
  }

  implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]()
  implicit private def intEncoder = ExpressionEncoder[Int]()

  test("back to back serialization") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    val expected = input.select('obj.as("obj")).analyze
    comparePlans(optimized, expected)
  }

  test("back to back serialization with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }

  test("back to back serialization in AppendColumns") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze

    val optimized = Optimize.execute(plan)

    val expected = AppendColumnsWithObject(
      func.asInstanceOf[Any => Any],
      productEncoder[(Int, Int)].namedExpressions,
      intEncoder.namedExpressions,
      input).analyze

    comparePlans(optimized, expected)
  }

  test("back to back serialization in AppendColumns with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze

    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }
} 
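The redundant serialize/deserialize pairs this rule removes arise naturally when typed operations are chained on a Dataset. A minimal sketch that surfaces the optimization through the query plan, assuming a local SparkSession (all names here are illustrative):

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("eliminate-serialization-demo").getOrCreate()
import spark.implicits._

// Two chained typed maps would naively re-serialize between them; the optimizer
// collapses the redundant pair, which the analyzed vs. optimized plans make visible.
val ds = Seq(1, 2, 3).toDS()
ds.map(_ + 1).map(_ * 2).explain(true)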
Example 27
Source File: ReduceAggregator.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
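ReduceAggregator is private[sql]; it is the machinery behind reduceGroups on a grouped Dataset, with its buffer encoder assembled from ExpressionEncoder.tuple. A minimal sketch of the public call path that exercises it, assuming a local SparkSession:

import org.apache.spark.sql.SparkSession

val spark = SparkSession.builder().master("local[*]").appName("reduce-groups-demo").getOrCreate()
import spark.implicits._

// groupByKey + reduceGroups is the public API backed by a ReduceAggregator.
val ds = Seq(1, 2, 3, 4, 5).toDS()
val sums = ds.groupByKey(_ % 2).reduceGroups(_ + _)   // Dataset[(Int, Int)]
sums.collect().foreach { case (key, sum) => println(s"$key -> $sum") }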
Example 28
Source File: typedaggregators.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
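Each aggregator above follows the same recipe: a zero value, reduce and merge steps, and ExpressionEncoder-based buffer and output encoders, surfaced to callers through Aggregator.toColumn. A minimal sketch of a user-defined aggregator written in the same style (TypedMaxDouble is an illustrative name, not part of Spark), assuming a local SparkSession:

import org.apache.spark.sql.{Encoder, SparkSession}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

// A typed max in the style of TypedSumDouble: Double buffer, Double output.
class TypedMaxDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = Double.NegativeInfinity
  override def reduce(b: Double, a: IN): Double = math.max(b, f(a))
  override def merge(b1: Double, b2: Double): Double = math.max(b1, b2)
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()
}

val spark = SparkSession.builder().master("local[*]").appName("typed-max-demo").getOrCreate()
import spark.implicits._

val values = Seq(1.5, 3.0, 2.25).toDS()
val maxValue = values.select(new TypedMaxDouble[Double](identity).toColumn).head()
println(maxValue)   // 3.0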
Example 29
Source File: ReduceAggregatorSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 30
Source File: S2StreamQueryWriter.scala    From incubator-s2graph   with Apache License 2.0 5 votes vote down vote up
package org.apache.s2graph.spark.sql.streaming

import com.typesafe.config.ConfigFactory
import org.apache.s2graph.core.{GraphElement, JSONParser}
import org.apache.s2graph.s2jobs.S2GraphHelper
import org.apache.s2graph.spark.sql.streaming.S2SinkConfigs._
import org.apache.spark.TaskContext
import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.StructType
import play.api.libs.json.{JsObject, Json}

import scala.collection.mutable.ListBuffer
import scala.concurrent.Await
import scala.concurrent.duration.Duration
import scala.util.Try

private [sql] class S2StreamQueryWriter(
                                         serializedConf:String,
                                         schema: StructType ,
                                         commitProtocol: S2CommitProtocol
                                       ) extends Serializable with Logger {
  private val config = ConfigFactory.parseString(serializedConf)
  private val s2Graph = S2GraphHelper.getS2Graph(config)
  private val encoder: ExpressionEncoder[Row] = RowEncoder(schema).resolveAndBind()
  private val RESERVED_COLUMN = Set("timestamp", "from", "to", "label", "operation", "elem", "direction")


  def run(taskContext: TaskContext, iters: Iterator[InternalRow]): TaskCommit = {
    val taskId = s"stage-${taskContext.stageId()}, partition-${taskContext.partitionId()}, attempt-${taskContext.taskAttemptId()}"
    val partitionId = taskContext.partitionId()

    val groupedSize = getConfigString(config, S2_SINK_GROUPED_SIZE, DEFAULT_GROUPED_SIZE).toInt
    val waitTime = getConfigString(config, S2_SINK_WAIT_TIME, DEFAULT_WAIT_TIME_SECONDS).toInt

    commitProtocol.initTask()
    try {
      var list = new ListBuffer[(String, Int)]()
      val rst = iters.flatMap(rowToEdge).grouped(groupedSize).flatMap{ elements =>
        logger.debug(s"[$taskId][elements] ${elements.size} (${elements.map(e => e.toLogString).mkString(",\n")})")
        elements.groupBy(_.serviceName).foreach{ case (service, elems) =>
          list += ((service, elems.size))
        }

        val mutateF = s2Graph.mutateElements(elements, true)
        Await.result(mutateF, Duration(waitTime, "seconds"))
      }

      val (success, fail) = rst.toSeq.partition(r => r.isSuccess)
      val counter = list.groupBy(_._1).map{ case (service, t) =>
        val sum = t.toList.map(_._2).sum
        (service, sum)
      }
      logger.info(s"[$taskId] success : ${success.size}, fail : ${fail.size} ($counter)")


      commitProtocol.commitTask(TaskState(partitionId, success.size, fail.size, counter))

    } catch {
      case t: Throwable =>
        commitProtocol.abortTask(TaskState(partitionId))
        throw t
    }
  }

  private def rowToEdge(internalRow:InternalRow): Option[GraphElement] =
    S2GraphHelper.sparkSqlRowToGraphElement(s2Graph, encoder.fromRow(internalRow), schema, RESERVED_COLUMN)
} 
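The writer relies on a bound RowEncoder to turn Catalyst InternalRows back into external Rows before handing them to S2GraphHelper. A standalone sketch of that conversion, assuming the Spark 2.x encoder API used throughout these examples (RowEncoder(schema).resolveAndBind() plus fromRow):

import org.apache.spark.sql.Row
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.types.{IntegerType, StringType, StructField, StructType}
import org.apache.spark.unsafe.types.UTF8String

val schema = StructType(Seq(
  StructField("id", IntegerType, nullable = false),
  StructField("name", StringType, nullable = true)))

// Bind the encoder to the schema once, then reuse it for every incoming InternalRow.
val encoder: ExpressionEncoder[Row] = RowEncoder(schema).resolveAndBind()

// InternalRow stores strings as UTF8String; fromRow converts back to an external Row.
val internal = InternalRow(42, UTF8String.fromString("answer"))
val row: Row = encoder.fromRow(internal)
println(row.getInt(0) + " " + row.getString(1))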
Example 31
Source File: GroupSortedDataset.scala    From spark-sorted   with Apache License 2.0 5 votes vote down vote up
package com.tresata.spark.sorted.sql

import scala.reflect.ClassTag

import org.apache.spark.sql.{ Column, Dataset, Encoder }
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder }

import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate }

object GroupSortedDataset {
  private[sql] def apply[K: Encoder, V](dataset: Dataset[(K, V)], numPartitions: Option[Int], reverse: Boolean, sortBy: Column => Column): GroupSortedDataset[K, V] = {
    val key = col(dataset.columns.head)
    val valueSort = {
      val sort = sortBy(col(dataset.columns.last))
      if (reverse) sort.desc else sort.asc
    }
    new GroupSortedDataset(numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort))
  }
}

class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable {
  def toDS: Dataset[(K, V)] = dataset

  def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f))
  }

  def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIterator(_)(f))
  }

  def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f)))
  }

  def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] =
    mapStreamByKey(iter => Iterator(iter.reduceLeft(f)))

  def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(_.scanLeft(wCreate())(f))
  }
} 
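mapStreamByKey builds its result encoder by composing the key and value encoders with ExpressionEncoder.tuple, the same helper used by ReduceAggregator above. A minimal standalone sketch of that composition, assuming a local SparkSession:

import org.apache.spark.sql.{Encoder, SparkSession}
import org.apache.spark.sql.catalyst.encoders.{encoderFor, ExpressionEncoder}

val spark = SparkSession.builder().master("local[*]").appName("tuple-encoder-demo").getOrCreate()
import spark.implicits._

// Compose an Encoder[(String, Int)] from the implicit String and Int encoders.
val pairEncoder: Encoder[(String, Int)] =
  ExpressionEncoder.tuple(encoderFor[String], encoderFor[Int])

val pairs = spark.createDataset(Seq(("a", 1), ("b", 2)))(pairEncoder)
pairs.show()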
Example 32
Source File: EliminateMapObjectsSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.optimizer

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.AttributeReference
import org.apache.spark.sql.catalyst.expressions.objects.Invoke
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical.{DeserializeToObject, LocalRelation, LogicalPlan}
import org.apache.spark.sql.catalyst.rules.RuleExecutor
import org.apache.spark.sql.types._

class EliminateMapObjectsSuite extends PlanTest {
  object Optimize extends RuleExecutor[LogicalPlan] {
    val batches = {
      Batch("EliminateMapObjects", FixedPoint(50),
        NullPropagation,
        SimplifyCasts,
        EliminateMapObjects) :: Nil
    }
  }

  implicit private def intArrayEncoder = ExpressionEncoder[Array[Int]]()
  implicit private def doubleArrayEncoder = ExpressionEncoder[Array[Double]]()

  test("SPARK-20254: Remove unnecessary data conversion for primitive array") {
    val intObjType = ObjectType(classOf[Array[Int]])
    val intInput = LocalRelation('a.array(ArrayType(IntegerType, false)))
    val intQuery = intInput.deserialize[Array[Int]].analyze
    val intOptimized = Optimize.execute(intQuery)
    val intExpected = DeserializeToObject(
      Invoke(intInput.output(0), "toIntArray", intObjType, Nil, true, false),
      AttributeReference("obj", intObjType, true)(), intInput)
    comparePlans(intOptimized, intExpected)

    val doubleObjType = ObjectType(classOf[Array[Double]])
    val doubleInput = LocalRelation('a.array(ArrayType(DoubleType, false)))
    val doubleQuery = doubleInput.deserialize[Array[Double]].analyze
    val doubleOptimized = Optimize.execute(doubleQuery)
    val doubleExpected = DeserializeToObject(
      Invoke(doubleInput.output(0), "toDoubleArray", doubleObjType, Nil, true, false),
      AttributeReference("obj", doubleObjType, true)(), doubleInput)
    comparePlans(doubleOptimized, doubleExpected)
  }
} 
Example 33
Source File: EliminateSerializationSuite.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.optimizer

import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.rules.RuleExecutor

case class OtherTuple(_1: Int, _2: Int)

class EliminateSerializationSuite extends PlanTest {
  private object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Serialization", FixedPoint(100),
        EliminateSerialization) :: Nil
  }

  implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]()
  implicit private def intEncoder = ExpressionEncoder[Int]()

  test("back to back serialization") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    val expected = input.select('obj.as("obj")).analyze
    comparePlans(optimized, expected)
  }

  test("back to back serialization with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }

  test("back to back serialization in AppendColumns") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze

    val optimized = Optimize.execute(plan)

    val expected = AppendColumnsWithObject(
      func.asInstanceOf[Any => Any],
      productEncoder[(Int, Int)].namedExpressions,
      intEncoder.namedExpressions,
      input).analyze

    comparePlans(optimized, expected)
  }

  test("back to back serialization in AppendColumns with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze

    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }
} 
Example 34
Source File: ReduceAggregator.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  @transient private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
Example 35
Source File: typedaggregators.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this((x: IN) => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this((x: IN) => f.call(x).asInstanceOf[Any])
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this((x: IN) => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
Example 36
Source File: ForeachBatchSink.scala    From XSQL   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.streaming.sources

import org.apache.spark.api.python.PythonException
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.execution.streaming.Sink
import org.apache.spark.sql.streaming.DataStreamWriter

class ForeachBatchSink[T](batchWriter: (Dataset[T], Long) => Unit, encoder: ExpressionEncoder[T])
  extends Sink {

  override def addBatch(batchId: Long, data: DataFrame): Unit = {
    val resolvedEncoder = encoder.resolveAndBind(
      data.logicalPlan.output,
      data.sparkSession.sessionState.analyzer)
    val rdd = data.queryExecution.toRdd.map[T](resolvedEncoder.fromRow)(encoder.clsTag)
    val ds = data.sparkSession.createDataset(rdd)(encoder)
    batchWriter(ds, batchId)
  }

  override def toString(): String = "ForeachBatchSink"
}



trait PythonForeachBatchFunction {
  // Called from the JVM with each micro-batch DataFrame and its batch id.
  def call(batchDF: DataFrame, batchId: Long): Unit
}

object PythonForeachBatchHelper {
  def callForeachBatch(dsw: DataStreamWriter[Row], pythonFunc: PythonForeachBatchFunction): Unit = {
    dsw.foreachBatch(pythonFunc.call _)
  }
} 
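ForeachBatchSink delivers each micro-batch to user code as a Dataset together with its batch id; on the user side this is reached through DataStreamWriter.foreachBatch, available since Spark 2.4. A minimal sketch using the built-in rate source, assuming a local SparkSession (the handler body is illustrative only):

import org.apache.spark.sql.{DataFrame, SparkSession}

val spark = SparkSession.builder().master("local[*]").appName("foreach-batch-demo").getOrCreate()

val stream = spark.readStream.format("rate").option("rowsPerSecond", "5").load()

// Declaring the handler with explicit types keeps the Scala/Java overloads unambiguous.
val handler: (DataFrame, Long) => Unit = (batchDF, batchId) =>
  println(s"batch $batchId has ${batchDF.count()} rows")

val query = stream.writeStream.foreachBatch(handler).start()
query.awaitTermination(10000)   // let the sketch run for ~10 seconds
query.stop()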
Example 37
Source File: ObjectExpressionsSuite.scala    From drizzle-spark   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.objects.Invoke
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{IntegerType, ObjectType}


class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("SPARK-16622: The returned value of the called method in Invoke can be null") {
    val inputRow = InternalRow.fromSeq(Seq((false, null)))
    val cls = classOf[Tuple2[Boolean, java.lang.Integer]]
    val inputObject = BoundReference(0, ObjectType(cls), nullable = true)
    val invoke = Invoke(inputObject, "_2", IntegerType)
    checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow)
  }

  test("MapObjects should make copies of unsafe-backed data") {
    // test UnsafeRow-backed data
    val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]]
    val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4))))
    val structExpected = new GenericArrayData(
      Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4))))
    checkEvalutionWithUnsafeProjection(
      structEncoder.serializer.head, structExpected, structInputRow)

    // test UnsafeArray-backed data
    val arrayEncoder = ExpressionEncoder[Array[Array[Int]]]
    val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4))))
    val arrayExpected = new GenericArrayData(
      Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4))))
    checkEvalutionWithUnsafeProjection(
      arrayEncoder.serializer.head, arrayExpected, arrayInputRow)

    // test UnsafeMap-backed data
    val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]]
    val mapInputRow = InternalRow.fromSeq(Seq(Array(
      Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400))))
    val mapExpected = new GenericArrayData(Seq(
      new ArrayBasedMapData(
        new GenericArrayData(Array(1, 2)),
        new GenericArrayData(Array(100, 200))),
      new ArrayBasedMapData(
        new GenericArrayData(Array(3, 4)),
        new GenericArrayData(Array(300, 400)))))
    checkEvalutionWithUnsafeProjection(
      mapEncoder.serializer.head, mapExpected, mapInputRow)
  }
} 
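The serializer expressions exercised here are the same ones ExpressionEncoder uses for a full object-to-InternalRow round trip. A minimal sketch with the Spark 2.x encoder API (toRow, then resolveAndBind().fromRow), using a hypothetical case class:

import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

case class Point2D(x: Int, y: Int)   // illustrative class, not part of the suite above

val encoder = ExpressionEncoder[Point2D]()

// Serialize to Catalyst's internal representation, then deserialize back.
val internalRow = encoder.toRow(Point2D(1, 2))
val restored = encoder.resolveAndBind().fromRow(internalRow)
println(restored)   // Point2D(1,2)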
Example 38
Source File: Encoders.scala    From magellan   with Apache License 2.0 5 votes vote down vote up
package magellan.encoders

import magellan._
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

import scala.reflect._

object Encoders {

  implicit def encoderForPoint: Encoder[Point] = {
    val sqlType = new PointUDT().sqlType
    ExpressionEncoder[Point](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[Point]), nullable = true), sqlType)),
      deserializer =
        MagellanDeserializer(
          GetColumnByOrdinal(0, sqlType), classOf[Point]),
      clsTag = classTag[Point])
  }

  implicit def encoderForPolygon: Encoder[Polygon] = {
    val sqlType = new PolygonUDT().sqlType
    ExpressionEncoder[Polygon](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[Polygon]), nullable = true), sqlType)),
      deserializer =
        MagellanDeserializer(
          GetColumnByOrdinal(0, sqlType), classOf[Polygon]),
      clsTag = classTag[Polygon])
  }

  implicit def encoderForPolyLine: Encoder[PolyLine] = {
    val sqlType = new PolyLineUDT().sqlType
    ExpressionEncoder[PolyLine](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[PolyLine]), nullable = true), sqlType)),
      deserializer =
        MagellanDeserializer(
          GetColumnByOrdinal(0, sqlType), classOf[PolyLine]),
      clsTag = classTag[PolyLine])
  }

} 
Example 39
Source File: ArangoSparkSSLReadTest.scala    From arangodb-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.arangodb.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.BeforeAndAfterEach
import org.scalatest.FunSuite
import org.scalatest.Matchers
import collection.JavaConverters._
import com.arangodb.ArangoDB
import com.arangodb.ArangoDBException
import com.arangodb.velocypack.VPackBuilder
import com.arangodb.velocypack.ValueType
import scala.reflect.ClassTag
import com.arangodb.spark.rdd.partition.ArangoPartitionerSinglePartition
import org.scalatest.Ignore

@Ignore
class ArangoSparkSSLReadTest extends FunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach with SharedSparkContextSSL {

  val DB = "spark_test_db"
  val COLLECTION = "spark_test_col"
  val arangoDB = new ArangoDB.Builder().build()

  override def beforeAll() {
    super.beforeAll()
    try {
      arangoDB.db(DB).drop()
    } catch {
      case e: ArangoDBException =>
    }
    arangoDB.createDatabase(DB)
    arangoDB.db(DB).createCollection(COLLECTION)
    val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) })
    ArangoSpark.save(documents, COLLECTION, WriteOptions(DB))
  }

  override def afterAll() {
    try {
      arangoDB.db(DB).drop()
      arangoDB.shutdown()
    } finally {
      super.afterAll()
    }
  }

  test("load all documents from collection") {
    val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB))
    rdd.count() should be(100)
  }

  test("load with custom partionier") {
    val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB, partitioner = new ArangoPartitionerSinglePartition()))
    rdd.count() should be(100)
  }

  test("load documents from collection with filter statement") {
    val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB))
    val rdd2 = rdd.filter("doc.test <= 50")
    rdd2.count() should be(50)
  }
} 
Example 40
Source File: ArangoSparkSSLWriteTest.scala    From arangodb-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.arangodb.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.BeforeAndAfterEach
import org.scalatest.FunSuite
import org.scalatest.Matchers

import com.arangodb.ArangoDB
import com.arangodb.ArangoDBException
import com.arangodb.velocypack.VPackBuilder
import com.arangodb.velocypack.ValueType
import org.scalatest.Ignore

@Ignore
class ArangoSparkSSLWriteTest extends FunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach with SharedSparkContextSSL {

  val DB = "spark_test_db"
  val COLLECTION = "spark_test_col"
  val arangoDB = new ArangoDB.Builder().build()

  override def beforeAll() {
    super.beforeAll()
    try {
      arangoDB.db(DB).drop()
    } catch {
      case e: ArangoDBException =>
    }
    arangoDB.createDatabase(DB)
    arangoDB.db(DB).createCollection(COLLECTION)
  }

  override def afterAll() {
    try {
      arangoDB.db(DB).drop()
      arangoDB.shutdown()
    } finally {
      super.afterAll()
    }
  }

  override def afterEach() {
    arangoDB.db(DB).collection(COLLECTION).truncate()
  }

  private def checkDocumentCount(count: Int) {
    arangoDB.db(DB).collection(COLLECTION).count().getCount should be(count)
  }

  test("save RDD to ArangoDB") {
    checkDocumentCount(0)

    val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) })
    ArangoSpark.save(documents, COLLECTION, WriteOptions(DB))

    checkDocumentCount(100)
  }

  test("save RDD[VPackSlice] to ArangoDB") {
    checkDocumentCount(0)

    val documents = sc.parallelize((1 to 100).map { i => new VPackBuilder().add(ValueType.OBJECT).add("test", Integer.valueOf(i)).close().slice() })
    ArangoSpark.save(documents, COLLECTION, WriteOptions(DB))

    checkDocumentCount(100)
  }

  test("save DataFrame to ArangoDB") {
    checkDocumentCount(0)

    val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) })
    val sql: SQLContext = SQLContext.getOrCreate(sc)
    val df = sql.createDataFrame(documents, classOf[TestEntity])
    ArangoSpark.saveDF(df, COLLECTION, WriteOptions(DB))

    checkDocumentCount(100)
  }

  test("save Dataset to ArangoDB") {
    checkDocumentCount(0)

    val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) })
    val sql: SQLContext = SQLContext.getOrCreate(sc)
    val encoder = ExpressionEncoder.javaBean(classOf[TestEntity])
    val ds = sql.createDataset(documents)(encoder)
    ArangoSpark.save(ds, COLLECTION, WriteOptions(DB))

    checkDocumentCount(100)
  }

} 
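The Dataset test above builds its encoder with ExpressionEncoder.javaBean; the supported public route for bean classes is org.apache.spark.sql.Encoders.bean. A minimal sketch of that route with a hypothetical bean-style class (Person is not part of the test above), assuming a local SparkSession:

import scala.beans.BeanProperty
import org.apache.spark.sql.{Encoders, SparkSession}

// @BeanProperty generates the getters and setters the bean encoder introspects.
class Person() extends Serializable {
  @BeanProperty var name: String = _
  @BeanProperty var age: Int = _
}

val spark = SparkSession.builder().master("local[*]").appName("bean-encoder-demo").getOrCreate()

val alice = new Person()
alice.setName("Alice")
alice.setAge(30)

val people = spark.createDataset(Seq(alice))(Encoders.bean(classOf[Person]))
people.show()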
Example 41
Source File: ArangoSparkReadTest.scala    From arangodb-spark-connector   with Apache License 2.0 5 votes vote down vote up
package com.arangodb.spark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.scalatest.BeforeAndAfterAll
import org.scalatest.BeforeAndAfterEach
import org.scalatest.FunSuite
import org.scalatest.Matchers
import collection.JavaConverters._
import com.arangodb.ArangoDB
import com.arangodb.ArangoDBException
import com.arangodb.velocypack.VPackBuilder
import com.arangodb.velocypack.ValueType
import scala.reflect.ClassTag
import com.arangodb.spark.rdd.partition.ArangoPartitionerSinglePartition
import org.scalatest.Ignore
import com.arangodb.entity.LoadBalancingStrategy

class ArangoSparkReadTest extends FunSuite with Matchers with BeforeAndAfterAll with BeforeAndAfterEach with SharedSparkContext {

  val DB = "spark_test_db"
  val COLLECTION = "spark_test_col"
  val arangoDB = new ArangoDB.Builder().build()

  override def beforeAll() {
    super.beforeAll()
    try {
      arangoDB.db(DB).drop()
    } catch {
      case e: ArangoDBException =>
    }
    arangoDB.createDatabase(DB)
    arangoDB.db(DB).createCollection(COLLECTION)
    val documents = sc.parallelize((1 to 100).map { i => TestEntity(i) })
    ArangoSpark.save(documents, COLLECTION, WriteOptions(DB))
  }

  override def afterAll() {
    try {
      arangoDB.db(DB).drop()
      arangoDB.shutdown()
    } finally {
      super.afterAll()
    }
  }

  test("load all documents from collection") {
    val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB))
    rdd.count() should be(100)
  }

  test("load with custom partionier") {
    val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB, partitioner = new ArangoPartitionerSinglePartition()))
    rdd.count() should be(100)
  }

  test("load documents from collection with filter statement") {
    val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION, ReadOptions(DB))
    val rdd2 = rdd.filter("doc.test <= 50")
    rdd2.count() should be(50)
  }
  
  test("load all documents from collection with load balancing") {
    val rdd = ArangoSpark.load[TestEntity](sc, COLLECTION,
      ReadOptions(DB).acquireHostList(false).loadBalancingStrategy(LoadBalancingStrategy.ROUND_ROBIN))
    rdd.count() should be(100)
  }

} 
Example 42
Source File: ObjectExpressionsSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions.objects.Invoke
import org.apache.spark.sql.catalyst.util.{ArrayBasedMapData, GenericArrayData}
import org.apache.spark.sql.types.{IntegerType, ObjectType}


class ObjectExpressionsSuite extends SparkFunSuite with ExpressionEvalHelper {

  test("SPARK-16622: The returned value of the called method in Invoke can be null") {
    val inputRow = InternalRow.fromSeq(Seq((false, null)))
    val cls = classOf[Tuple2[Boolean, java.lang.Integer]]
    val inputObject = BoundReference(0, ObjectType(cls), nullable = true)
    val invoke = Invoke(inputObject, "_2", IntegerType)
    checkEvaluationWithGeneratedMutableProjection(invoke, null, inputRow)
  }

  test("MapObjects should make copies of unsafe-backed data") {
    // test UnsafeRow-backed data
    val structEncoder = ExpressionEncoder[Array[Tuple2[java.lang.Integer, java.lang.Integer]]]
    val structInputRow = InternalRow.fromSeq(Seq(Array((1, 2), (3, 4))))
    val structExpected = new GenericArrayData(
      Array(InternalRow.fromSeq(Seq(1, 2)), InternalRow.fromSeq(Seq(3, 4))))
    checkEvalutionWithUnsafeProjection(
      structEncoder.serializer.head, structExpected, structInputRow)

    // test UnsafeArray-backed data
    val arrayEncoder = ExpressionEncoder[Array[Array[Int]]]
    val arrayInputRow = InternalRow.fromSeq(Seq(Array(Array(1, 2), Array(3, 4))))
    val arrayExpected = new GenericArrayData(
      Array(new GenericArrayData(Array(1, 2)), new GenericArrayData(Array(3, 4))))
    checkEvalutionWithUnsafeProjection(
      arrayEncoder.serializer.head, arrayExpected, arrayInputRow)

    // test UnsafeMap-backed data
    val mapEncoder = ExpressionEncoder[Array[Map[Int, Int]]]
    val mapInputRow = InternalRow.fromSeq(Seq(Array(
      Map(1 -> 100, 2 -> 200), Map(3 -> 300, 4 -> 400))))
    val mapExpected = new GenericArrayData(Seq(
      new ArrayBasedMapData(
        new GenericArrayData(Array(1, 2)),
        new GenericArrayData(Array(100, 200))),
      new ArrayBasedMapData(
        new GenericArrayData(Array(3, 4)),
        new GenericArrayData(Array(300, 400)))))
    checkEvalutionWithUnsafeProjection(
      mapEncoder.serializer.head, mapExpected, mapInputRow)
  }
} 
Example 43
Source File: EliminateSerializationSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.catalyst.optimizer

import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.catalyst.dsl.expressions._
import org.apache.spark.sql.catalyst.dsl.plans._
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.PlanTest
import org.apache.spark.sql.catalyst.rules.RuleExecutor

case class OtherTuple(_1: Int, _2: Int)

class EliminateSerializationSuite extends PlanTest {
  private object Optimize extends RuleExecutor[LogicalPlan] {
    val batches =
      Batch("Serialization", FixedPoint(100),
        EliminateSerialization) :: Nil
  }

  implicit private def productEncoder[T <: Product : TypeTag] = ExpressionEncoder[T]()
  implicit private def intEncoder = ExpressionEncoder[Int]()

  test("back to back serialization") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val plan = input.serialize[(Int, Int)].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    val expected = input.select('obj.as("obj")).analyze
    comparePlans(optimized, expected)
  }

  test("back to back serialization with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val plan = input.serialize[OtherTuple].deserialize[(Int, Int)].analyze
    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }

  test("back to back serialization in AppendColumns") {
    val input = LocalRelation('obj.obj(classOf[(Int, Int)]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[(Int, Int)]).analyze

    val optimized = Optimize.execute(plan)

    val expected = AppendColumnsWithObject(
      func.asInstanceOf[Any => Any],
      productEncoder[(Int, Int)].namedExpressions,
      intEncoder.namedExpressions,
      input).analyze

    comparePlans(optimized, expected)
  }

  test("back to back serialization in AppendColumns with object change") {
    val input = LocalRelation('obj.obj(classOf[OtherTuple]))
    val func = (item: (Int, Int)) => item._1
    val plan = AppendColumns(func, input.serialize[OtherTuple]).analyze

    val optimized = Optimize.execute(plan)
    comparePlans(optimized, plan)
  }
} 
Example 44
Source File: ReduceAggregator.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder


private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
} 
Example 45
Source File: typedaggregators.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////


class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}


class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))
  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}


class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])
  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
} 
Example 46
Source File: ReduceAggregatorSuite.scala    From sparkoscope   with Apache License 2.0 5 votes vote down vote up
package org.apache.spark.sql.expressions

import org.apache.spark.SparkFunSuite
import org.apache.spark.sql.Encoders
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

class ReduceAggregatorSuite extends SparkFunSuite {

  test("zero value") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)
    assert(aggregator.zero == (false, null))
  }

  test("reduce, merge and finish") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    val firstReduce = aggregator.reduce(aggregator.zero, 1)
    assert(firstReduce == (true, 1))

    val secondReduce = aggregator.reduce(firstReduce, 2)
    assert(secondReduce == (true, 3))

    val thirdReduce = aggregator.reduce(secondReduce, 3)
    assert(thirdReduce == (true, 6))

    val mergeWithZero1 = aggregator.merge(aggregator.zero, firstReduce)
    assert(mergeWithZero1 == (true, 1))

    val mergeWithZero2 = aggregator.merge(secondReduce, aggregator.zero)
    assert(mergeWithZero2 == (true, 3))

    val mergeTwoReduced = aggregator.merge(firstReduce, secondReduce)
    assert(mergeTwoReduced == (true, 4))

    assert(aggregator.finish(firstReduce) == 1)
    assert(aggregator.finish(secondReduce) == 3)
    assert(aggregator.finish(thirdReduce) == 6)
    assert(aggregator.finish(mergeWithZero1) == 1)
    assert(aggregator.finish(mergeWithZero2) == 3)
    assert(aggregator.finish(mergeTwoReduced) == 4)
  }

  test("requires at least one input row") {
    val encoder: ExpressionEncoder[Int] = ExpressionEncoder()
    val func = (v1: Int, v2: Int) => v1 + v2
    val aggregator: ReduceAggregator[Int] = new ReduceAggregator(func)(Encoders.scalaInt)

    intercept[IllegalStateException] {
      aggregator.finish(aggregator.zero)
    }
  }
} 
Example 47
Source File: RowStreamParserImp.scala    From carbondata   with Apache License 2.0 5 votes vote down vote up
package org.apache.carbondata.streaming.parser

import java.text.SimpleDateFormat
import java.util

import org.apache.hadoop.conf.Configuration
import org.apache.spark.sql.catalyst.encoders.{ExpressionEncoder, RowEncoder}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.processing.loading.ComplexDelimitersEnum
import org.apache.carbondata.processing.loading.constants.DataLoadProcessorConstants


class RowStreamParserImp extends CarbonStreamParser {

  var configuration: Configuration = null
  var isVarcharTypeMapping: Array[Boolean] = null
  var structType: StructType = null
  var encoder: ExpressionEncoder[Row] = null

  var timeStampFormat: SimpleDateFormat = null
  var dateFormat: SimpleDateFormat = null
  var complexDelimiters: util.ArrayList[String] = new util.ArrayList[String]()
  var serializationNullFormat: String = null

  override def initialize(configuration: Configuration,
      structType: StructType, isVarcharTypeMapping: Array[Boolean]): Unit = {
    this.configuration = configuration
    this.structType = structType
    this.encoder = RowEncoder.apply(this.structType).resolveAndBind()
    this.isVarcharTypeMapping = isVarcharTypeMapping

    this.timeStampFormat = new SimpleDateFormat(
      this.configuration.get(CarbonCommonConstants.CARBON_TIMESTAMP_FORMAT))
    this.dateFormat = new SimpleDateFormat(
      this.configuration.get(CarbonCommonConstants.CARBON_DATE_FORMAT))
    this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_1"))
    this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_2"))
    this.complexDelimiters.add(this.configuration.get("carbon_complex_delimiter_level_3"))
    this.complexDelimiters.add(ComplexDelimitersEnum.COMPLEX_DELIMITERS_LEVEL_4.value())
    this.serializationNullFormat =
      this.configuration.get(DataLoadProcessorConstants.SERIALIZATION_NULL_FORMAT)
  }

  override def parserRow(value: InternalRow): Array[Object] = {
    this.encoder.fromRow(value).toSeq.zipWithIndex.map { case (x, i) =>
      FieldConverter.objectToString(
        x, serializationNullFormat, complexDelimiters,
        timeStampFormat, dateFormat,
        isVarcharType = i < this.isVarcharTypeMapping.length && this.isVarcharTypeMapping(i),
        binaryCodec = null)
    }
  }.toArray

  override def close(): Unit = {
  }

}