org.apache.spark.sql.Encoder Scala Examples
The following examples show how to use org.apache.spark.sql.Encoder.
Each example is taken from an open-source project; the source file name, the originating project, and its license are noted above the code.
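As a quick orientation before the examples: an Encoder tells Spark SQL how to convert a JVM object of type T to and from Spark's internal binary row format, and most of the time one is obtained implicitly. The following minimal sketch (assuming a local SparkSession; the Person case class is hypothetical and used only for illustration) shows the two most common ways to get an Encoder: implicitly via spark.implicits._, or explicitly through the Encoders factory.

// Hypothetical domain class used only for illustration.
case class Person(name: String, age: Int)

object EncoderQuickStart {
  import org.apache.spark.sql.{Encoder, Encoders, SparkSession}

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("encoder-quickstart").getOrCreate()
    import spark.implicits._ // brings implicit Encoders for case classes, tuples and primitives

    // Implicit encoder: derived for the Person case class by spark.implicits._
    val people = Seq(Person("Ann", 34), Person("Bob", 28)).toDS()
    people.show()

    // Explicit encoders: built through the Encoders factory and passed by hand
    val personEncoder: Encoder[Person] = Encoders.product[Person]
    val ints = spark.createDataset(Seq(1, 2, 3))(Encoders.scalaInt)
    val morePeople = ints.map(i => Person(s"user$i", 20 + i))(personEncoder)
    morePeople.show()

    spark.stop()
  }
}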
Example 1
Source File: RichSparkFunctions.scala From lighthouse with Apache License 2.0

package be.dataminded.lighthouse.pipeline

import com.typesafe.scalalogging.LazyLogging
import org.apache.spark.sql.{Dataset, Encoder}
import org.apache.spark.storage.StorageLevel

import scala.reflect.ClassTag

object RichSparkFunctions extends LazyLogging {

  class DatasetSparkFunction[A <: Dataset[_]: ClassTag](function: SparkFunction[A]) {

    def printSchema(): SparkFunction[A] = function.map { dataSet =>
      dataSet.printSchema()
      dataSet
    }

    def as[T: Encoder]: SparkFunction[Dataset[T]] = function.map(_.as[T])

    def cache(storageLevel: StorageLevel = StorageLevel.MEMORY_ONLY): SparkFunction[A] =
      function.map { _.persist(storageLevel) }

    def dropCache(): SparkFunction[A] =
      function.map { _.unpersist() }

    def write(sink: Sink, sinks: Sink*): SparkFunction[A] = {
      if (sinks.isEmpty) function.map { data => sink.write(data); data }
      else (sink +: sinks).foldLeft(function.cache())((f, sink) => f.write(sink))
    }

    def count(): SparkFunction[Long] = {
      function.map { dataSet =>
        val n = dataSet.count()
        logger.debug(s"The data set produced $n rows")
        n
      }
    }
  }
}
Example 2
Source File: ReduceAggregator.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.expressions

import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder

private[sql] class ReduceAggregator[T: Encoder](func: (T, T) => T)
  extends Aggregator[T, (Boolean, T), T] {

  private val encoder = implicitly[Encoder[T]]

  override def zero: (Boolean, T) = (false, null.asInstanceOf[T])

  override def bufferEncoder: Encoder[(Boolean, T)] =
    ExpressionEncoder.tuple(
      ExpressionEncoder[Boolean](),
      encoder.asInstanceOf[ExpressionEncoder[T]])

  override def outputEncoder: Encoder[T] = encoder

  override def reduce(b: (Boolean, T), a: T): (Boolean, T) = {
    if (b._1) {
      (true, func(b._2, a))
    } else {
      (true, a)
    }
  }

  override def merge(b1: (Boolean, T), b2: (Boolean, T)): (Boolean, T) = {
    if (!b1._1) {
      b2
    } else if (!b2._1) {
      b1
    } else {
      (true, func(b1._2, b2._2))
    }
  }

  override def finish(reduction: (Boolean, T)): T = {
    if (!reduction._1) {
      throw new IllegalStateException("ReduceAggregator requires at least one input row")
    }
    reduction._2
  }
}
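This private[sql] class is not invoked directly in user code; it is the internal machinery behind the typed pairwise reduction exposed on grouped Datasets. A minimal sketch of the public API it supports (assuming a local SparkSession) could look like:

object ReduceGroupsSketch {
  import org.apache.spark.sql.SparkSession

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("reduce-groups").getOrCreate()
    import spark.implicits._

    val words = Seq("spark", "scala", "spark", "encoder").toDS()
    // groupByKey + reduceGroups folds the values of each group pairwise,
    // which is exactly the (T, T) => T shape ReduceAggregator wraps.
    val counts = words
      .map(w => (w, 1L))
      .groupByKey(_._1)
      .reduceGroups((a, b) => (a._1, a._2 + b._2))
    counts.show()
    spark.stop()
  }
}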
Example 3
Source File: Aggregator.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.expressions

import org.apache.spark.annotation.{Experimental, InterfaceStability}
import org.apache.spark.sql.{Dataset, Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression

// Excerpt: only the toColumn member of the abstract class Aggregator[IN, BUF, OUT]
// is included in this listing; the rest of the class body is elided.
  def toColumn: TypedColumn[IN, OUT] = {
    implicit val bEncoder = bufferEncoder
    implicit val cEncoder = outputEncoder

    val expr = AggregateExpression(
      TypedAggregateExpression(this),
      Complete,
      isDistinct = false)

    new TypedColumn[IN, OUT](expr, encoderFor[OUT])
  }
}
Example 4
Source File: typedaggregators.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.sql.execution.aggregate

import org.apache.spark.api.java.function.MapFunction
import org.apache.spark.sql.{Encoder, TypedColumn}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator

////////////////////////////////////////////////////////////////////////////////////////////////////
// This file defines internal implementations for aggregators.
////////////////////////////////////////////////////////////////////////////////////////////////////

class TypedSumDouble[IN](val f: IN => Double) extends Aggregator[IN, Double, Double] {
  override def zero: Double = 0.0
  override def reduce(b: Double, a: IN): Double = b + f(a)
  override def merge(b1: Double, b2: Double): Double = b1 + b2
  override def finish(reduction: Double): Double = reduction

  override def bufferEncoder: Encoder[Double] = ExpressionEncoder[Double]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}

class TypedSumLong[IN](val f: IN => Long) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0L
  override def reduce(b: Long, a: IN): Long = b + f(a)
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Long]) = this(x => f.call(x).asInstanceOf[Long])

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}

class TypedCount[IN](val f: IN => Any) extends Aggregator[IN, Long, Long] {
  override def zero: Long = 0
  override def reduce(b: Long, a: IN): Long = {
    if (f(a) == null) b else b + 1
  }
  override def merge(b1: Long, b2: Long): Long = b1 + b2
  override def finish(reduction: Long): Long = reduction

  override def bufferEncoder: Encoder[Long] = ExpressionEncoder[Long]()
  override def outputEncoder: Encoder[Long] = ExpressionEncoder[Long]()

  // Java api support
  def this(f: MapFunction[IN, Object]) = this(x => f.call(x))

  def toColumnJava: TypedColumn[IN, java.lang.Long] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Long]]
  }
}

class TypedAverage[IN](val f: IN => Double) extends Aggregator[IN, (Double, Long), Double] {
  override def zero: (Double, Long) = (0.0, 0L)
  override def reduce(b: (Double, Long), a: IN): (Double, Long) = (f(a) + b._1, 1 + b._2)
  override def finish(reduction: (Double, Long)): Double = reduction._1 / reduction._2
  override def merge(b1: (Double, Long), b2: (Double, Long)): (Double, Long) = {
    (b1._1 + b2._1, b1._2 + b2._2)
  }

  override def bufferEncoder: Encoder[(Double, Long)] = ExpressionEncoder[(Double, Long)]()
  override def outputEncoder: Encoder[Double] = ExpressionEncoder[Double]()

  // Java api support
  def this(f: MapFunction[IN, java.lang.Double]) = this(x => f.call(x).asInstanceOf[Double])

  def toColumnJava: TypedColumn[IN, java.lang.Double] = {
    toColumn.asInstanceOf[TypedColumn[IN, java.lang.Double]]
  }
}
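In Spark 2.x these internal classes are surfaced to Scala users through the helpers in org.apache.spark.sql.expressions.scalalang.typed (typed.sum, typed.avg, typed.count). A hedged sketch of using them as typed columns follows, assuming a Spark 2.x classpath and a hypothetical Sale case class:

// Hypothetical record type used only for illustration.
case class Sale(shop: String, amount: Double)

object TypedAggregatorsSketch {
  import org.apache.spark.sql.SparkSession
  import org.apache.spark.sql.expressions.scalalang.typed

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("typed-aggregators").getOrCreate()
    import spark.implicits._

    val sales = Seq(Sale("a", 10.0), Sale("a", 2.5), Sale("b", 7.0)).toDS()

    // typed.sum / typed.avg / typed.count wrap TypedSumDouble, TypedAverage and TypedCount
    // as TypedColumn values that can be used in a typed aggregation.
    val perShop = sales
      .groupByKey(_.shop)
      .agg(typed.sum[Sale](_.amount), typed.avg[Sale](_.amount), typed.count[Sale](_.shop))
    perShop.show()
    spark.stop()
  }
}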
Example 5
Source File: TopByKeyAggregator.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.ml.recommendation

import scala.language.implicitConversions
import scala.reflect.runtime.universe.TypeTag

import org.apache.spark.sql.{Encoder, Encoders}
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.expressions.Aggregator
import org.apache.spark.util.BoundedPriorityQueue

private[recommendation] class TopByKeyAggregator[K1: TypeTag, K2: TypeTag, V: TypeTag]
    (num: Int, ord: Ordering[(K2, V)])
  extends Aggregator[(K1, K2, V), BoundedPriorityQueue[(K2, V)], Array[(K2, V)]] {

  override def zero: BoundedPriorityQueue[(K2, V)] = new BoundedPriorityQueue[(K2, V)](num)(ord)

  override def reduce(
      q: BoundedPriorityQueue[(K2, V)],
      a: (K1, K2, V)): BoundedPriorityQueue[(K2, V)] = {
    q += {(a._2, a._3)}
  }

  override def merge(
      q1: BoundedPriorityQueue[(K2, V)],
      q2: BoundedPriorityQueue[(K2, V)]): BoundedPriorityQueue[(K2, V)] = {
    q1 ++= q2
  }

  override def finish(r: BoundedPriorityQueue[(K2, V)]): Array[(K2, V)] = {
    r.toArray.sorted(ord.reverse)
  }

  override def bufferEncoder: Encoder[BoundedPriorityQueue[(K2, V)]] = {
    Encoders.kryo[BoundedPriorityQueue[(K2, V)]]
  }

  override def outputEncoder: Encoder[Array[(K2, V)]] = ExpressionEncoder[Array[(K2, V)]]()
}
Example 6
Source File: Aggregator.scala From BigDatalog with Apache License 2.0

package org.apache.spark.sql.expressions

import org.apache.spark.sql.catalyst.encoders.encoderFor
import org.apache.spark.sql.catalyst.expressions.aggregate.{AggregateExpression, Complete}
import org.apache.spark.sql.execution.aggregate.TypedAggregateExpression
import org.apache.spark.sql.{DataFrame, Dataset, Encoder, TypedColumn}

// Excerpt: only the toColumn member of the Aggregator class is included in this listing;
// the rest of the class body is elided.
  def toColumn(
      implicit bEncoder: Encoder[B],
      cEncoder: Encoder[O]): TypedColumn[I, O] = {
    val expr =
      new AggregateExpression(
        TypedAggregateExpression(this),
        Complete,
        false)

    new TypedColumn[I, O](expr, encoderFor[O])
  }
}
Example 7
Source File: A_9_MyAverageByAggregator.scala From wow-spark with MIT License

package com.sev7e0.wow.sql

import org.apache.spark.sql.{Encoder, Encoders, SparkSession}
import org.apache.spark.sql.expressions.Aggregator

case class Employee(name: String, salary: Long)
case class Average(var sum: Long, var count: Long)

object A_9_MyAverageByAggregator extends Aggregator[Employee, Average, Double] {

  override def zero: Average = Average(0L, 0L)

  override def reduce(b: Average, a: Employee): Average = {
    b.sum += a.salary
    b.count += 1
    b
  }

  override def merge(b1: Average, b2: Average): Average = {
    b1.count += b2.count
    b1.sum += b2.sum
    b1
  }

  override def finish(reduction: Average): Double = reduction.sum.toDouble / reduction.count

  override def bufferEncoder: Encoder[Average] = Encoders.product

  override def outputEncoder: Encoder[Double] = Encoders.scalaDouble

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().master("local").appName("MyAverageByAggregator")
      .getOrCreate()
    // implicit conversions
    import sparkSession.implicits._
    val dataFrame = sparkSession.read.json("src/main/resources/sparkresource/employees.json").as[Employee]
    dataFrame.show()

    val salary_average = A_9_MyAverageByAggregator.toColumn.name("salary_average")
    val frame = dataFrame.select(salary_average)
    frame.show()
  }
}
Example 8
Source File: JCUDACodegenIterator.scala From GPUEnabler with Apache License 2.0

package com.ibm.gpuenabler

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.Encoder

abstract class JCUDACodegenIterator extends Iterator[InternalRow] {
  def hasNext(): Boolean
  def next(): InternalRow
  def init[T](itr: java.util.Iterator[InternalRow],
              args: Array[Any],
              size: Int,
              cached: Int,
              gpuPtrs: java.util.List[java.util.Map[String, CachedGPUMeta]],
              blockID: Int,
              userGridSizes: Array[Array[Int]],
              userBlockSizes: Array[Array[Int]],
              stages: Int,
              smSize: Int,
              inpEnc: Encoder[T])
}
Example 9
Source File: SparkEgressSpec.scala From cloudflow with Apache License 2.0

package cloudflow.spark

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.streaming.{ OutputMode, Trigger }

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkEgressSpec extends SparkScalaTestSupport {

  "SparkEgress" should {
    "materialize streaming data to sink" in {

      val testKit = SparkStreamletTestkit(session)

      def asCollection[T: Encoder](session: SparkSession, queryName: String): List[T] =
        session.sql(s"select * from $queryName").as[T].collect().toList

      val instance = new MySparkEgress()

      // setup inlet tap on inlet port
      val in: SparkInletTap[Data] = testKit.inletAsTap[Data](instance.in)

      // build data and send to inlet tap
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      in.addData(data)

      val run = testKit.run(instance, Seq(in), Seq.empty)
      run.failures mustBe ('empty)
      run.totalRows mustBe (20)
      val r1 = asCollection[String](session, "allNames")
      val r2 = asCollection[String](session, "allNamesUpper")

      // assert
      r1 must contain("name1")
      r2 must contain("NAME1")
    }
  }
}

class MySparkEgress extends SparkStreamlet {
  val in = AvroInlet[Data]("in")
  val shape = StreamletShape(in)

  override def createLogic() = new SparkStreamletLogic {
    override def buildStreamingQueries =
      process(readStream(in))

    private def process(inDataset: Dataset[Data]): StreamletQueryExecution = {
      val q1 = inDataset
        .map { d ⇒ d.name }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNames")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()

      val q2 = inDataset
        .map { d ⇒ d.name.toUpperCase }
        .writeStream
        .format("memory")
        .option("truncate", false)
        .queryName("allNamesUpper")
        .outputMode(OutputMode.Append())
        .trigger(Trigger.Once)
        .start()
      StreamletQueryExecution(q1, q2)
    }
  }
}
Example 10
Source File: SparkAvroDecoder.scala From cloudflow with Apache License 2.0

package cloudflow.spark.avro

import org.apache.log4j.Logger

import java.io.ByteArrayOutputStream

import scala.reflect.runtime.universe._

import org.apache.avro.generic.{ GenericDatumReader, GenericDatumWriter, GenericRecord }
import org.apache.avro.io.{ DecoderFactory, EncoderFactory }
import org.apache.spark.sql.{ Dataset, Encoder, Row }
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder, RowEncoder }
import org.apache.spark.sql.catalyst.expressions.GenericRow
import org.apache.spark.sql.types.StructType
import org.apache.avro.Schema

import cloudflow.spark.sql.SQLImplicits._

case class EncodedKV(key: String, value: Array[Byte])

case class SparkAvroDecoder[T: Encoder: TypeTag](avroSchema: String) {

  val encoder: Encoder[T] = implicitly[Encoder[T]]
  val sqlSchema: StructType = encoder.schema
  val encoderForDataColumns: ExpressionEncoder[Row] = RowEncoder(sqlSchema)
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)
  @transient lazy val rowConverter = SchemaConverters.createConverterToSQL(_avroSchema, sqlSchema)
  @transient lazy val datumReader = new GenericDatumReader[GenericRecord](_avroSchema)
  @transient lazy val decoder = DecoderFactory.get

  def decode(bytes: Array[Byte]): Row = {
    val binaryDecoder = decoder.binaryDecoder(bytes, null)
    val record = datumReader.read(null, binaryDecoder)
    rowConverter(record).asInstanceOf[GenericRow]
  }
}

case class SparkAvroEncoder[T: Encoder: TypeTag](avroSchema: String) {

  @transient lazy val log = Logger.getLogger(getClass.getName)

  val BufferSize = 5 * 1024 // 5 Kb

  val encoder = implicitly[Encoder[T]]
  val sqlSchema = encoder.schema
  @transient lazy val _avroSchema = new Schema.Parser().parse(avroSchema)

  val recordName = "topLevelRecord" // ???
  val recordNamespace = "recordNamespace" // ???

  @transient lazy val converter = AvroConverter.createConverterToAvro(sqlSchema, recordName, recordNamespace)

  // Risk: This process is memory intensive. Might require thread-level buffers to optimize memory usage
  def rowToBytes(row: Row): Array[Byte] = {
    val genRecord = converter(row).asInstanceOf[GenericRecord]
    if (log.isDebugEnabled) log.debug(s"genRecord = $genRecord")
    val datumWriter = new GenericDatumWriter[GenericRecord](_avroSchema)
    val avroEncoder = EncoderFactory.get
    val byteArrOS = new ByteArrayOutputStream(BufferSize)
    val binaryEncoder = avroEncoder.binaryEncoder(byteArrOS, null)
    datumWriter.write(genRecord, binaryEncoder)
    binaryEncoder.flush()
    byteArrOS.toByteArray
  }

  def encode(dataset: Dataset[T]): Dataset[Array[Byte]] =
    dataset.toDF().mapPartitions(rows ⇒ rows.map(rowToBytes)).as[Array[Byte]]

  // Note to self: I'm not sure how heavy this chain of transformations is
  def encodeWithKey(dataset: Dataset[T], keyFun: T ⇒ String): Dataset[EncodedKV] = {
    val encoder = encoderFor[T]
    implicit val rowEncoder = RowEncoder(encoder.schema).resolveAndBind()
    dataset.map { value ⇒
      val key = keyFun(value)
      val internalRow = encoder.toRow(value)
      val row = rowEncoder.fromRow(internalRow)
      val bytes = rowToBytes(row)
      EncodedKV(key, bytes)
    }
  }
}
Example 11
Source File: TestSparkStreamletContext.scala From cloudflow with Apache License 2.0

package cloudflow.spark
package testkit

import java.nio.file.attribute.FileAttribute

import com.typesafe.config._

import scala.reflect.runtime.universe._
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger }

import cloudflow.streamlets._
import org.apache.spark.sql.catalyst.InternalRow

class TestSparkStreamletContext(override val streamletRef: String,
                                session: SparkSession,
                                inletTaps: Seq[SparkInletTap[_]],
                                outletTaps: Seq[SparkOutletTap[_]],
                                override val config: Config = ConfigFactory.empty)
    extends SparkStreamletContext(
      StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config),
      session) {

  val ProcessingTimeInterval = 1500.milliseconds

  override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In], typeTag: TypeTag[In]): Dataset[In] =
    inletTaps
      .find(_.portName == inPort.name)
      .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In])
      .getOrElse(throw TestContextException(inPort.name, s"Bad test context, could not find source for inlet ${inPort.name}"))

  override def writeStream[Out](stream: Dataset[Out],
                                outPort: CodecOutlet[Out],
                                outputMode: OutputMode)(implicit encoder: Encoder[Out], typeTag: TypeTag[Out]): StreamingQuery = {
    // RateSource can only work with a microBatch query because it contains no data at time zero.
    // Trigger.Once requires data at start to work.
    val trigger = if (isRateSource(stream)) {
      Trigger.ProcessingTime(ProcessingTimeInterval)
    } else {
      Trigger.Once()
    }

    val streamingQuery = outletTaps
      .find(_.portName == outPort.name)
      .map { outletTap ⇒
        stream.writeStream
          .outputMode(outputMode)
          .format("memory")
          .trigger(trigger)
          .queryName(outletTap.queryName)
          .start()
      }
      .getOrElse(throw TestContextException(outPort.name, s"Bad test context, could not find destination for outlet ${outPort.name}"))
    streamingQuery
  }

  override def checkpointDir(dirName: String): String = {
    val fileAttibutes: Array[FileAttribute[_]] = Array()
    val tmpDir = java.nio.file.Files.createTempDirectory("spark-test", fileAttibutes: _*)
    tmpDir.toFile.getAbsolutePath
  }

  private def isRateSource(stream: Dataset[_]): Boolean = {
    import org.apache.spark.sql.execution.command.ExplainCommand
    val explain = ExplainCommand(stream.queryExecution.logical, true)
    val res = session.sessionState.executePlan(explain).executedPlan.executeCollect()
    res.exists((row: InternalRow) => row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider"))
  }
}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg)
Example 12
Source File: SecondaryPairDCFunctions.scala From spark-flow with Apache License 2.0

package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.partitioning.SecondarySortPartioner
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.Encoder

import scala.reflect.ClassTag

class SecondaryPairDCFunctions[K, K2, V](self: DC[((K, K2), V)])
                                        (implicit kt: ClassTag[K],
                                         vt: ClassTag[V],
                                         k2t: ClassTag[K2],
                                         ord: Ordering[(K, K2)] = null,
                                         encoder: Encoder[((K, K2), V)]) {

  def repartitionAndSecondarySortWithinPartitions(): DC[((K, K2), V)] = {
    new RDDTransformDC(
      encoder,
      self,
      (rdd: RDD[((K, K2), V)]) =>
        rdd.repartitionAndSortWithinPartitions(new SecondarySortPartioner[K, K2, V](rdd.partitions.length)),
      Seq("repartAndSort"))
  }

  def repartitionAndSecondarySortWithinPartitions(numPartitions: Int): DC[((K, K2), V)] = {
    new RDDTransformDC(
      encoder,
      self,
      (rdd: RDD[((K, K2), V)]) =>
        rdd.repartitionAndSortWithinPartitions(new SecondarySortPartioner[K, K2, V](numPartitions)),
      Seq("repartAndSort", numPartitions.toString))
  }
}
Example 13
Source File: DatasetTransformDC.scala From spark-flow with Apache License 2.0

package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing._
import org.apache.spark.sql.{Dataset, Encoder, SparkSession}

private[sparkflow] class DatasetTransformDC[U, T]
(encoder: Encoder[U], val prev: DC[T], f: (Dataset[T]) => Dataset[U], hashTargets: Seq[String])
  extends DC[U](encoder, Seq(prev)) {

//  def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef)(implicit tEncoder: Encoder[T], uEncoder: Encoder[U]) = {
//    this(prev, uEncoder, f, Seq(hashClass(hashTarget)))
//  }
//
//  def this(prev: DC[T], f: Dataset[T] => Dataset[U], hashTarget: AnyRef, hashTargets: Seq[String])(implicit tEncoder: Encoder[T], uEncoder: Encoder[U]) = {
//    this(prev, uEncoder, f, hashClass(hashTarget) +: hashTargets)
//  }

  def computeDataset(spark: SparkSession) = {
    val dataset = f(prev.getDataset(spark))
    dataset
  }

  override def computeSignature() = {
    hashString(prev.getSignature + hashSeq(hashTargets))
  }
}
Example 14
Source File: MultiGroupedTransformDC.scala From spark-flow with Apache License 2.0

package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing
import org.apache.spark.sql.{Dataset, Encoder, KeyValueGroupedDataset, SparkSession}

import scala.concurrent.duration.Duration
import scala.concurrent.{Await, Future}
import scala.reflect.ClassTag
import scala.concurrent.ExecutionContext.Implicits.global

class MultiGroupedTransformDC[K, V, U, T: ClassTag]
(left: KeyValueGroupedDC[K, V],
 right: KeyValueGroupedDC[K, U],
 f: (KeyValueGroupedDataset[K, V], KeyValueGroupedDataset[K, U]) => Dataset[T])
(implicit tEncoder: Encoder[T]) extends DC[T](tEncoder, Seq(left, right)) {

  override def computeDataset(spark: SparkSession) = {
    val leftFuture = Future { left.get(spark) }
    val rightFuture = Future { right.get(spark) }
    val ld = Await.result(leftFuture, Duration.Inf)
    val rd = Await.result(rightFuture, Duration.Inf)
    val dataset = f(ld, rd)
    dataset
  }

  override def computeSignature() = {
    Hashing.hashString(left.getSignature + right.getSignature + Hashing.hashClass(f))
  }
}
Example 15
Source File: MultiInputDC.scala From spark-flow with Apache License 2.0

package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, SparkSession}

import scala.concurrent.duration.Duration
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.{Await, Future}

class MultiInputDC[T, U, V](encoder: Encoder[T], left: DC[U], right: DC[V],
                            f: (RDD[U], RDD[V]) => RDD[T])
  extends DC[T](encoder, Seq(left, right)) {

  override def computeSignature() = {
    Hashing.hashString(left.getSignature + right.getSignature + Hashing.hashClass(f))
  }

  override def computeDataset(spark: SparkSession) = {
    val leftFuture = Future { left.getRDD(spark) }
    val rightFuture = Future { right.getRDD(spark) }
    val leftRDD = Await.result(leftFuture, Duration.Inf)
    val rightRDD = Await.result(rightFuture, Duration.Inf)
    val rdd = f(leftRDD, rightRDD)
    spark.createDataset(rdd)
  }
}
Example 16
Source File: ResultDepDC.scala From spark-flow with Apache License 2.0

package com.bloomberg.sparkflow.dc

import com.bloomberg.sparkflow.serialization.Hashing
import com.bloomberg.sparkflow.serialization.Hashing._
import org.apache.spark.sql.{Encoder, SparkSession}

class ResultDepDC[U, T]
(encoder: Encoder[(T, U)], val prev: DC[T], dr: DR[U])
  extends DC[(T, U)](encoder, Seq(prev, dr)) {

  override def computeDataset(spark: SparkSession) = {
    val result = dr.get(spark)
    prev.getDataset(spark).mapPartitions(iterator => {
      iterator.map(t => (t, result))
    })
  }

  override def computeSignature() = {
    hashString(prev.getSignature + dr.getSignature)
  }
}
Example 17
Source File: ExistingRDD.scala From XSQL with Apache License 2.0

package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.catalyst.plans.physical.{Partitioning, UnknownPartitioning}
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericInternalRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }
        mutableRow
      }
    }
  }
} // closing brace added; other members of RDDConversions are elided in this listing

case class RDDScanExec(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    name: String,
    override val outputPartitioning: Partitioning = UnknownPartitioning(0),
    override val outputOrdering: Seq[SortOrder] = Nil) extends LeafExecNode {

  private def rddName: String = Option(rdd.name).map(n => s" $n").getOrElse("")

  override val nodeName: String = s"Scan $name$rddName"

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.mapPartitionsWithIndexInternal { (index, iter) =>
      val proj = UnsafeProjection.create(schema)
      proj.initialize(index)
      iter.map { r =>
        numOutputRows += 1
        proj(r)
      }
    }
  }

  override def simpleString: String = {
    s"$nodeName${Utils.truncatedString(output, "[", ",", "]")}"
  }
}
Example 18
Source File: ExistingRDD.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.sql.execution

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Encoder, Row, SparkSession}
import org.apache.spark.sql.catalyst.{CatalystTypeConverters, InternalRow}
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.catalyst.plans.logical._
import org.apache.spark.sql.execution.datasources._
import org.apache.spark.sql.execution.metric.SQLMetrics
import org.apache.spark.sql.types.DataType
import org.apache.spark.util.Utils

object RDDConversions {
  def productToRowRdd[A <: Product](data: RDD[A], outputTypes: Seq[DataType]): RDD[InternalRow] = {
    data.mapPartitions { iterator =>
      val numColumns = outputTypes.length
      val mutableRow = new GenericInternalRow(numColumns)
      val converters = outputTypes.map(CatalystTypeConverters.createToCatalystConverter)
      iterator.map { r =>
        var i = 0
        while (i < numColumns) {
          mutableRow(i) = converters(i)(r.productElement(i))
          i += 1
        }
        mutableRow
      }
    }
  }
} // closing brace added; other members of RDDConversions are elided in this listing

case class RDDScanExec(
    output: Seq[Attribute],
    rdd: RDD[InternalRow],
    override val nodeName: String) extends LeafExecNode {

  override lazy val metrics = Map(
    "numOutputRows" -> SQLMetrics.createMetric(sparkContext, "number of output rows"))

  protected override def doExecute(): RDD[InternalRow] = {
    val numOutputRows = longMetric("numOutputRows")
    rdd.mapPartitionsInternal { iter =>
      val proj = UnsafeProjection.create(schema)
      iter.map { r =>
        numOutputRows += 1
        proj(r)
      }
    }
  }

  override def simpleString: String = {
    s"Scan $nodeName${Utils.truncatedString(output, "[", ",", "]")}"
  }
}
Example 19
Source File: package.scala From amadou with Apache License 2.0

package com.mediative.amadou

import com.google.api.services.bigquery.model._
import com.google.cloud.hadoop.fs.gcs.GoogleHadoopFileSystem
import com.google.cloud.hadoop.io.bigquery._
import org.apache.hadoop.fs.{FileSystem, Path}
import net.ceedubs.ficus.readers.ValueReader
import net.ceedubs.ficus.FicusInstances
import org.apache.spark.sql.{Dataset, SparkSession, Encoder}
import java.util.concurrent.ThreadLocalRandom
import scala.collection.JavaConversions._

package object bigquery extends FicusInstances {

  object CreateDisposition extends Enumeration {
    val CREATE_IF_NEEDED, CREATE_NEVER = Value
  }

  object WriteDisposition extends Enumeration {
    val WRITE_TRUNCATE, WRITE_APPEND, WRITE_EMPTY = Value
  }

  val BQ_CSV_DATE_FORMAT = "yyyy-MM-dd HH:mm:ss zzz"

  object TableNotFound {
    import com.google.api.client.googleapis.json.GoogleJsonResponseException
    import com.google.api.client.googleapis.json.GoogleJsonError
    import scala.collection.JavaConverters._

    def unapply(error: Throwable): Option[GoogleJsonError.ErrorInfo] = error match {
      case error: GoogleJsonResponseException =>
        Some(error.getDetails)
          .filter(_.getCode == 404)
          .flatMap(_.getErrors.asScala.find(_.getReason == "notFound"))
      case _ => None
    }
  }

  def tableHasDataForDate(
      spark: SparkSession,
      table: TableReference,
      date: java.sql.Date,
      column: String): Boolean = {
    val bq = BigQueryClient.getInstance(spark.sparkContext.hadoopConfiguration)
    bq.hasDataForDate(table, date, column)
  }

  // Note: the original file wraps the two methods below in an implicit class
  // (elided in this listing) that provides `self`, `conf` and `bq`.
    def saveAsBigQueryTable(
        tableRef: TableReference,
        writeDisposition: WriteDisposition.Value,
        createDisposition: CreateDisposition.Value): Unit = {
      val bucket = conf.get(BigQueryConfiguration.GCS_BUCKET_KEY)
      val temp =
        s"spark-bigquery-${System.currentTimeMillis()}=${ThreadLocalRandom.current.nextInt(Int.MaxValue)}"
      val gcsPath = s"gs://$bucket/spark-bigquery-tmp/$temp"
      self.write.json(gcsPath)

      val schemaFields = self.schema.fields.map { field =>
        import org.apache.spark.sql.types._

        val fieldType = field.dataType match {
          case BooleanType    => "BOOLEAN"
          case LongType       => "INTEGER"
          case IntegerType    => "INTEGER"
          case StringType     => "STRING"
          case DoubleType     => "FLOAT"
          case TimestampType  => "TIMESTAMP"
          case _: DecimalType => "INTEGER"
        }
        new TableFieldSchema().setName(field.name).setType(fieldType)
      }.toList

      val tableSchema = new TableSchema().setFields(schemaFields)

      bq.load(gcsPath, tableRef, tableSchema, writeDisposition, createDisposition)
      delete(new Path(gcsPath))
    }

    private def delete(path: Path): Unit = {
      val fs = FileSystem.get(path.toUri, conf)
      fs.delete(path, true)
      ()
    }
  }

  implicit val valueReader: ValueReader[BigQueryTable.PartitionStrategy] =
    ValueReader[String].map {
      _ match {
        case "month" => BigQueryTable.PartitionByMonth
        case "day"   => BigQueryTable.PartitionByDay
        case other   => sys.error(s"Unknown partition strategy")
      }
    }
}
Example 20
Source File: GroupSortedDataset.scala From spark-sorted with Apache License 2.0

package com.tresata.spark.sorted.sql

import scala.reflect.ClassTag

import org.apache.spark.sql.{ Column, Dataset, Encoder }
import org.apache.spark.sql.functions.col
import org.apache.spark.sql.catalyst.encoders.{ encoderFor, ExpressionEncoder }

import com.tresata.spark.sorted.{ mapStreamIterator, mapStreamIteratorWithContext, newWCreate }

object GroupSortedDataset {
  private[sql] def apply[K: Encoder, V](
    dataset: Dataset[(K, V)],
    numPartitions: Option[Int],
    reverse: Boolean,
    sortBy: Column => Column): GroupSortedDataset[K, V] = {
    val key = col(dataset.columns.head)
    val valueSort = {
      val sort = sortBy(col(dataset.columns.last))
      if (reverse) sort.desc else sort.asc
    }
    new GroupSortedDataset(
      numPartitions.map(dataset.repartition(_, key)).getOrElse(dataset.repartition(key)).sortWithinPartitions(key, valueSort))
  }
}

class GroupSortedDataset[K: Encoder, V] private (dataset: Dataset[(K, V)]) extends Serializable {
  def toDS: Dataset[(K, V)] = dataset

  def mapStreamByKey[W: Encoder, C](c: () => C)(f: (C, Iterator[V]) => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIteratorWithContext(_)(c, f))
  }

  def mapStreamByKey[W: Encoder](f: Iterator[V] => TraversableOnce[W]): Dataset[(K, W)] = {
    implicit val kwEncoder: Encoder[(K, W)] = ExpressionEncoder.tuple(encoderFor[K], encoderFor[W])
    dataset.mapPartitions(mapStreamIterator(_)(f))
  }

  def foldLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(iter => Iterator(iter.foldLeft(wCreate())(f)))
  }

  def reduceLeftByKey[W >: V: Encoder](f: (W, V) => W): Dataset[(K, W)] =
    mapStreamByKey(iter => Iterator(iter.reduceLeft(f)))

  def scanLeftByKey[W: ClassTag: Encoder](w: W)(f: (W, V) => W): Dataset[(K, W)] = {
    val wCreate = newWCreate(w)
    mapStreamByKey(_.scanLeft(wCreate())(f))
  }
}
Example 21
Source File: cogroup.scala From spark-tools with Apache License 2.0

package io.univalence.plumbus

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{ CoGroupedRDD, RDD }
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.{ ArrayType, StructField }
import org.apache.spark.sql.{ types, DataFrame, Dataset, Encoder, KeyValueGroupedDataset, Row }

import scala.reflect.ClassTag
import scala.util.Try

object cogroup {

  implicit class KVGD[K, A](val kvgd: KeyValueGroupedDataset[K, A]) {
    def cogroup[B](right: KeyValueGroupedDataset[K, B]): Dataset[(K, Seq[A], Seq[B])] =
      //Use SparkAddOn ?
      ???
  }

  def apply[A, B, K](left: Dataset[A], right: Dataset[B])(keyLeft: A => K, keyRight: B => K)(
    implicit
    encA: Encoder[A],
    encB: Encoder[B],
    encC: Encoder[K],
    enc: Encoder[(K, Seq[A], Seq[B])],
    ca: ClassTag[A],
    ck: ClassTag[K],
    cb: ClassTag[B]
  ): Dataset[(K, Seq[A], Seq[B])] =
    left.sparkSession.implicits
      .rddToDatasetHolder(
        RDD
          .rddToPairRDDFunctions(left.rdd.keyBy(keyLeft))
          .cogroup(right.rdd.keyBy(keyRight))
          .map({ case (k, (ia, ib)) => (k, ia.toSeq, ib.toSeq) })
      )
      .toDS

  def cogroupDf(group: DataFrame, namedSubGroup: (String, DataFrame)*)(
    byKey: String,
    partitioner: Partitioner = Partitioner.defaultPartitioner(group.rdd, namedSubGroup.map(_._2.rdd): _*)
  ): Try[DataFrame] =
    Try {
      val subGroup: Seq[DataFrame] = namedSubGroup.map(_._2)
      val allFrames: Seq[DataFrame] = group +: subGroup
      val allFramesKeyed: Seq[RDD[(String, Row)]] = allFrames.map(df => {
        val idx = df.columns.indexOf(byKey)
        df.rdd.keyBy(_.get(idx).toString)
      })
      val cogroupRdd: CoGroupedRDD[String] = new CoGroupedRDD[String](allFramesKeyed, partitioner)

      val rowRdd: RDD[Row] = cogroupRdd.map(x => {
        val rows: Array[Seq[Row]] = x._2.asInstanceOf[Array[Iterable[Row]]].map(_.toSeq)
        val seq = rows.head.head.toSeq ++ rows.tail
        new GenericRowWithSchema(seq.toArray, null).asInstanceOf[Row]
      })

      val schema = types.StructType(
        group.schema.fields
          ++ namedSubGroup.map { case (name, df) => StructField(name, ArrayType(df.schema)) }
      )

      group.sparkSession.createDataFrame(rowRdd, schema)
    }
}
Example 22
Source File: CassandraStorage.scala From graphsense-transformation with MIT License

package at.ac.ait.storage

import com.datastax.spark.connector.rdd.ValidRDDType
import com.datastax.spark.connector.rdd.reader.RowReaderFactory
import com.datastax.spark.connector.writer.{RowWriterFactory}
import java.time.LocalDateTime
import java.time.format.DateTimeFormatter
import org.apache.spark.sql.{Dataset, Encoder, SparkSession}
import scala.reflect.ClassTag

import at.ac.ait.Util._

class CassandraStorage(spark: SparkSession) {

  import spark.implicits._
  import com.datastax.spark.connector._

  def load[T <: Product: ClassTag: RowReaderFactory: ValidRDDType: Encoder](
      keyspace: String,
      tableName: String,
      columns: ColumnRef*
  ) = {
    spark.sparkContext.setJobDescription(s"Loading table ${tableName}")
    val table = spark.sparkContext.cassandraTable[T](keyspace, tableName)
    if (columns.isEmpty)
      table.toDS().as[T]
    else
      table.select(columns: _*).toDS().as[T]
  }

  def store[T <: Product: RowWriterFactory](
      keyspace: String,
      tableName: String,
      df: Dataset[T]
  ) = {
    spark.sparkContext.setJobDescription(s"Writing table ${tableName}")
    val dtf = DateTimeFormatter.ofPattern("yyyy-MM-dd HH:mm:ss")
    val timestamp = LocalDateTime.now().format(dtf)
    println(s"[$timestamp] Writing table ${tableName}")
    time { df.rdd.saveToCassandra(keyspace, tableName) }
  }
}
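A hedged usage sketch of the class above. The keyspace, table names, and the Block case class are hypothetical, and the spark-cassandra-connector is assumed to be on the classpath and pointed at a reachable cluster via spark.cassandra.connection.host.

import org.apache.spark.sql.SparkSession
import at.ac.ait.storage.CassandraStorage

// Hypothetical row type matching a Cassandra table layout.
case class Block(height: Int, timestamp: Int)

object CassandraStorageSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .appName("cassandra-storage-sketch")
      .config("spark.cassandra.connection.host", "127.0.0.1") // assumed connector setting
      .getOrCreate()
    import spark.implicits._
    import com.datastax.spark.connector._

    val storage = new CassandraStorage(spark)
    // load[T] needs ClassTag, RowReaderFactory, ValidRDDType and Encoder instances for T;
    // for a plain case class these come from the connector and spark.implicits._.
    val blocks = storage.load[Block]("example_keyspace", "block")
    storage.store("example_keyspace", "block_copy", blocks)
    spark.stop()
  }
}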
Example 23
Source File: package.scala From sparksql-scalapb with Apache License 2.0

package scalapb

import org.apache.spark.sql.{DataFrame, Encoder, SQLContext, SparkSession}

import scala.reflect.ClassTag

package object spark {
  implicit class ProtoSQLContext(val sqlContext: SQLContext) extends AnyVal {
    def protoToDataFrame[T <: GeneratedMessage: Encoder](
        protoRdd: org.apache.spark.rdd.RDD[T]
    ) = {
      ProtoSQL.protoToDataFrame(sqlContext, protoRdd)
    }
  }

  implicit class ProtoRDD[T <: GeneratedMessage](
      val protoRdd: org.apache.spark.rdd.RDD[T]
  ) extends AnyVal {
    def toDataFrame(
        sqlContext: SQLContext
    )(implicit encoder: Encoder[T]): DataFrame = {
      ProtoSQL.protoToDataFrame(sqlContext, protoRdd)
    }

    def toDataFrame(
        sparkSession: SparkSession
    )(implicit encoder: Encoder[T]): DataFrame = {
      ProtoSQL.protoToDataFrame(sparkSession, protoRdd)
    }
  }
}
Example 24
Source File: Encoders.scala From magellan with Apache License 2.0

package magellan.encoders

import magellan._
import org.apache.spark.sql.Encoder
import org.apache.spark.sql.catalyst.analysis.GetColumnByOrdinal
import org.apache.spark.sql.catalyst.encoders.ExpressionEncoder
import org.apache.spark.sql.catalyst.expressions._
import org.apache.spark.sql.types._

import scala.reflect._

object Encoders {

  implicit def encoderForPoint: Encoder[Point] = {
    val sqlType = new PointUDT().sqlType
    ExpressionEncoder[Point](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[Point]), nullable = true), sqlType)),
      deserializer = MagellanDeserializer(
        GetColumnByOrdinal(0, sqlType), classOf[Point]),
      clsTag = classTag[Point])
  }

  implicit def encoderForPolygon: Encoder[Polygon] = {
    val sqlType = new PolygonUDT().sqlType
    ExpressionEncoder[Polygon](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[Polygon]), nullable = true), sqlType)),
      deserializer = MagellanDeserializer(
        GetColumnByOrdinal(0, sqlType), classOf[Polygon]),
      clsTag = classTag[Polygon])
  }

  implicit def encoderForPolyLine: Encoder[PolyLine] = {
    val sqlType = new PolyLineUDT().sqlType
    ExpressionEncoder[PolyLine](
      schema = sqlType,
      flat = true,
      serializer = Seq(
        MagellanSerializer(
          BoundReference(0, ObjectType(classOf[PolyLine]), nullable = true), sqlType)),
      deserializer = MagellanDeserializer(
        GetColumnByOrdinal(0, sqlType), classOf[PolyLine]),
      clsTag = classTag[PolyLine])
  }
}
Example 25
Source File: CustomSinkSuite.scala From spark-structured-streaming-ml with Apache License 2.0

package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase

import scala.collection.mutable.ListBuffer

import org.scalatest.FunSuite
import org.apache.spark._
import org.apache.spark.sql.{Dataset, DataFrame, Encoder, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

class CustomSinkSuite extends FunSuite with DataFrameSuiteBase {

  test("really simple test of the custom sink") {
    import spark.implicits._
    val input = MemoryStream[String]
    val doubled = input.toDS().map(x => x + " " + x)
    val formatName = ("com.highperformancespark.examples" +
      "structuredstreaming.CustomSinkCollectorProvider")
    val query = doubled.writeStream
      .queryName("testCustomSinkBasic")
      .format(formatName)
      .start()
    val inputData = List("hi", "holden", "bye", "pandas")
    input.addData(inputData)
    assert(query.isActive === true)
    query.processAllAvailable()
    assert(query.exception === None)
    assert(Pandas.results(0) === inputData.map(x => x + " " + x))
  }
}

object Pandas {
  val results = new ListBuffer[Seq[String]]()
}

class CustomSinkCollectorProvider extends ForeachDatasetSinkProvider {
  override def func(df: DataFrame) {
    val spark = df.sparkSession
    import spark.implicits._
    Pandas.results += df.as[String].rdd.collect()
  }
}
Example 26
Source File: DatasetGenerator.scala From spark-testing-base with Apache License 2.0

package com.holdenkarau.spark.testing

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.{Dataset, Encoder, SQLContext}
import org.scalacheck.{Arbitrary, Gen}

import scala.reflect.ClassTag

object DatasetGenerator {

  def arbitrarySizedDataset[T: ClassTag : Encoder]
    (sqlCtx: SQLContext, minPartitions: Int = 1)
    (generator: Int => Gen[T]): Arbitrary[Dataset[T]] = {

    val rddGen: Gen[RDD[T]] =
      RDDGenerator.genSizedRDD[T](sqlCtx.sparkContext, minPartitions)(generator)
    val datasetGen: Gen[Dataset[T]] =
      rddGen.map(rdd => sqlCtx.createDataset(rdd))

    Arbitrary {
      datasetGen
    }
  }
}
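A hedged usage sketch for the generator above, checking a ScalaCheck property over generated Datasets. It assumes a local SparkSession and ScalaCheck on the classpath; the property and object name are illustrative only.

import com.holdenkarau.spark.testing.DatasetGenerator
import org.apache.spark.sql.SparkSession
import org.scalacheck.Gen
import org.scalacheck.Prop.forAll

object DatasetGeneratorSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("dataset-generator-sketch").getOrCreate()
    val sqlContext = spark.sqlContext
    import sqlContext.implicits._

    // arbitrarySizedDataset needs an Encoder[T] (here from sqlContext.implicits._)
    // and a size-aware generator Int => Gen[T]; the size argument is ignored here.
    val datasets = DatasetGenerator.arbitrarySizedDataset[String](sqlContext)(_ => Gen.alphaStr)

    // Property: generated Datasets never contain null strings.
    val property = forAll(datasets.arbitrary) { ds => ds.filter(_ == null).count() == 0 }
    property.check()
    spark.stop()
  }
}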