org.apache.spark.sql.expressions.UserDefinedAggregateFunction Scala Examples
The following examples show how to use org.apache.spark.sql.expressions.UserDefinedAggregateFunction.
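Before the examples, here is the general pattern they all rely on: a UserDefinedAggregateFunction instance is registered on the SparkSession and then invoked from SQL, or applied directly as a column through the DataFrame API. This is a minimal sketch of that pattern; MyUdaf, my_udaf, some_table, and someDf are placeholder names, not part of any example below.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.DataFrame

val spark = SparkSession.builder().appName("udaf-demo").master("local[*]").getOrCreate()
val someDf: DataFrame = spark.read.json("some/path.json") // placeholder input

// Register a UDAF instance under a SQL-callable name (MyUdaf is hypothetical).
spark.udf.register("my_udaf", new MyUdaf())

// Invoke it from SQL ...
spark.sql("SELECT my_udaf(value) AS aggregated FROM some_table")

// ... or apply an instance directly through the DataFrame API.
val myUdaf = new MyUdaf()
someDf.agg(myUdaf(someDf("value")))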
Example 1
Source File: TemporalUdafs.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.impl.temporal

import org.apache.logging.log4j.scala.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types.{CalendarIntervalType, DataType, LongType, StructField, StructType}
import org.apache.spark.unsafe.types.CalendarInterval
import org.opencypher.okapi.impl.temporal.TemporalConstants
import org.opencypher.morpheus.impl.temporal.TemporalConversions._

object TemporalUdafs extends Logging {

  abstract class SimpleDurationAggregation(aggrName: String) extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType)))
    override def bufferSchema: StructType = StructType(Array(StructField(aggrName, CalendarIntervalType)))
    override def dataType: DataType = CalendarIntervalType
    override def deterministic: Boolean = true
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(0, 0L)
    }
    override def evaluate(buffer: Row): Any = buffer.getAs[CalendarInterval](0)
  }

  class DurationSum extends SimpleDurationAggregation("sum") {
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0))
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0))
    }
  }

  class DurationMax extends SimpleDurationAggregation("max") {
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val currMaxInterval = buffer.getAs[CalendarInterval](0)
      val inputInterval = input.getAs[CalendarInterval](0)
      buffer(0) = if (currMaxInterval.toDuration.compare(inputInterval.toDuration) >= 0) currMaxInterval else inputInterval
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val interval1 = buffer1.getAs[CalendarInterval](0)
      val interval2 = buffer2.getAs[CalendarInterval](0)
      buffer1(0) = if (interval1.toDuration.compare(interval2.toDuration) >= 0) interval1 else interval2
    }
  }

  class DurationMin extends SimpleDurationAggregation("min") {
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(Integer.MAX_VALUE, Long.MaxValue)
    }
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      val currMinInterval = buffer.getAs[CalendarInterval](0)
      val inputInterval = input.getAs[CalendarInterval](0)
      buffer(0) = if (inputInterval.toDuration.compare(currMinInterval.toDuration) >= 0) currMinInterval else inputInterval
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      val interval1 = buffer1.getAs[CalendarInterval](0)
      val interval2 = buffer2.getAs[CalendarInterval](0)
      buffer1(0) = if (interval2.toDuration.compare(interval1.toDuration) >= 0) interval1 else interval2
    }
  }

  class DurationAvg extends UserDefinedAggregateFunction {
    override def inputSchema: StructType = StructType(Array(StructField("duration", CalendarIntervalType)))
    override def bufferSchema: StructType = StructType(Array(StructField("sum", CalendarIntervalType), StructField("cnt", LongType)))
    override def dataType: DataType = CalendarIntervalType
    override def deterministic: Boolean = true
    override def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = new CalendarInterval(0, 0L)
      buffer(1) = 0L
    }
    override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      buffer(0) = buffer.getAs[CalendarInterval](0).add(input.getAs[CalendarInterval](0))
      buffer(1) = buffer.getLong(1) + 1
    }
    override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer2.getAs[CalendarInterval](0).add(buffer1.getAs[CalendarInterval](0))
      buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
    }
    override def evaluate(buffer: Row): Any = {
      val sumInterval = buffer.getAs[CalendarInterval](0)
      val cnt = buffer.getLong(1)
      new CalendarInterval((sumInterval.months / cnt).toInt, sumInterval.microseconds / cnt)
    }
  }

  val durationSum = new DurationSum()
  val durationAvg = new DurationAvg()
  val durationMin = new DurationMin()
  val durationMax = new DurationMax()
}
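The singleton vals at the bottom make these aggregators easy to register. A hypothetical registration and query, assuming an existing SparkSession named spark and a view events with a CalendarInterval column duration (neither is part of the source file):

// Register the pre-built aggregator instances under SQL-callable names.
spark.udf.register("duration_sum", TemporalUdafs.durationSum)
spark.udf.register("duration_avg", TemporalUdafs.durationAvg)

// Aggregate interval-typed columns directly in SQL.
spark.sql("SELECT duration_sum(duration) AS total, duration_avg(duration) AS mean FROM events")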
Example 2
Source File: PercentileUdafs.scala From morpheus with Apache License 2.0
package org.opencypher.morpheus.impl.expressions

import org.apache.logging.log4j.scala.Logging
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._
import org.opencypher.okapi.impl.exception.IllegalArgumentException

import scala.collection.mutable

// Implemented as UDAFs, as abs(percentile_rank() - given_percentage) inside min() is not allowed.
object PercentileUdafs extends Logging {

  abstract class PercentileAggregation(percentile: Double) extends UserDefinedAggregateFunction {
    def inputSchema: StructType = StructType(Array(StructField("value", DoubleType)))
    def bufferSchema: StructType = StructType(Array(StructField("array_buffer", ArrayType(DoubleType, containsNull = false))))
    def deterministic: Boolean = true

    def initialize(buffer: MutableAggregationBuffer): Unit = {
      buffer(0) = Array[Double]()
    }

    def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
      if (input(0) != null) {
        buffer(0) = buffer(0).asInstanceOf[mutable.WrappedArray[Double]] :+ input.getDouble(0)
      }
    }

    def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
      buffer1(0) = buffer1(0).asInstanceOf[mutable.WrappedArray[Double]] ++ buffer2(0).asInstanceOf[mutable.WrappedArray[Double]]
    }
  }

  class PercentileDisc(percentile: Double, numberType: DataType) extends PercentileAggregation(percentile) {
    def dataType: DataType = numberType

    def evaluate(buffer: Row): Any = {
      val sortedValues = buffer(0).asInstanceOf[mutable.WrappedArray[Double]].sortWith(_ < _)
      if (sortedValues.isEmpty) return null
      val position = (sortedValues.length * percentile).round.toInt
      val result = if (position == 0) sortedValues(0) else sortedValues(position - 1)
      dataType match {
        case LongType => result.toLong
        case DoubleType => result
        case e => throw IllegalArgumentException("an Integer or a Float", e)
      }
    }
  }

  class PercentileCont(percentile: Double) extends PercentileAggregation(percentile) {
    def dataType: DataType = DoubleType

    def evaluate(buffer: Row): Any = {
      val sortedValues = buffer(0).asInstanceOf[mutable.WrappedArray[Double]].sortWith(_ < _)
      if (sortedValues.isEmpty) return null
      val exact_position = 1 + ((sortedValues.length - 1) * percentile)
      val prec = exact_position.floor.toInt
      val succ = exact_position.ceil.toInt
      val weight = succ - exact_position
      exact_position match {
        case pos if pos < 1 => (1 - weight) * sortedValues(succ) + weight * sortedValues(prec)
        case pos if pos == succ => sortedValues(prec - 1)
        case _ => (1 - weight) * sortedValues(succ - 1) + weight * sortedValues(prec - 1)
      }
    }
  }

  def percentileDisc(percentile: Double, numberType: DataType) = new PercentileDisc(percentile, numberType)
  def percentileCont(percentile: Double) = new PercentileCont(percentile)
}
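The factory methods at the bottom return fresh instances, and a UDAF instance can be applied directly as a column. A hypothetical call, assuming a DataFrame df with a Double column named value:

import org.apache.spark.sql.types.DoubleType

// percentileDisc picks an actual sample; percentileCont interpolates.
val p50 = PercentileUdafs.percentileDisc(0.5, DoubleType)
val p90 = PercentileUdafs.percentileCont(0.9)
df.agg(p50(df("value")).as("median"), p90(df("value")).as("p90_interpolated")).show()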
Example 3
Source File: StatefulKLLSketch.scala From deequ with Apache License 2.0
package org.apache.spark.sql

import java.nio.ByteBuffer

import com.amazon.deequ.analyzers.QuantileNonSample
import com.amazon.deequ.analyzers.catalyst.KLLSketchSerializer
import com.google.common.primitives.Doubles
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

private[sql] class StatefulKLLSketch(
    sketchSize: Int,
    shrinkingFactor: Double)
  extends UserDefinedAggregateFunction {

  val OBJECT_POS = 0
  val MIN_POS = 1
  val MAX_POS = 2

  override def inputSchema: StructType = StructType(StructField("value", DoubleType) :: Nil)

  override def bufferSchema: StructType = StructType(
    StructField("data", BinaryType) ::
    StructField("minimum", DoubleType) ::
    StructField("maximum", DoubleType) :: Nil)

  override def dataType: DataType = BinaryType

  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    val qsketch = new QuantileNonSample[Double](sketchSize, shrinkingFactor)
    buffer(OBJECT_POS) = serialize(qsketch)
    buffer(MIN_POS) = Int.MaxValue.toDouble
    buffer(MAX_POS) = Int.MinValue.toDouble
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (input.isNullAt(OBJECT_POS)) {
      return
    }
    val tmp = input.getDouble(OBJECT_POS)
    val kll = deserialize(buffer.getAs[Array[Byte]](OBJECT_POS))
    kll.update(tmp)
    buffer(OBJECT_POS) = serialize(kll)
    buffer(MIN_POS) = Math.min(buffer.getDouble(MIN_POS), tmp)
    buffer(MAX_POS) = Math.max(buffer.getDouble(MAX_POS), tmp)
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    if (buffer2.isNullAt(OBJECT_POS)) {
      return
    }
    val kll_this = deserialize(buffer1.getAs[Array[Byte]](OBJECT_POS))
    val kll_other = deserialize(buffer2.getAs[Array[Byte]](OBJECT_POS))
    val kll_ret = kll_this.merge(kll_other)
    buffer1(OBJECT_POS) = serialize(kll_ret)
    buffer1(MIN_POS) = Math.min(buffer1.getDouble(MIN_POS), buffer2.getDouble(MIN_POS))
    buffer1(MAX_POS) = Math.max(buffer1.getDouble(MAX_POS), buffer2.getDouble(MAX_POS))
  }

  override def evaluate(buffer: Row): Any = {
    toBytes(buffer.getDouble(MIN_POS), buffer.getDouble(MAX_POS), buffer.getAs[Array[Byte]](OBJECT_POS))
  }

  def toBytes(min: Double, max: Double, obj: Array[Byte]): Array[Byte] = {
    val buffer2 = ByteBuffer.wrap(new Array(Doubles.BYTES + Doubles.BYTES + obj.length))
    buffer2.putDouble(min)
    buffer2.putDouble(max)
    buffer2.put(obj)
    buffer2.array()
  }

  def serialize(obj: QuantileNonSample[Double]): Array[Byte] = {
    KLLSketchSerializer.serializer.serialize(obj)
  }

  def deserialize(bytes: Array[Byte]): QuantileNonSample[Double] = {
    KLLSketchSerializer.serializer.deserialize(bytes)
  }
}
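Because the class is private[sql], calling code must itself live in the org.apache.spark.sql package. A hypothetical aggregation, with illustrative constructor arguments and a DataFrame df holding a Double column value (none of these names come from the source file):

// Sketch parameters here are illustrative, not deequ's defaults.
val sketch = new StatefulKLLSketch(sketchSize = 2048, shrinkingFactor = 0.64)
val stateBytes = df.agg(sketch(df("value"))).head.getAs[Array[Byte]](0)
// Per toBytes above, stateBytes is: 8-byte min ++ 8-byte max ++ serialized sketch.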
Example 4
Source File: StatefulDataType.scala From deequ with Apache License 2.0
package org.apache.spark.sql

import com.amazon.deequ.analyzers.DataTypeHistogram
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

import scala.util.matching.Regex

private[sql] class StatefulDataType extends UserDefinedAggregateFunction {

  val SIZE_IN_BYTES = 40

  val NULL_POS = 0
  val FRACTIONAL_POS = 1
  val INTEGRAL_POS = 2
  val BOOLEAN_POS = 3
  val STRING_POS = 4

  val FRACTIONAL: Regex = """^(-|\+)? ?\d*\.\d*$""".r
  val INTEGRAL: Regex = """^(-|\+)? ?\d*$""".r
  val BOOLEAN: Regex = """^(true|false)$""".r

  override def inputSchema: StructType = StructType(StructField("value", StringType) :: Nil)

  override def bufferSchema: StructType = StructType(
    StructField("null", LongType) ::
    StructField("fractional", LongType) ::
    StructField("integral", LongType) ::
    StructField("boolean", LongType) ::
    StructField("string", LongType) :: Nil)

  override def dataType: types.DataType = BinaryType

  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(NULL_POS) = 0L
    buffer(FRACTIONAL_POS) = 0L
    buffer(INTEGRAL_POS) = 0L
    buffer(BOOLEAN_POS) = 0L
    buffer(STRING_POS) = 0L
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (input.isNullAt(0)) {
      buffer(NULL_POS) = buffer.getLong(NULL_POS) + 1L
    } else {
      input.getString(0) match {
        case FRACTIONAL(_) => buffer(FRACTIONAL_POS) = buffer.getLong(FRACTIONAL_POS) + 1L
        case INTEGRAL(_) => buffer(INTEGRAL_POS) = buffer.getLong(INTEGRAL_POS) + 1L
        case BOOLEAN(_) => buffer(BOOLEAN_POS) = buffer.getLong(BOOLEAN_POS) + 1L
        case _ => buffer(STRING_POS) = buffer.getLong(STRING_POS) + 1L
      }
    }
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(NULL_POS) = buffer1.getLong(NULL_POS) + buffer2.getLong(NULL_POS)
    buffer1(FRACTIONAL_POS) = buffer1.getLong(FRACTIONAL_POS) + buffer2.getLong(FRACTIONAL_POS)
    buffer1(INTEGRAL_POS) = buffer1.getLong(INTEGRAL_POS) + buffer2.getLong(INTEGRAL_POS)
    buffer1(BOOLEAN_POS) = buffer1.getLong(BOOLEAN_POS) + buffer2.getLong(BOOLEAN_POS)
    buffer1(STRING_POS) = buffer1.getLong(STRING_POS) + buffer2.getLong(STRING_POS)
  }

  override def evaluate(buffer: Row): Any = {
    DataTypeHistogram.toBytes(buffer.getLong(NULL_POS), buffer.getLong(FRACTIONAL_POS),
      buffer.getLong(INTEGRAL_POS), buffer.getLong(BOOLEAN_POS), buffer.getLong(STRING_POS))
  }
}
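A hypothetical use of the counter (again only compilable inside org.apache.spark.sql because of the private[sql] modifier), assuming a DataFrame df with a String column named raw:

// Count how many values in a string column look null, fractional,
// integral, boolean, or plain string.
val typeCounter = new StatefulDataType()
val histogramBytes = df.agg(typeCounter(df("raw"))).head.getAs[Array[Byte]](0)
// histogramBytes packs the five counters via DataTypeHistogram.toBytes.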
Example 5
Source File: A_8_MyAverage.scala From wow-spark with MIT License
package com.sev7e0.wow.sql

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

object A_8_MyAverage extends UserDefinedAggregateFunction {

  override def inputSchema: StructType = StructType(StructField("inputColumn", LongType) :: Nil)

  override def bufferSchema: StructType = {
    StructType(StructField("sum", LongType) :: StructField("count", LongType) :: Nil)
  }

  override def dataType: DataType = DoubleType

  override def deterministic: Boolean = true

  override def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = 0L
    buffer(1) = 0L
  }

  override def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    if (!input.isNullAt(0)) {
      buffer(0) = buffer.getLong(0) + input.getLong(0)
      buffer(1) = buffer.getLong(1) + 1
    }
  }

  override def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    buffer1(0) = buffer1.getLong(0) + buffer2.getLong(0)
    buffer1(1) = buffer1.getLong(1) + buffer2.getLong(1)
  }

  override def evaluate(buffer: Row): Any = buffer.getLong(0).toDouble / buffer.getLong(1)

  def main(args: Array[String]): Unit = {
    val sparkSession = SparkSession.builder().appName("A_8_MyAverage")
      .master("local")
      .getOrCreate()
    sparkSession.udf.register("A_8_MyAverage", A_8_MyAverage)

    val dataFrame = sparkSession.read.json("src/main/resources/sparkresource/employees.json")
    dataFrame.createOrReplaceTempView("employees")

    val result = sparkSession.sql("select A_8_MyAverage(salary) as average_salary from employees")
    result.show()
  }
}
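Since the object is itself the UDAF instance, the same aggregation can also be expressed through the DataFrame API without going through SQL. A sketch, reusing the dataFrame loaded in main:

// DataFrame-API equivalent of the SQL query above.
dataFrame.agg(A_8_MyAverage(dataFrame("salary")).as("average_salary")).show()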
Example 6
Source File: HyperLogLogMerge.scala From spark-hyperloglog with Apache License 2.0
package com.mozilla.spark.sql.hyperloglog.aggregates

import com.twitter.algebird.{Bytes, DenseHLL, HyperLogLog}
import org.apache.spark.sql.Row
import org.apache.spark.sql.expressions.{MutableAggregationBuffer, UserDefinedAggregateFunction}
import org.apache.spark.sql.types._

class HyperLogLogMerge extends UserDefinedAggregateFunction {

  def inputSchema: StructType = StructType(StructField("value", BinaryType) :: Nil)

  def bufferSchema: StructType = StructType(
    StructField("count", BinaryType) ::
    StructField("bits", IntegerType) :: Nil)

  def dataType: DataType = BinaryType

  def deterministic: Boolean = true

  def initialize(buffer: MutableAggregationBuffer): Unit = {
    buffer(0) = null
    buffer(1) = 0
  }

  def update(buffer: MutableAggregationBuffer, input: Row): Unit = {
    val hll = HyperLogLog.fromBytes(input.getAs[Array[Byte]](0)).toDenseHLL
    if (buffer(0) != null) {
      hll.updateInto(buffer.getAs[Array[Byte]](0))
    } else {
      buffer(0) = hll.v.array
      buffer(1) = hll.bits
    }
  }

  def merge(buffer1: MutableAggregationBuffer, buffer2: Row): Unit = {
    if (buffer1(0) == null) {
      buffer1(0) = buffer2(0)
      buffer1(1) = buffer2(1)
    } else if (buffer1(0) != null && buffer2(0) != null) {
      val state2 = new DenseHLL(buffer2.getAs[Int](1), new Bytes(buffer2.getAs[Array[Byte]](0)))
      state2.updateInto(buffer1.getAs[Array[Byte]](0))
    }
  }

  def evaluate(buffer: Row): Any = {
    val state = new DenseHLL(buffer.getAs[Int](1), new Bytes(buffer.getAs[Array[Byte]](0)))
    com.twitter.algebird.HyperLogLog.toBytes(state)
  }
}
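A hypothetical end-to-end use, assuming a DataFrame df whose BinaryType column hll holds serialized algebird HyperLogLog values; the merged registers are decoded back into an HLL to read off the cardinality estimate:

import com.twitter.algebird.HyperLogLog

// Merge per-row HLL registers into a single serialized DenseHLL.
val hllMerge = new HyperLogLogMerge()
val mergedBytes = df.agg(hllMerge(df("hll"))).head.getAs[Array[Byte]](0)

// Decode and extract the approximate distinct count.
val estimate = HyperLogLog.fromBytes(mergedBytes).approximateSize.estimate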