org.apache.spark.sql.functions.sum Scala Examples
The following examples show how to use org.apache.spark.sql.functions.sum.
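Before the project examples below, here is a minimal, self-contained sketch of the two most common ways to use sum: as a grouped aggregate and as a windowed aggregate. The DataFrame, column names, and session settings are illustrative only and are not taken from any of the projects below.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.sum

object SumBasics extends App {
  val spark = SparkSession.builder().master("local[*]").appName("sum-basics").getOrCreate()
  import spark.implicits._

  // Hypothetical input: (category, amount) pairs.
  val df = Seq(("a", 1.0), ("a", 2.0), ("b", 3.0)).toDF("category", "amount")

  // Grouped aggregate: one row per category with the summed amount.
  df.groupBy("category").agg(sum("amount").as("total")).show()

  // Windowed aggregate: each row keeps its columns and gains a running total per category.
  val byCategory = Window.partitionBy("category").orderBy("amount")
  df.withColumn("runningTotal", sum("amount").over(byCategory)).show()

  spark.stop()
}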
Example 1
Source File: FlintTestData.scala From flint with Apache License 2.0 | 5 votes |
package org.apache.spark.sql

import org.apache.spark.sql.functions.{ udf, sum }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.percent_rank

trait FlintTestData {
  protected def sqlContext: SQLContext

  private object internalImplicits extends SQLImplicits {
    override protected def _sqlContext: SQLContext = sqlContext
  }

  import internalImplicits._
  import FlintTestData._

  protected lazy val testData: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 97).map(i => TestData(i.toLong, i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testData2: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 101).map(i => TestData2(i.toLong, i.toDouble, -i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testDataCached: DataFrame = {
    val df = DFConverter.newDataFrame(testData)
    df.cache
    df.count
    df
  }

  protected val withTime2Column = { df: DataFrame => df.withColumn("time2", df("time") * 2) }

  protected val withTime3ColumnUdf = { df: DataFrame =>
    val testUdf = udf({ time: Long => time * 2 })
    df.withColumn("time3", testUdf(df("time")))
  }

  protected val selectV = { df: DataFrame => df.select("v") }
  protected val selectExprVPlusOne = { df: DataFrame => df.selectExpr("v + 1 as v") }
  protected val filterV = { df: DataFrame => df.filter(df("v") > 0) }

  protected val orderByTime = { df: DataFrame => df.orderBy("time") }
  protected val orderByV = { df: DataFrame => df.orderBy("v") }
  protected val addRankColumn = { df: DataFrame =>
    df.withColumn("rank", percent_rank().over(Window.partitionBy("time").orderBy("v")))
  }

  protected val selectSumV = { df: DataFrame => df.select(sum("v")) }
  protected val selectExprSumV = { df: DataFrame => df.selectExpr("sum(v)") }
  protected val groupByTimeSumV = { df: DataFrame => df.groupBy("time").agg(sum("v").alias("v")) }

  protected val repartition = { df: DataFrame => df.repartition(10) }
  protected val coalesce = { df: DataFrame => df.coalesce(5) }

  protected val cache = { df: DataFrame => df.cache(); df.count(); df }
  protected val unpersist = { df: DataFrame => df.unpersist() }
}

object FlintTestData {
  case class TestData(time: Long, v: Double)
  case class TestData2(time: Long, v: Double, v2: Double)
}
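The trait above exposes each transformation, including the three sum variants, as a plain DataFrame => DataFrame value, so they compose with Dataset.transform. A minimal sketch of that same pattern outside the trait; SumTransforms, grandTotal, and someDf are hypothetical names, and someDf is assumed to have "time" and "v" columns:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.sum

object SumTransforms {
  // Same shape as the trait's members: ordinary function values over DataFrames.
  val groupByTimeSumV: DataFrame => DataFrame = df => df.groupBy("time").agg(sum("v").alias("v"))
  val selectSumV: DataFrame => DataFrame = df => df.select(sum("v"))

  // Per-time totals first, then a single grand total.
  def grandTotal(someDf: DataFrame): DataFrame =
    someDf.transform(groupByTimeSumV).transform(selectSumV)
}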
Example 2
Source File: StructuredStreamingWordCount.scala From structured-streaming-application with Apache License 2.0 | 5 votes |
package knolx.spark

import com.datastax.driver.core.Cluster
import knolx.Config._
import knolx.KnolXLogger
import knolx.spark.CassandraForeachWriter.writeToCassandra
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType

object StructuredStreamingWordCount extends App with KnolXLogger {
  val cluster = Cluster.builder.addContactPoints(cassandraHosts).build
  val session = cluster.newSession()

  info("Creating Keypsace and tables in Cassandra...")
  session.execute(s"CREATE KEYSPACE IF NOT EXISTS $keyspace WITH " +
    "replication = {'class':'SimpleStrategy','replication_factor':1};")
  session.execute(s"CREATE TABLE IF NOT EXISTS $keyspace.wordcount ( word text PRIMARY KEY,count int );")

  info("Closing DB connection...")
  session.close()
  session.getCluster.close()

  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Creating Streaming DF...")
  val dataStream = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", topic)
    .load()

  info("Writing data to Cassandra...")
  val query = dataStream
    .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
    .groupBy(col("word"))
    .agg(sum("count").as("count"))
    .writeStream
    .outputMode(OutputMode.Update())
    .foreach(writeToCassandra)
    .option("checkpointLocation", checkPointDir)
    .start()

  info("Waiting for the query to terminate...")
  query.awaitTermination()
  query.stop()
}
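The example above depends on a running Cassandra cluster plus the project's Config and CassandraForeachWriter. For trying out the same sum-based streaming aggregation locally, here is a reduced sketch that writes to the console sink instead; the broker address and topic name are placeholders, not values from the project's configuration.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.types.StringType

object WordCountToConsole extends App {
  val spark = SparkSession.builder().master("local[*]").appName("wordcount-console").getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  val counts = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "localhost:9092") // placeholder broker
    .option("subscribe", "words")                        // placeholder topic
    .load()
    .select(col("value").cast(StringType).as("word"), lit(1).as("count"))
    .groupBy(col("word"))
    .agg(sum("count").as("count"))

  // Update mode emits only the groups whose running sums changed in each micro-batch.
  counts.writeStream
    .outputMode(OutputMode.Update())
    .format("console")
    .start()
    .awaitTermination()
}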
Example 3
Source File: Mean.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{count, sum}
import org.apache.spark.sql.types.{DoubleType, StructType, LongType}
import Analyzers._

case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] {

  override def sum(other: MeanState): MeanState = {
    MeanState(sum + other.sum, count + other.count)
  }

  override def metricValue(): Double = {
    if (count == 0L) Double.NaN else sum / count
  }
}

case class Mean(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MeanState]("Mean", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    sum(conditionalSelection(column, where)).cast(DoubleType) ::
      count(conditionalSelection(column, where)).cast(LongType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = {
    ifNoNullsIn(result, offset, howMany = 2) { _ =>
      MeanState(result.getDouble(offset), result.getLong(offset + 1))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
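Analyzers like Mean are normally not invoked directly; they are handed to deequ's analysis runner, which evaluates their aggregation functions in a single pass over the data. A minimal sketch, assuming deequ's AnalysisRunner and AnalyzerContext entry points are on the classpath; the data and the "price" column are hypothetical:

import com.amazon.deequ.analyzers.Mean
import com.amazon.deequ.analyzers.runners.{AnalysisRunner, AnalyzerContext}
import org.apache.spark.sql.SparkSession

object MeanAnalyzerExample extends App {
  val spark = SparkSession.builder().master("local[*]").appName("deequ-mean").getOrCreate()
  import spark.implicits._

  // Hypothetical data with a numeric "price" column.
  val df = Seq(("a", 10.0), ("b", 20.0), ("c", 30.0)).toDF("item", "price")

  val analysisResult = AnalysisRunner
    .onData(df)
    .addAnalyzer(Mean("price"))
    .run()

  // Turn the computed metrics into a DataFrame for inspection.
  AnalyzerContext.successMetricsAsDataFrame(spark, analysisResult).show()

  spark.stop()
}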
Example 4
Source File: UniqueValueRatio.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    val numUniqueValues = result.getDouble(offset)
    val numDistinctValues = result.getLong(offset + 1).toDouble
    toSuccessMetric(numUniqueValues / numDistinctValues)
  }

  override def filterCondition: Option[String] = where
}

object UniqueValueRatio {

  def apply(column: String): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil)
  }

  def apply(column: String, where: Option[String]): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil, where)
  }
}
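The analyzer works on a frequency table (COUNT_COL holds the per-value counts) and uses sum to count the values that occur exactly once. The same unique-over-distinct ratio can be written with plain DataFrame operations, which makes the role of that sum clearer; the DataFrame, the "category" column, and the "cnt" alias below are hypothetical:

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions.{col, count, lit, sum}

object UniqueValueRatioByHand {
  def uniqueValueRatio(df: DataFrame): DataFrame = {
    // Frequency table: one row per distinct value of "category" with its count.
    val frequencies = df.groupBy(col("category")).agg(count(lit(1)).as("cnt"))

    // Unique values appear exactly once; distinct values are all rows of the frequency table.
    frequencies.agg(
      (sum(col("cnt").equalTo(lit(1)).cast("double")) / count(lit(1))).as("uniqueValueRatio")
    )
  }
}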
Example 5
Source File: MutualInformation.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers._
import com.amazon.deequ.metrics.{DoubleMetric, Entity}
import org.apache.spark.sql.functions.{col, sum, udf}
import org.apache.spark.sql.types.StructType
import Analyzers.COUNT_COL
import com.amazon.deequ.analyzers.runners.MetricCalculationException

// Class declaration restored for context (the excerpt elides it along with the metric computation).
case class MutualInformation(columns: Seq[String], where: Option[String] = None)
  extends FrequencyBasedAnalyzer(columns)
  with FilterableAnalyzer {

  override def preconditions: Seq[StructType => Unit] = {
    Preconditions.exactlyNColumns(columns, 2) +: super.preconditions
  }

  override def toFailureMetric(exception: Exception): DoubleMetric = {
    metricFromFailure(exception, "MutualInformation", columns.mkString(","), Entity.Mutlicolumn)
  }

  override def filterCondition: Option[String] = where
}

object MutualInformation {

  def apply(columnA: String, columnB: String): MutualInformation = {
    new MutualInformation(columnA :: columnB :: Nil)
  }
}
Example 6
Source File: Entropy.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, sum, udf}

case class Entropy(column: String, where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Entropy", column :: Nil)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    val summands = udf { (count: Double) =>
      if (count == 0.0) {
        0.0
      } else {
        -(count / numRows) * math.log(count / numRows)
      }
    }

    sum(summands(col(COUNT_COL))) :: Nil
  }

  override def filterCondition: Option[String] = where
}
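The UDF above computes -p * log(p) per frequency, with p = count / numRows, and sum adds the summands into the entropy. The same summand can also be expressed with built-in column functions, which Spark can optimize more readily than a UDF; a sketch, where the frequencies DataFrame (with a "count" column) and numRows stand in for the analyzer's internal frequency table and row count:

import org.apache.spark.sql.{Column, DataFrame}
import org.apache.spark.sql.functions.{col, lit, log, sum, when}

object EntropyWithoutUdf {
  def entropy(frequencies: DataFrame, numRows: Long): DataFrame = {
    val p: Column = col("count") / lit(numRows.toDouble)

    // Guard the count == 0 case exactly as the UDF does, then sum -p * log(p).
    frequencies.agg(
      sum(when(col("count") === 0, lit(0.0)).otherwise(-p * log(p))).as("entropy")
    )
  }
}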
Example 7
Source File: Sum.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.functions.sum
import org.apache.spark.sql.types.{DoubleType, StructType}
import org.apache.spark.sql.{Column, Row}
import Analyzers._

case class SumState(sum: Double) extends DoubleValuedState[SumState] {

  override def sum(other: SumState): SumState = {
    SumState(sum + other.sum)
  }

  override def metricValue(): Double = {
    sum
  }
}

case class Sum(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[SumState]("Sum", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    sum(conditionalSelection(column, where)).cast(DoubleType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[SumState] = {
    ifNoNullsIn(result, offset) { _ =>
      SumState(result.getDouble(offset))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
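The split between SumState and the Sum analyzer is what allows incremental computation: partial states from separate partitions or data loads are merged with sum(other) and only then turned into a metric via metricValue. A tiny sketch of that contract, assuming the SumState case class above can be constructed directly (deequ on the classpath):

import com.amazon.deequ.analyzers.SumState

object SumStateMerge extends App {
  // Partial sums, e.g. from two separately processed batches.
  val partA = SumState(10.5)
  val partB = SumState(4.5)

  // Merge the states first, then derive the metric: prints 15.0.
  println(partA.sum(partB).metricValue())
}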
Example 8
Source File: Uniqueness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.Column
import org.apache.spark.sql.functions.{col, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class Uniqueness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Uniqueness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Uniqueness {

  def apply(column: String): Uniqueness = {
    new Uniqueness(column :: Nil)
  }

  def apply(column: String, where: Option[String]): Uniqueness = {
    new Uniqueness(column :: Nil, where)
  }
}
Example 9
Source File: Distinctness.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import org.apache.spark.sql.functions.{col, sum}
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.Column

case class Distinctness(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("Distinctness", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    (sum(col(COUNT_COL).geq(1).cast(DoubleType)) / numRows) :: Nil
  }

  override def filterCondition: Option[String] = where
}

object Distinctness {

  def apply(column: String): Distinctness = {
    new Distinctness(column :: Nil)
  }
}
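Taken together, the three frequency-based analyzers above differ only in what the sum counts and what it is divided by. Worked example for a column with values a, a, b, c: the frequency table has counts a -> 2, b -> 1, c -> 1, so numRows = 4, distinct values = 3, and unique values (count exactly 1) = 2. Uniqueness (Example 8) is 2/4 = 0.5, Distinctness (Example 9) is 3/4 = 0.75, and UniqueValueRatio (Example 4) is 2/3, roughly 0.67.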