org.apache.spark.sql.functions.count Scala Examples
The following examples show how to use org.apache.spark.sql.functions.count.
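Before looking at the project examples, a minimal, self-contained sketch of the function itself may help; the session setup, table data, and column names below are illustrative and not taken from any of the projects.

// Minimal sketch (illustrative names): count("*") counts all rows per group,
// while count(col("c")) counts only the non-null values of column "c".
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, count}

object CountSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("count-sketch").getOrCreate()
    import spark.implicits._

    val df = Seq(("a", Some(1)), ("a", None), ("b", Some(3))).toDF("key", "value")

    df.groupBy($"key")
      .agg(count("*").as("rows"), count(col("value")).as("non_null_values"))
      .show()
    // key "a" -> rows = 2, non_null_values = 1; key "b" -> rows = 1, non_null_values = 1

    spark.stop()
  }
}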
Example 1
Source File: MicroBatchExecutionSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.StreamTest

class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter {

  import testImplicits._

  after {
    sqlContext.streams.active.foreach(_.stop())
  }

  test("SPARK-24156: do not plan a no-data batch again after it has already been planned") {
    val inputData = MemoryStream[Int]
    val df = inputData.toDF()
      .withColumn("eventTime", $"value".cast("timestamp"))
      .withWatermark("eventTime", "10 seconds")
      .groupBy(window($"eventTime", "5 seconds") as 'window)
      .agg(count("*") as 'count)
      .select($"window".getField("start").cast("long").as[Long], $"count".as[Long])

    testStream(df)(
      AddData(inputData, 10, 11, 12, 13, 14, 15), // Set watermark to 5
      CheckAnswer(),
      AddData(inputData, 25), // Set watermark to 15 to make MicroBatchExecution run no-data batch
      CheckAnswer((10, 5)),   // Last batch should be a no-data batch
      StopStream,
      Execute { q =>
        // Delete the last committed batch from the commit log to signify that the last batch
        // (a no-data batch) never completed
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purgeAfter(commit - 1)
      },
      // Add data before start so that MicroBatchExecution can plan a batch. It should not,
      // it should first re-run the incomplete no-data batch and then run a new batch to process
      // new data.
      AddData(inputData, 30),
      StartStream(),
      CheckNewAnswer((15, 1)), // This should not throw the error reported in SPARK-24156
      StopStream,
      Execute { q =>
        // Delete the entire commit log
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purge(commit + 1)
      },
      AddData(inputData, 50),
      StartStream(),
      CheckNewAnswer((25, 1), (30, 1)) // This should not throw the error reported in SPARK-24156
    )
  }
}
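The same windowed count aggregation also works outside Structured Streaming. Below is a minimal batch-mode sketch (assuming a SparkSession with spark.implicits._ in scope; the values mirror the test data above, and there is no watermark or trigger machinery involved).

// Sketch only: group events into 5-second windows and count the rows in each window,
// exactly as the streaming test does, but on a static DataFrame.
import org.apache.spark.sql.functions.{count, window}

val events = Seq(10, 11, 12, 13, 14, 15, 25).toDF("value")
  .withColumn("eventTime", $"value".cast("timestamp"))

events
  .groupBy(window($"eventTime", "5 seconds").as("window"))
  .agg(count("*").as("count"))
  .select($"window".getField("start").cast("long").as("window_start"), $"count")
  .show()
// window_start 10 -> count 5, window_start 15 -> count 1, window_start 25 -> count 1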
Example 2
Source File: WholeStageCodegenSparkSubmitSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution

import org.scalatest.{Assertions, BeforeAndAfterEach, Matchers}
import org.scalatest.concurrent.TimeLimits

import org.apache.spark.{SparkFunSuite, TestUtils}
import org.apache.spark.deploy.SparkSubmitSuite
import org.apache.spark.internal.Logging
import org.apache.spark.sql.{LocalSparkSession, QueryTest, Row, SparkSession}
import org.apache.spark.sql.functions.{array, col, count, lit}
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.unsafe.Platform
import org.apache.spark.util.ResetSystemProperties

// Due to the need to set driver's extraJavaOptions, this test needs to use actual SparkSubmit.
class WholeStageCodegenSparkSubmitSuite extends SparkFunSuite
  with Matchers
  with BeforeAndAfterEach
  with ResetSystemProperties {

  test("Generated code on driver should not embed platform-specific constant") {
    val unusedJar = TestUtils.createJarWithClasses(Seq.empty)

    // HotSpot JVM specific: Set up a local cluster with the driver/executor using mismatched
    // settings of UseCompressedOops JVM option.
    val argsForSparkSubmit = Seq(
      "--class", WholeStageCodegenSparkSubmitSuite.getClass.getName.stripSuffix("$"),
      "--master", "local-cluster[1,1,1024]",
      "--driver-memory", "1g",
      "--conf", "spark.ui.enabled=false",
      "--conf", "spark.master.rest.enabled=false",
      "--conf", "spark.driver.extraJavaOptions=-XX:-UseCompressedOops",
      "--conf", "spark.executor.extraJavaOptions=-XX:+UseCompressedOops",
      unusedJar.toString)
    SparkSubmitSuite.runSparkSubmit(argsForSparkSubmit, "../..")
  }
}

object WholeStageCodegenSparkSubmitSuite extends Assertions with Logging {

  var spark: SparkSession = _

  def main(args: Array[String]): Unit = {
    TestUtils.configTestLog4j("INFO")

    spark = SparkSession.builder().getOrCreate()

    // Make sure the test is run where the driver and the executors uses different object layouts
    val driverArrayHeaderSize = Platform.BYTE_ARRAY_OFFSET
    val executorArrayHeaderSize =
      spark.sparkContext.range(0, 1).map(_ => Platform.BYTE_ARRAY_OFFSET).collect.head.toInt
    assert(driverArrayHeaderSize > executorArrayHeaderSize)

    val df = spark.range(71773).select((col("id") % lit(10)).cast(IntegerType) as "v")
      .groupBy(array(col("v"))).agg(count(col("*")))
    val plan = df.queryExecution.executedPlan
    assert(plan.find(_.isInstanceOf[WholeStageCodegenExec]).isDefined)

    val expectedAnswer =
      Row(Array(0), 7178) :: Row(Array(1), 7178) :: Row(Array(2), 7178) ::
        Row(Array(3), 7177) :: Row(Array(4), 7177) :: Row(Array(5), 7177) ::
        Row(Array(6), 7177) :: Row(Array(7), 7177) :: Row(Array(8), 7177) ::
        Row(Array(9), 7177) :: Nil
    val result = df.collect
    QueryTest.sameRows(result.toSeq, expectedAnswer) match {
      case Some(errMsg) => fail(errMsg)
      case _ =>
    }
  }
}
Example 3
Source File: TestUtils.scala From m3d-engine with Apache License 2.0
package com.adidas.utils

import org.apache.spark.sql.functions.{col, count, lit}
import org.apache.spark.sql.{DataFrame, Row}

object TestUtils {

  implicit class ExtendedDataFrame(df: DataFrame) {

    def hasDiff(anotherDf: DataFrame): Boolean = {
      def printDiff(incoming: Boolean)(row: Row): Unit = {
        if (incoming) print("+ ") else print("- ")
        println(row)
      }

      val groupedDf =
        df.groupBy(df.columns.map(col): _*).agg(count(lit(1))).collect().toSet
      val groupedAnotherDf =
        anotherDf.groupBy(anotherDf.columns.map(col): _*).agg(count(lit(1))).collect().toSet

      groupedDf.diff(groupedAnotherDf).foreach(printDiff(incoming = true))
      groupedAnotherDf.diff(groupedDf).foreach(printDiff(incoming = false))

      groupedDf.diff(groupedAnotherDf).nonEmpty || groupedAnotherDf.diff(groupedDf).nonEmpty
    }
  }
}
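A possible way to use this helper in a test, assuming spark.implicits._ is in scope; the DataFrames and assertion style below are illustrative, not taken from the project.

// Sketch: ExtendedDataFrame adds hasDiff to any DataFrame via an implicit conversion.
// Row multiplicities matter, because each side is grouped with count(lit(1)) before diffing.
import com.adidas.utils.TestUtils.ExtendedDataFrame

val expectedDf = Seq(("a", 1), ("b", 2)).toDF("key", "value")
val actualDf   = Seq(("a", 1), ("b", 2)).toDF("key", "value")

// hasDiff returns true when either side contains grouped rows the other does not.
assert(!actualDf.hasDiff(expectedDf), "actual and expected DataFrames differ")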
Example 4
Source File: Mean.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Preconditions.{hasColumn, isNumeric}
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{count, sum}
import org.apache.spark.sql.types.{DoubleType, StructType, LongType}
import Analyzers._

case class MeanState(sum: Double, count: Long) extends DoubleValuedState[MeanState] {

  override def sum(other: MeanState): MeanState = {
    MeanState(sum + other.sum, count + other.count)
  }

  override def metricValue(): Double = {
    if (count == 0L) Double.NaN else sum / count
  }
}

case class Mean(column: String, where: Option[String] = None)
  extends StandardScanShareableAnalyzer[MeanState]("Mean", column)
  with FilterableAnalyzer {

  override def aggregationFunctions(): Seq[Column] = {
    sum(conditionalSelection(column, where)).cast(DoubleType) ::
      count(conditionalSelection(column, where)).cast(LongType) :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): Option[MeanState] = {
    ifNoNullsIn(result, offset, howMany = 2) { _ =>
      MeanState(result.getDouble(offset), result.getLong(offset + 1))
    }
  }

  override protected def additionalPreconditions(): Seq[StructType => Unit] = {
    hasColumn(column) :: isNumeric(column) :: Nil
  }

  override def filterCondition: Option[String] = where
}
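Internally this analyzer reduces to a single scan that collects a sum and a count side by side. Below is a plain-Spark sketch of that aggregation pattern, not deequ's public API; df and the "price" column are assumed, illustrative names.

// Sketch: compute a mean in one pass by pairing sum(...) with count(...),
// mirroring MeanState(sum, count). count(col) ignores nulls, matching the NaN-on-empty case.
import org.apache.spark.sql.functions.{col, count, sum}
import org.apache.spark.sql.types.{DoubleType, LongType}

val row = df.agg(
    sum(col("price")).cast(DoubleType).as("sum"),
    count(col("price")).cast(LongType).as("count"))
  .head()

val mean = if (row.getLong(1) == 0L) Double.NaN else row.getDouble(0) / row.getLong(1)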
Example 5
Source File: UniqueValueRatio.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.analyzers.Analyzers.COUNT_COL
import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

case class UniqueValueRatio(columns: Seq[String], where: Option[String] = None)
  extends ScanShareableFrequencyBasedAnalyzer("UniqueValueRatio", columns)
  with FilterableAnalyzer {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    sum(col(COUNT_COL).equalTo(lit(1)).cast(DoubleType)) :: count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    val numUniqueValues = result.getDouble(offset)
    val numDistinctValues = result.getLong(offset + 1).toDouble

    toSuccessMetric(numUniqueValues / numDistinctValues)
  }

  override def filterCondition: Option[String] = where
}

object UniqueValueRatio {
  def apply(column: String): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil)
  }

  def apply(column: String, where: Option[String]): UniqueValueRatio = {
    new UniqueValueRatio(column :: Nil, where)
  }
}
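ScanShareableFrequencyBasedAnalyzer evaluates these aggregations over a precomputed frequency table, one row per distinct value with its occurrence count in COUNT_COL. A rough plain-Spark equivalent of the ratio is sketched below; df and the "category" column are assumed, illustrative names.

// Sketch: unique value ratio = values occurring exactly once / number of distinct values.
import org.apache.spark.sql.functions.{col, count, lit, sum}
import org.apache.spark.sql.types.DoubleType

// Frequency table: one row per distinct value with its occurrence count.
val frequencies = df.groupBy(col("category")).agg(count(lit(1)).as("count"))

val ratioRow = frequencies.agg(
    sum(col("count").equalTo(lit(1)).cast(DoubleType)).as("unique"),
    count("*").as("distinct"))
  .head()

val uniqueValueRatio = ratioRow.getDouble(0) / ratioRow.getLong(1).toDouble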
Example 6
Source File: CountDistinct.scala From deequ with Apache License 2.0
package com.amazon.deequ.analyzers

import com.amazon.deequ.metrics.DoubleMetric
import org.apache.spark.sql.{Column, Row}
import org.apache.spark.sql.functions.count
import Analyzers._

case class CountDistinct(columns: Seq[String])
  extends ScanShareableFrequencyBasedAnalyzer("CountDistinct", columns) {

  override def aggregationFunctions(numRows: Long): Seq[Column] = {
    count("*") :: Nil
  }

  override def fromAggregationResult(result: Row, offset: Int): DoubleMetric = {
    toSuccessMetric(result.getLong(offset).toDouble)
  }
}

object CountDistinct {
  def apply(column: String): CountDistinct = {
    new CountDistinct(column :: Nil)
  }
}
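Over that same frequency table, counting distinct values is simply count("*") on its rows. The plain-Spark sketch below shows the idea alongside the built-in countDistinct; df and the "category" column are assumed, illustrative names.

// Sketch: each row of the frequency table is one distinct value, so counting its rows
// gives the distinct count. countDistinct from org.apache.spark.sql.functions is the shortcut.
import org.apache.spark.sql.functions.{col, count, countDistinct, lit}

val viaFrequencyTable = df.groupBy(col("category")).agg(count(lit(1))).count()
val viaBuiltin = df.select(countDistinct(col("category"))).head().getLong(0)
// Both yield the number of distinct values (countDistinct ignores nulls, groupBy keeps them).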
Example 7
Source File: NumberOfRowsConstraint.scala From drunken-data-quality with Apache License 2.0
package de.frosner.ddq.constraints

import org.apache.spark.sql.functions.count
import org.apache.spark.sql.{Column, DataFrame}

case class NumberOfRowsConstraint private[ddq] (expected: Column) extends Constraint {

  val fun = (df: DataFrame) => {
    val countDf = df.agg(count(new Column("*")).as(NumberOfRowsConstraint.countKey))
    val actual = countDf.collect().map(_.getLong(0)).apply(0)
    val satisfied = countDf.select(expected).collect().map(_.getBoolean(0)).apply(0)
    NumberOfRowsConstraintResult(
      constraint = this,
      actual = actual,
      status = if (satisfied) ConstraintSuccess else ConstraintFailure
    )
  }
}

object NumberOfRowsConstraint {

  private[constraints] val countKey: String = "count"

  def apply(expected: Column => Column): NumberOfRowsConstraint = {
    new NumberOfRowsConstraint(expected(new Column(countKey)))
  }

  def greaterThan(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ > expected)
  }

  def lessThan(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ < expected)
  }

  def equalTo(expected: Int): NumberOfRowsConstraint = {
    NumberOfRowsConstraint(_ === expected)
  }
}

case class NumberOfRowsConstraintResult(constraint: NumberOfRowsConstraint,
                                        actual: Long,
                                        status: ConstraintStatus)
  extends ConstraintResult[NumberOfRowsConstraint] {

  val message: String = {
    val expected = constraint.expected
    status match {
      case ConstraintSuccess => s"The number of rows satisfies $expected."
      case ConstraintFailure => s"The actual number of rows $actual does not satisfy $expected."
      case default => throw IllegalConstraintResultException(this)
    }
  }
}
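A quick way to exercise the constraint is to apply its fun directly to a DataFrame, as sketched below. The full drunken-data-quality Check API is not shown, df is any DataFrame in scope, and the import of ConstraintSuccess from the constraints package is an assumption.

// Sketch: greaterThan builds the predicate column `count > 10`; fun aggregates count("*")
// into a single-row DataFrame and evaluates that predicate against it.
import de.frosner.ddq.constraints.{ConstraintSuccess, NumberOfRowsConstraint}

val constraint = NumberOfRowsConstraint.greaterThan(10)
val result = constraint.fun(df) // df is an illustrative DataFrame in scope

result.status match {
  case ConstraintSuccess => println(s"row count ${result.actual} is greater than 10")
  case _                 => println(s"row count ${result.actual} is not greater than 10")
}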