org.apache.spark.sql.expressions.Window Scala Examples
The following examples show how to use org.apache.spark.sql.expressions.Window.
Each example is taken from an open-source project; the source file, project name, and license are noted above each snippet.
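Before the project excerpts, here is a minimal, self-contained sketch of the Window API, assuming a tiny in-memory DataFrame with invented column names (grp, val): it builds a WindowSpec with partitionBy/orderBy, then applies row_number and a running sum over it.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{row_number, sum}

object WindowQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("window-quickstart").getOrCreate()
    import spark.implicits._

    // Hypothetical sample data: (group, value)
    val df = Seq(("a", 1), ("a", 3), ("b", 2), ("b", 5)).toDF("grp", "val")

    // A WindowSpec partitioned by "grp" and ordered by "val"
    val byGroup = Window.partitionBy($"grp").orderBy($"val")

    df.select(
      $"grp",
      $"val",
      // position of each row within its group
      row_number().over(byGroup).as("rank_in_grp"),
      // running sum of "val" within each group, up to the current row
      sum($"val").over(byGroup.rowsBetween(Window.unboundedPreceding, Window.currentRow)).as("running_sum")
    ).show()

    spark.stop()
  }
}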
Example 1
Source File: HivePlanTest.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 2
Source File: FlintTestData.scala From flint with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.functions.{ udf, sum }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.percent_rank

trait FlintTestData {
  protected def sqlContext: SQLContext

  private object internalImplicits extends SQLImplicits {
    override protected def _sqlContext: SQLContext = sqlContext
  }

  import internalImplicits._
  import FlintTestData._

  protected lazy val testData: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 97).map(i => TestData(i.toLong, i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testData2: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 101).map(i => TestData2(i.toLong, i.toDouble, -i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testDataCached: DataFrame = {
    val df = DFConverter.newDataFrame(testData)
    df.cache
    df.count
    df
  }

  protected val withTime2Column = { df: DataFrame => df.withColumn("time2", df("time") * 2) }

  protected val withTime3ColumnUdf = { df: DataFrame =>
    val testUdf = udf({ time: Long => time * 2 })
    df.withColumn("time3", testUdf(df("time")))
  }

  protected val selectV = { df: DataFrame => df.select("v") }
  protected val selectExprVPlusOne = { df: DataFrame => df.selectExpr("v + 1 as v") }
  protected val filterV = { df: DataFrame => df.filter(df("v") > 0) }

  protected val orderByTime = { df: DataFrame => df.orderBy("time") }
  protected val orderByV = { df: DataFrame => df.orderBy("v") }

  protected val addRankColumn = { df: DataFrame =>
    df.withColumn("rank", percent_rank().over(Window.partitionBy("time").orderBy("v")))
  }

  protected val selectSumV = { df: DataFrame => df.select(sum("v")) }
  protected val selectExprSumV = { df: DataFrame => df.selectExpr("sum(v)") }
  protected val groupByTimeSumV = { df: DataFrame => df.groupBy("time").agg(sum("v").alias("v")) }

  protected val repartition = { df: DataFrame => df.repartition(10) }
  protected val coalesce = { df: DataFrame => df.coalesce(5) }

  protected val cache = { df: DataFrame => df.cache(); df.count(); df }
  protected val unpersist = { df: DataFrame => df.unpersist() }
}

object FlintTestData {
  case class TestData(time: Long, v: Double)
  case class TestData2(time: Long, v: Double, v2: Double)
}
Example 3
Source File: HivePlanTest.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 4
Source File: HivePlanTest.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 5
Source File: DeltaLoad.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.algo

import com.adidas.analytics.algo.DeltaLoad._
import com.adidas.analytics.algo.core.Algorithm
import com.adidas.analytics.algo.shared.DateComponentDerivation
import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration
import com.adidas.analytics.util.DataFrameUtils._
import com.adidas.analytics.util._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.{Logger, LoggerFactory}

  private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = {
    // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records)
    val partitionWindow = Window
      .partitionBy(businessKey.map(col): _*)
      .orderBy(technicalKey.map(component => col(component).desc): _*)

    // Ranking & projection
    val rankedDeltaRecords = deltaRecords
      .withColumn(rankingColumnName, row_number().over(partitionWindow))
      .filter(upsertRecordsModesFilterFunction)

    rankedDeltaRecords
      .filter(rankedDeltaRecords(rankingColumnName) === 1)
      .selectExpr(resultColumns: _*)
  }

  protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = {
    logger.info("Adding partitioning information if needed")
    try {
      dataFrames.map { df =>
        if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions) {
          df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions))
        } else df
      }
    } catch {
      case e: Throwable =>
        logger.error("Cannot add partitioning information for data frames.", e)
        //TODO: Handle failure case properly
        throw new RuntimeException("Unable to transform data frames.", e)
    }
  }
}

object DeltaLoad {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = {
    new DeltaLoad(spark, dfs, configLocation)
  }
}
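The DeltaLoad excerpt above is not self-contained: its surrounding class declaration is omitted and it refers to fields defined elsewhere in the project (businessKey, technicalKey, rankingColumnName, targetPartitions). As a hedged, standalone sketch of the same idea (keep only the newest record per business key by ranking with row_number over a descending window), the helper below uses invented names (LatestRecordPerKey, latestPerKey) and is not part of the m3d-engine project.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

object LatestRecordPerKey {
  // Keep only the newest record for each business key; "newest" is defined by a descending sort column.
  def latestPerKey(df: DataFrame, keyCols: Seq[String], orderCol: String): DataFrame = {
    val w = Window
      .partitionBy(keyCols.map(col): _*)
      .orderBy(col(orderCol).desc)
    df.withColumn("row_rank", row_number().over(w))
      .filter(col("row_rank") === 1)
      .drop("row_rank")
  }
}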
Example 6
Source File: HivePlanTest.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 7
Source File: HivePlanTest.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHive

class HivePlanTest extends QueryTest {
  import TestHive._
  import TestHive.implicits._

  // UDF constant folding
  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")
    val optimized = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  // Window expressions sharing the same partition by and order by clause
  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 8
Source File: HivePlanTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 9
Source File: MimirSparkRuntimeUtils.scala From mimir with Apache License 2.0
package mimir.exec.spark

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{ DataType, LongType }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{ spark_partition_id, monotonically_increasing_id, count, sum, first, lit, col }

object MimirSparkRuntimeUtils {

  def zipWithIndex(df: DataFrame, offset: Long = 1, indexName: String = "ROWIDX", indexType: DataType = LongType): DataFrame = {
    val dfWithPartitionId = df.withColumn("partition_id", spark_partition_id()).withColumn("inc_id", monotonically_increasing_id())

    val partitionOffsets = dfWithPartitionId
      .groupBy("partition_id")
      .agg(count(lit(1)) as "cnt", first("inc_id") as "inc_id")
      .orderBy("partition_id")
      .select(col("partition_id"), sum("cnt").over(Window.orderBy("partition_id")) - col("cnt") - col("inc_id") + lit(offset) as "cnt")
      .collect()
      .map(row => (row.getInt(0), row.getLong(1)))
      .toMap

    val theUdf = org.apache.spark.sql.functions.udf(
      (partitionId: Int) => partitionOffsets(partitionId),
      LongType
    )

    dfWithPartitionId
      .withColumn("partition_offset", theUdf(col("partition_id")))
      .withColumn(indexName, (col("partition_offset") + col("inc_id")).cast(indexType))
      .drop("partition_id", "partition_offset", "inc_id")
  }

  def writeDataSink(dataframe: DataFrame, format: String, options: Map[String, String], save: Option[String]) = {
    val dsFormat = dataframe.write.format(format)
    val dsOptions = options.toSeq.foldLeft(dsFormat)((ds, opt) => opt._1 match {
      case "mode" => ds.mode(opt._2)
      case _ => ds.option(opt._1, opt._2)
    })
    save match {
      case None => dsOptions.save
      case Some(outputFile) => {
        if (format.equals("com.github.potix2.spark.google.spreadsheets")) {
          val gsldfparts = outputFile.split("\\/")
          val gsldf = s"${gsldfparts(gsldfparts.length-2)}/${gsldfparts(gsldfparts.length-1)}"
          dsOptions.save(gsldf)
        } else {
          dsOptions.save(outputFile)
        }
      }
    }
  }
}
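For context, a hypothetical driver for the zipWithIndex helper above might look like the following; the ZipWithIndexDemo object, the sample data, and the row_idx column name are invented for illustration.

import mimir.exec.spark.MimirSparkRuntimeUtils
import org.apache.spark.sql.SparkSession

object ZipWithIndexDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("zip-with-index-demo").getOrCreate()
    import spark.implicits._

    // Hypothetical input data
    val people = Seq("alice", "bob", "carol").toDF("name")

    // Attach a stable 0-based row index using the helper from the example above
    val indexed = MimirSparkRuntimeUtils.zipWithIndex(people, offset = 0, indexName = "row_idx")
    indexed.show()

    spark.stop()
  }
}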
Example 10
Source File: HivePlanTest.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import hiveContext.sql
  import hiveContext.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")
    val optimized = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 11
Source File: OilPriceFunc.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.geomesa

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{udf, window, last, col, lag}

object OilPriceFunc {

  // use this if the window function misbehaves due to timezone e.g. BST
  // ./spark-shell --driver-java-options "-Duser.timezone=UTC"
  // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC'

  // define a function to reformat the date field
  def convert(date: String): String = {
    val df1 = new SimpleDateFormat("dd/MM/yyyy")
    val dt = df1.parse(date)
    val df2 = new SimpleDateFormat("yyyy-MM-dd")
    df2.format(dt)
  }

  // create and save oil price changes
  def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = {

    val oilPriceDF = spark.
      read.
      option("header", "true").
      option("inferSchema", "true").
      csv(inputfile)

    val convertDateUDF = udf { (Date: String) => convert(Date) }

    val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE")))

    // offset to start at beginning of week
    val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days"))

    val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window")
    // windowLastDF.show(20, false)

    val sortedWindow = Window.orderBy("window.start")

    val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow)
    val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol)
    // lagLastColDF.show(20, false)

    val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) =>
      var change = ((last - prevLast) compare 0).signum
      if (change == -1)
        change = 0
      change.toDouble
    }

    val findDateTwoDaysAgoUDF = udf { (date: String) =>
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val cal = Calendar.getInstance
      cal.setTime(dateFormat.parse(date))
      cal.add(Calendar.DATE, -3)
      dateFormat.format(cal.getTime)
    }

    val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc(
      lagLastColDF("last(PRICE)"),
      lagLastColDF("lastPrev(PRICE)")
    )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end")))
    // oilPriceChangeDF.show(20, false)

    oilPriceChangeDF.select("label", "commonFriday").
      write.
      format("com.databricks.spark.csv").
      option("header", "true").
      //.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
      save(outputfile)
  }
}
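A hypothetical driver for createOilPriceDF might look like the sketch below; the OilPriceDriver object and both file paths are placeholders, and the input CSV is assumed to have DATE (dd/MM/yyyy) and PRICE columns, as implied by the code above.

import io.gzet.geomesa.OilPriceFunc
import org.apache.spark.sql.SparkSession

object OilPriceDriver {
  def main(args: Array[String]): Unit = {
    // Run with UTC if the window function misbehaves due to timezone, as noted in the comments above, e.g.
    // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC' ...
    val spark = SparkSession.builder().appName("oil-price-func").getOrCreate()

    // Placeholder paths: an input CSV with DATE and PRICE columns, and an output location
    OilPriceFunc.createOilPriceDF("data/oil_prices.csv", "data/oil_price_changes", spark)

    spark.stop()
  }
}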