org.apache.spark.sql.expressions.Window Scala Examples
The following examples show how to use org.apache.spark.sql.expressions.Window.
Each example is taken from an open-source project; the source file, project name, and license are noted above each snippet.
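Before the project excerpts, here is a minimal, self-contained sketch of the Window API, assuming a tiny in-memory DataFrame with invented column names (grp, val): it builds a WindowSpec with partitionBy/orderBy, then applies row_number and a running sum over it.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{row_number, sum}

object WindowQuickStart {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("window-quickstart").getOrCreate()
    import spark.implicits._

    // Hypothetical sample data: (group, value)
    val df = Seq(("a", 1), ("a", 3), ("b", 2), ("b", 5)).toDF("grp", "val")

    // A WindowSpec partitioned by "grp" and ordered by "val"
    val byGroup = Window.partitionBy($"grp").orderBy($"val")

    df.select(
      $"grp",
      $"val",
      // position of each row within its group
      row_number().over(byGroup).as("rank_in_grp"),
      // running sum of "val" within each group, up to the current row
      sum($"val").over(byGroup.rowsBetween(Window.unboundedPreceding, Window.currentRow)).as("running_sum")
    ).show()

    spark.stop()
  }
}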
Example 1
Source File: HivePlanTest.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 2
Source File: FlintTestData.scala From flint with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.sql.functions.{ udf, sum }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.percent_rank

trait FlintTestData {
  protected def sqlContext: SQLContext

  private object internalImplicits extends SQLImplicits {
    override protected def _sqlContext: SQLContext = sqlContext
  }

  import internalImplicits._
  import FlintTestData._

  protected lazy val testData: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 97).map(i => TestData(i.toLong, i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testData2: DataFrame = {
    val df = sqlContext.sparkContext.parallelize(
      (0 to 101).map(i => TestData2(i.toLong, i.toDouble, -i.toDouble))
    ).toDF()
    df
  }

  protected lazy val testDataCached: DataFrame = {
    val df = DFConverter.newDataFrame(testData)
    df.cache
    df.count
    df
  }

  protected val withTime2Column = { df: DataFrame => df.withColumn("time2", df("time") * 2) }

  protected val withTime3ColumnUdf = { df: DataFrame =>
    val testUdf = udf({ time: Long => time * 2 })
    df.withColumn("time3", testUdf(df("time")))
  }

  protected val selectV = { df: DataFrame => df.select("v") }
  protected val selectExprVPlusOne = { df: DataFrame => df.selectExpr("v + 1 as v") }
  protected val filterV = { df: DataFrame => df.filter(df("v") > 0) }

  protected val orderByTime = { df: DataFrame => df.orderBy("time") }
  protected val orderByV = { df: DataFrame => df.orderBy("v") }

  protected val addRankColumn = { df: DataFrame =>
    df.withColumn("rank", percent_rank().over(Window.partitionBy("time").orderBy("v")))
  }

  protected val selectSumV = { df: DataFrame => df.select(sum("v")) }
  protected val selectExprSumV = { df: DataFrame => df.selectExpr("sum(v)") }
  protected val groupByTimeSumV = { df: DataFrame => df.groupBy("time").agg(sum("v").alias("v")) }

  protected val repartition = { df: DataFrame => df.repartition(10) }
  protected val coalesce = { df: DataFrame => df.coalesce(5) }

  protected val cache = { df: DataFrame => df.cache(); df.count(); df }
  protected val unpersist = { df: DataFrame => df.unpersist() }
}

object FlintTestData {
  case class TestData(time: Long, v: Double)
  case class TestData2(time: Long, v: Double, v2: Double)
}
Example 3
Source File: HivePlanTest.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 4
Source File: HivePlanTest.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 5
Source File: DeltaLoad.scala From m3d-engine with Apache License 2.0
package com.adidas.analytics.algo

import com.adidas.analytics.algo.DeltaLoad._
import com.adidas.analytics.algo.core.Algorithm
import com.adidas.analytics.algo.shared.DateComponentDerivation
import com.adidas.analytics.config.DeltaLoadConfiguration.PartitionedDeltaLoadConfiguration
import com.adidas.analytics.util.DataFrameUtils._
import com.adidas.analytics.util._
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.{DataFrame, Dataset, Row, SparkSession}
import org.apache.spark.storage.StorageLevel
import org.slf4j.{Logger, LoggerFactory}

  private def getUpsertRecords(deltaRecords: Dataset[Row], resultColumns: Seq[String]): Dataset[Row] = {
    // Create partition window - Partitioning by delta records logical key (i.e. technical key of active records)
    val partitionWindow = Window
      .partitionBy(businessKey.map(col): _*)
      .orderBy(technicalKey.map(component => col(component).desc): _*)

    // Ranking & projection
    val rankedDeltaRecords = deltaRecords
      .withColumn(rankingColumnName, row_number().over(partitionWindow))
      .filter(upsertRecordsModesFilterFunction)

    rankedDeltaRecords
      .filter(rankedDeltaRecords(rankingColumnName) === 1)
      .selectExpr(resultColumns: _*)
  }

  protected def withDatePartitions(spark: SparkSession, dfs: DFSWrapper, dataFrames: Vector[DataFrame]): Vector[DataFrame] = {
    logger.info("Adding partitioning information if needed")
    try {
      dataFrames.map { df =>
        if (df.columns.toSeq.intersect(targetPartitions) != targetPartitions) {
          df.transform(withDateComponents(partitionSourceColumn, partitionSourceColumnFormat, targetPartitions))
        } else df
      }
    } catch {
      case e: Throwable =>
        logger.error("Cannot add partitioning information for data frames.", e)
        //TODO: Handle failure case properly
        throw new RuntimeException("Unable to transform data frames.", e)
    }
  }
}

object DeltaLoad {

  private val logger: Logger = LoggerFactory.getLogger(getClass)

  def apply(spark: SparkSession, dfs: DFSWrapper, configLocation: String): DeltaLoad = {
    new DeltaLoad(spark, dfs, configLocation)
  }
}
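The DeltaLoad excerpt above is not self-contained: its surrounding class declaration is omitted and it refers to fields defined elsewhere in the project (businessKey, technicalKey, rankingColumnName, targetPartitions). As a hedged, standalone sketch of the same idea (keep only the newest record per business key by ranking with row_number over a descending window), the helper below uses invented names (LatestRecordPerKey, latestPerKey) and is not part of the m3d-engine project.

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{col, row_number}

object LatestRecordPerKey {
  // Keep only the newest record for each business key; "newest" is defined by a descending sort column.
  def latestPerKey(df: DataFrame, keyCols: Seq[String], orderCol: String): DataFrame = {
    val w = Window
      .partitionBy(keyCols.map(col): _*)
      .orderBy(col(orderCol).desc)
    df.withColumn("row_rank", row_number().over(w))
      .filter(col("row_rank") === 1)
      .drop("row_rank")
  }
}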
Example 6
Source File: HivePlanTest.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 7
Source File: HivePlanTest.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHive

class HivePlanTest extends QueryTest {
  import TestHive._
  import TestHive.implicits._

  // UDF constant folding
  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")
    val optimized = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  // Window expressions sharing the same partition by and order by clause
  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 8
Source File: HivePlanTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions._
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import spark.sql
  import spark.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").createOrReplaceTempView("t")
    val optimized = sql("SELECT cos(null) AS c FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) AS c FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 9
Source File: MimirSparkRuntimeUtils.scala From mimir with Apache License 2.0
package mimir.exec.spark

import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.types.{ DataType, LongType }
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{ spark_partition_id, monotonically_increasing_id, count, sum, first, lit, col }

object MimirSparkRuntimeUtils {

  def zipWithIndex(df: DataFrame, offset: Long = 1, indexName: String = "ROWIDX", indexType: DataType = LongType): DataFrame = {
    val dfWithPartitionId = df.withColumn("partition_id", spark_partition_id()).withColumn("inc_id", monotonically_increasing_id())

    val partitionOffsets = dfWithPartitionId
      .groupBy("partition_id")
      .agg(count(lit(1)) as "cnt", first("inc_id") as "inc_id")
      .orderBy("partition_id")
      .select(col("partition_id"), sum("cnt").over(Window.orderBy("partition_id")) - col("cnt") - col("inc_id") + lit(offset) as "cnt")
      .collect()
      .map(row => (row.getInt(0), row.getLong(1)))
      .toMap

    val theUdf = org.apache.spark.sql.functions.udf(
      (partitionId: Int) => partitionOffsets(partitionId),
      LongType
    )

    dfWithPartitionId
      .withColumn("partition_offset", theUdf(col("partition_id")))
      .withColumn(indexName, (col("partition_offset") + col("inc_id")).cast(indexType))
      .drop("partition_id", "partition_offset", "inc_id")
  }

  def writeDataSink(dataframe: DataFrame, format: String, options: Map[String, String], save: Option[String]) = {
    val dsFormat = dataframe.write.format(format)
    val dsOptions = options.toSeq.foldLeft(dsFormat)((ds, opt) => opt._1 match {
      case "mode" => ds.mode(opt._2)
      case _ => ds.option(opt._1, opt._2)
    })
    save match {
      case None => dsOptions.save
      case Some(outputFile) => {
        if (format.equals("com.github.potix2.spark.google.spreadsheets")) {
          val gsldfparts = outputFile.split("\\/")
          val gsldf = s"${gsldfparts(gsldfparts.length-2)}/${gsldfparts(gsldfparts.length-1)}"
          dsOptions.save(gsldf)
        } else {
          dsOptions.save(outputFile)
        }
      }
    }
  }
}
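For context, a hypothetical driver for the zipWithIndex helper above might look like the following; the ZipWithIndexDemo object, the sample data, and the row_idx column name are invented for illustration.

import mimir.exec.spark.MimirSparkRuntimeUtils
import org.apache.spark.sql.SparkSession

object ZipWithIndexDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[*]").appName("zip-with-index-demo").getOrCreate()
    import spark.implicits._

    // Hypothetical input data
    val people = Seq("alice", "bob", "carol").toDF("name")

    // Attach a stable 0-based row index using the helper from the example above
    val indexed = MimirSparkRuntimeUtils.zipWithIndex(people, offset = 0, indexName = "row_idx")
    indexed.show()

    spark.stop()
  }
}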
Example 10
Source File: HivePlanTest.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.hive.execution

import org.apache.spark.sql.functions._
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.catalyst.plans.logical
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.hive.test.TestHiveSingleton

class HivePlanTest extends QueryTest with TestHiveSingleton {
  import hiveContext.sql
  import hiveContext.implicits._

  test("udf constant folding") {
    Seq.empty[Tuple1[Int]].toDF("a").registerTempTable("t")
    val optimized = sql("SELECT cos(null) FROM t").queryExecution.optimizedPlan
    val correctAnswer = sql("SELECT cast(null as double) FROM t").queryExecution.optimizedPlan
    comparePlans(optimized, correctAnswer)
  }

  test("window expressions sharing the same partition by and order by clause") {
    val df = Seq.empty[(Int, String, Int, Int)].toDF("id", "grp", "seq", "val")
    val window = Window.
      partitionBy($"grp").
      orderBy($"val")
    val query = df.select(
      $"id",
      sum($"val").over(window.rowsBetween(-1, 1)),
      sum($"val").over(window.rangeBetween(-1, 1))
    )
    val plan = query.queryExecution.analyzed
    assert(plan.collect { case w: logical.Window => w }.size === 1,
      "Should have only 1 Window operator.")
  }
}
Example 11
Source File: OilPriceFunc.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.geomesa

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{udf, window, last, col, lag}

object OilPriceFunc {

  // use this if the window function misbehaves due to timezone e.g. BST
  // ./spark-shell --driver-java-options "-Duser.timezone=UTC"
  // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC'

  // define a function to reformat the date field
  def convert(date: String): String = {
    val df1 = new SimpleDateFormat("dd/MM/yyyy")
    val dt = df1.parse(date)
    val df2 = new SimpleDateFormat("yyyy-MM-dd")
    df2.format(dt)
  }

  // create and save oil price changes
  def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = {

    val oilPriceDF = spark.
      read.
      option("header", "true").
      option("inferSchema", "true").
      csv(inputfile)

    val convertDateUDF = udf { (Date: String) => convert(Date) }

    val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE")))

    // offset to start at beginning of week
    val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days"))

    val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window")
    // windowLastDF.show(20, false)

    val sortedWindow = Window.orderBy("window.start")

    val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow)
    val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol)
    // lagLastColDF.show(20, false)

    val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) =>
      var change = ((last - prevLast) compare 0).signum
      if (change == -1)
        change = 0
      change.toDouble
    }

    val findDateTwoDaysAgoUDF = udf { (date: String) =>
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val cal = Calendar.getInstance
      cal.setTime(dateFormat.parse(date))
      cal.add(Calendar.DATE, -3)
      dateFormat.format(cal.getTime)
    }

    val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc(
      lagLastColDF("last(PRICE)"),
      lagLastColDF("lastPrev(PRICE)")
    )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end")))
    // oilPriceChangeDF.show(20, false)

    oilPriceChangeDF.select("label", "commonFriday").
      write.
      format("com.databricks.spark.csv").
      option("header", "true").
      //.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
      save(outputfile)
  }
}
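A hypothetical driver for createOilPriceDF might look like the sketch below; the OilPriceDriver object and both file paths are placeholders, and the input CSV is assumed to have DATE (dd/MM/yyyy) and PRICE columns, as implied by the code above.

import io.gzet.geomesa.OilPriceFunc
import org.apache.spark.sql.SparkSession

object OilPriceDriver {
  def main(args: Array[String]): Unit = {
    // Run with UTC if the window function misbehaves due to timezone, as noted in the comments above, e.g.
    // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC' ...
    val spark = SparkSession.builder().appName("oil-price-func").getOrCreate()

    // Placeholder paths: an input CSV with DATE and PRICE columns, and an output location
    OilPriceFunc.createOilPriceDF("data/oil_prices.csv", "data/oil_price_changes", spark)

    spark.stop()
  }
}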