org.apache.spark.sql.functions.window Scala Examples
The following examples show how to use org.apache.spark.sql.functions.window.
Example 1
Source File: DNSstat.scala From jdbcsink with Apache License 2.0 | 6 votes |
import java.sql.{Connection, DriverManager, PreparedStatement, Statement}
import java.util.Properties

import org.apache.spark.sql.{ForeachWriter, Row, SparkSession}
import org.apache.spark.sql.functions.{from_json, window}
import org.apache.spark.sql.types._

/**
 * Streaming sink that writes every result row into the MySQL table `DNSStat`
 * with a `replace into` statement.
 *
 * Rows are read positionally: column 0 and 1 as strings (ip, domain),
 * column 2 as a timestamp (time) and column 3 as a long (count).
 */
class JDBCSink() extends ForeachWriter[Row] {
  val driver = "com.mysql.jdbc.Driver"
  var connection: Connection = _
  var statement: Statement = _

  // Typed view of the object stored in `statement`; needed to bind parameters.
  private var upsert: PreparedStatement = _

  /**
   * Loads the JDBC driver, opens the connection and prepares the upsert.
   *
   * @param partitionId partition being written (unused here)
   * @param version     batch/epoch identifier (unused here)
   * @return always `true`, i.e. every partition is processed
   */
  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    // NOTE(review): the JDBC URL carries no host/database in the source as received.
    connection = DriverManager.getConnection("jdbc:mysql://", "root", "mysqladmin")
    // Values are bound as parameters instead of being spliced into the SQL text,
    // so quotes inside a domain name can neither break nor alter the statement.
    upsert = connection.prepareStatement(
      "replace into DNSStat(ip,domain,time,count) values(?,?,?,?)")
    statement = upsert
    true
  }

  /**
   * Upserts one aggregated row.
   *
   * @param value row laid out as (ip, domain, time, count)
   */
  def process(value: Row): Unit = {
    upsert.setString(1, value.getString(0))       // ip
    upsert.setString(2, value.getString(1))       // domain
    upsert.setTimestamp(3, value.getTimestamp(2)) // time
    upsert.setLong(4, value.getLong(3))           // count
    upsert.executeUpdate()
  }

  /**
   * Releases the statement and then the connection; either may still be unset.
   *
   * @param errorOrNull failure that ended processing, if any (not inspected)
   */
  def close(errorOrNull: Throwable): Unit = {
    try Option(statement).foreach(_.close())
    finally Option(connection).foreach(_.close())
  }
}

/**
 * Streaming job that counts, per one-minute window, how often each source IP
 * queried a domain listed in the `DNSTab` table, and stores the counts through
 * [[JDBCSink]].
 */
object DNSstatJob {

  /** Schema of the JSON records carried in the Kafka message value. */
  val schema: StructType = StructType(Seq(
    StructField("Vendor", StringType, true),
    StructField("Id", IntegerType, true),
    StructField("Time", LongType, true),
    StructField("Conn", StructType(Seq(
      StructField("Proto", IntegerType, true),
      StructField("Sport", IntegerType, true),
      StructField("Dport", IntegerType, true),
      StructField("Sip", StringType, true),
      StructField("Dip", StringType, true)
    )), true),
    StructField("Dns", StructType(Seq(
      StructField("Domain", StringType, true),
      StructField("IpCount", IntegerType, true),
      StructField("Ip", StringType, true)
    )), true)
  ))

  /**
   * Builds and runs the streaming query; blocks until the query terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("DNSJob")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    import spark.implicits._

    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "mysqladmin")

    // NOTE(review): the receiver of `.jdbc(...)` was missing in the source as
    // received; `spark.read` is the DataFrameReader available here — confirm.
    val bruteForceTab = spark.read.jdbc("jdbc:mysql://", "DNSTab", connectionProperties)
    bruteForceTab.createOrReplaceTempView("DNSTab")

    // NOTE(review): the bootstrap server list is empty in the source as received.
    val lines = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "")
      .option("subscribe", "xdr")
      //.option("startingOffsets","earliest")
      .option("startingOffsets", "latest")
      .load()
      .select(from_json($"value".cast(StringType), schema).as("jsonData"))
    lines.createOrReplaceTempView("xdr")

    // Keep only records whose domain appears in DNSTab. `Time` is divided by
    // 1000000 before from_unixtime — presumably it is in microseconds; verify.
    val filterDNS = spark.sql("select CAST(from_unixtime(xdr.jsonData.Time DIV 1000000) as timestamp) as time,xdr.jsonData.Conn.Sip as sip, xdr.jsonData.Dns.Domain from xdr inner join DNSTab on xdr.jsonData.Dns.domain = DNSTab.domain")

    // Window length equals the slide, so the one-minute windows do not overlap.
    // The selected column order is the positional layout JDBCSink.process reads.
    val windowedCounts = filterDNS
      .withWatermark("time", "5 minutes")
      .groupBy(window($"time", "1 minutes", "1 minutes"), $"sip", $"domain")
      .count()
      .select($"sip", $"domain", $"window.start", $"count")

    val writer = new JDBCSink()
    val query = windowedCounts
      .writeStream
      .foreach(writer)
      .outputMode("update")
      .option("checkpointLocation", "/checkpoint/")
      .start()

    query.awaitTermination()
  }
}
Example 2
Source File: MicroBatchExecutionSuite.scala From XSQL with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.StreamTest

/**
 * Regression suite for SPARK-24156: restarting a query whose last (no-data)
 * batch was planned but not committed must not plan that batch a second time.
 */
class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter {

  import testImplicits._

  // The teardown hook is registered with an empty body.
  after {
  }

  test("SPARK-24156: do not plan a no-data batch again after it has already been planned") {
    val inputData = MemoryStream[Int]

    // Count events per 5-second event-time window and expose each window by
    // its start, cast to a long.
    val windowedCounts = inputData.toDF()
      .withColumn("eventTime", $"value".cast("timestamp"))
      .withWatermark("eventTime", "10 seconds")
      .groupBy(window($"eventTime", "5 seconds").as("window"))
      .agg(count("*").as("count"))
      .select($"window".getField("start").cast("long").as[Long], $"count".as[Long])

    testStream(windowedCounts)(
      // Set watermark to 5
      AddData(inputData, 10, 11, 12, 13, 14, 15),
      CheckAnswer(),
      // Set watermark to 15 to make MicroBatchExecution run no-data batch
      AddData(inputData, 25),
      // Last batch should be a no-data batch
      CheckAnswer((10, 5)),
      StopStream,
      Execute { q =>
        // Drop the newest entry of the commit log so the last batch
        // (a no-data batch) looks as if it never completed.
        val latestBatchId = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purgeAfter(latestBatchId - 1)
      },
      // Data is added before the restart so that a new batch could be planned.
      // The engine should not do that first: it should re-run the incomplete
      // no-data batch and only then run a new batch for the new data.
      AddData(inputData, 30),
      StartStream(),
      // This should not throw the error reported in SPARK-24156
      CheckNewAnswer((15, 1)),
      StopStream,
      Execute { q =>
        // Delete the entire commit log
        val latestBatchId = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purge(latestBatchId + 1)
      },
      AddData(inputData, 50),
      StartStream(),
      // This should not throw the error reported in SPARK-24156
      CheckNewAnswer((25, 1), (30, 1))
    )
  }
}
Example 3
Source File: StreamingQueryListenerSampleJob.scala From spark-monitoring with MIT License | 5 votes |
// NOTE(review): the package clause and two import lines at the top of this
// example were blanked out in the source as received (only the bare `package`
// and `import` keywords survived, which does not parse). The body below uses
// `TryWith` and `Log4jConfiguration`, which none of the remaining imports
// provide — restore the package clause and those two imports from the
// original project before compiling.
import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.metrics.UserMetricsSystems
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}

/**
 * Sample job that emits one log message per level, bumps a user-defined
 * counter metric and starts a windowed streaming count over JSON event files.
 */
object StreamingQueryListenerSampleJob extends Logging {

  private final val METRICS_NAMESPACE = "streamingquerylistenersamplejob"
  private final val COUNTER_NAME = "rowcounter"

  /**
   * Configures logging, registers the counter and starts the streaming query.
   * The query is started but not awaited.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    // Configure our logging.
    // NOTE(review): the resource path ends at the directory; the file name
    // appears to be missing in the source as received.
    TryWith(getClass.getResourceAsStream("/com/microsoft/pnp/samplejob/")) { stream =>
      Log4jConfiguration.configure(stream)
    }

    logTrace("Trace message from StreamingQueryListenerSampleJob")
    logDebug("Debug message from StreamingQueryListenerSampleJob")
    logInfo("Info message from StreamingQueryListenerSampleJob")
    logWarning("Warning message from StreamingQueryListenerSampleJob")
    logError("Error message from StreamingQueryListenerSampleJob")

    val spark = SparkSession
      .builder
      .getOrCreate
    import spark.implicits._

    // this path has sample files provided by databricks for trying out purpose
    val inputPath = "/databricks-datasets/structured-streaming/events/"
    val jsonSchema = new StructType().add("time", TimestampType).add("action", StringType)

    val driverMetricsSystem = UserMetricsSystems
      .getMetricSystem(METRICS_NAMESPACE, builder => {
        builder.registerCounter(COUNTER_NAME)
      })
    driverMetricsSystem.counter(COUNTER_NAME).inc

    val streamingInputDF = spark
      .readStream                      // `readStream` instead of `read` for creating streaming DataFrame
      .schema(jsonSchema)              // Set the schema of the JSON data
      .option("maxFilesPerTrigger", 1) // Treat a sequence of files as a stream by picking one file at a time
      .json(inputPath)
    driverMetricsSystem.counter(COUNTER_NAME).inc(5)

    // Count events per action and one-hour window of the `time` column.
    val streamingCountsDF = streamingInputDF
      .groupBy($"action", window($"time", "1 hour"))
      .count()

    // Is this DF actually a streaming DF? (the result is not used)
    streamingCountsDF.isStreaming
    driverMetricsSystem.counter(COUNTER_NAME).inc(10)

    val query = streamingCountsDF
      .writeStream
      .format("memory")       // memory = store in-memory table (for testing only in Spark 2.0)
      .queryName("counts")    // counts = name of the in-memory table
      .outputMode("complete") // complete = all the counts should be in the table
      .start()
  }
}
Example 4
Source File: OilPriceFunc.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.geomesa

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{udf, window, last, col, lag}

/**
 * Derives a weekly up/not-up label from a CSV of dated oil prices and writes
 * the labels out as CSV.
 */
object OilPriceFunc {

  // use this if the window function misbehaves due to timezone e.g. BST
  // ./spark-shell --driver-java-options "-Duser.timezone=UTC"
  // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC'

  /**
   * Reformats a date from `dd/MM/yyyy` to `yyyy-MM-dd`.
   *
   * @param date date text in `dd/MM/yyyy` form
   * @return the same date in `yyyy-MM-dd` form
   * @throws java.text.ParseException if `date` cannot be parsed
   */
  def convert(date: String): String = {
    val parsed = new SimpleDateFormat("dd/MM/yyyy").parse(date)
    new SimpleDateFormat("yyyy-MM-dd").format(parsed)
  }

  /**
   * Creates and saves the weekly oil price changes.
   *
   * Reads `inputfile` (CSV with a header; uses its `DATE` and `PRICE`
   * columns), groups rows into 7-day windows, compares each window's last
   * price with the previous window's, and writes the columns `label` and
   * `commonFriday` to `outputfile`.
   *
   * @param inputfile  path of the source CSV
   * @param outputfile path the result is saved to
   * @param spark      session used for reading
   */
  def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession): Unit = {
    val oilPriceDF = spark
      .read
      .option("header", "true")
      .option("inferSchema", "true")
      .csv(inputfile)

    val convertDateUDF = udf { (Date: String) => convert(Date) }
    val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE")))

    // offset to start at beginning of week
    val windowDF = oilPriceDatedDF.groupBy(
      window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days"))
    val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window")

    // Attach the previous window's last price to each window.
    val sortedWindow = Window.orderBy("window.start")
    val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow)
    val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol)

    // 1.0 when the price rose against the previous window; 0.0 when it was
    // unchanged or fell (the sign -1 is folded into 0).
    val simplePriceChangeFunc = udf { (current: Double, previous: Double) =>
      val direction = ((current - previous) compare 0).signum
      if (direction == -1) 0.0 else direction.toDouble
    }

    // Steps back three calendar days from the given `yyyy-MM-dd` date.
    val threeDaysEarlierUDF = udf { (date: String) =>
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val cal = Calendar.getInstance
      cal.setTime(dateFormat.parse(date))
      cal.add(Calendar.DATE, -3)
      dateFormat.format(cal.getTime)
    }

    val oilPriceChangeDF = lagLastColDF
      .withColumn("label", simplePriceChangeFunc(
        lagLastColDF("last(PRICE)"),
        lagLastColDF("lastPrev(PRICE)")
      ))
      .withColumn("commonFriday", threeDaysEarlierUDF(lagLastColDF("window.end")))

    // NOTE(review): the receiver of the write chain was garbled in the source
    // as received (only `"label", "commonFriday").` survived). Selecting those
    // two columns from oilPriceChangeDF is the reading consistent with the
    // surviving tokens — confirm against the original project.
    oilPriceChangeDF.select("label", "commonFriday")
      .write
      .format("com.databricks.spark.csv")
      .option("header", "true")
      //.option("codec", "")
      .save(outputfile)
  }
}
Example 5
Source File: KafkaStructuredStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0 | 5 votes |
package com.aliyun.odps.spark.examples.structuredStreaming.kafka

import java.sql.Timestamp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window

/**
 * Reads text messages from Kafka and prints word counts per sliding
 * event-time window (6 seconds long, sliding every 3 seconds) to the console.
 */
object KafkaStructuredStreamingDemo {

  /**
   * Builds and runs the streaming query; blocks until the query terminates.
   *
   * @param args command-line arguments (unused)
   */
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("KafkaStreamingDemo")
      .getOrCreate()
    import spark.implicits._

    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "topic")
      .load()

    // NOTE(review): `wordsWithTimestamp` was referenced but never defined in
    // the source as received, while `df` and the Timestamp import were unused.
    // The step below is a reconstruction: split each message value on spaces
    // and pair every word with the record's Kafka `timestamp` column —
    // confirm against the original project.
    val wordsWithTimestamp = df
      .selectExpr("CAST(value AS STRING)", "timestamp")
      .as[(String, Timestamp)]
      .flatMap { case (line, ts) => line.split(" ").map(word => (word, ts)) }
      .toDF("word", "timestamp")

    // Use OSS as the checkpoint storage
    val checkpointLocation3 = "oss://bucket/checkpoint3/"

    val windowedCountsWithWatermark = wordsWithTimestamp
      .withWatermark("timestamp", "5 seconds")
      .groupBy(
        window($"timestamp", "6 seconds", "3 seconds"),
        $"word"
      )
      .count()

    val query3 = windowedCountsWithWatermark.writeStream
      .outputMode("append")
      .format("console")
      .option("checkpointLocation", checkpointLocation3)
      .start()

    query3.awaitTermination()
  }
}