org.apache.spark.sql.functions.window Scala Examples
The following examples show how to use org.apache.spark.sql.functions.window.
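window buckets a timestamp column into fixed-length (optionally sliding) time windows so the result can be used as a grouping key, typically together with withWatermark in Structured Streaming. As a quick orientation, here is a minimal sketch of a tumbling-window count over Spark's built-in rate source; the object name, the 1-minute window, the 5-minute watermark, and the console sink are illustrative assumptions rather than code taken from any of the projects below.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window

object WindowedCountSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("WindowedCountSketch").getOrCreate()
    import spark.implicits._

    // The built-in "rate" source emits (timestamp, value) rows and is handy for experiments.
    val events = spark.readStream
      .format("rate")
      .option("rowsPerSecond", "10")
      .load()

    // Tumbling 1-minute windows; pass a slide duration as a third argument for sliding windows.
    val counts = events
      .withWatermark("timestamp", "5 minutes")      // tolerate up to 5 minutes of late data
      .groupBy(window($"timestamp", "1 minute"))    // window(timeColumn, windowDuration)
      .count()
      .select($"window.start", $"window.end", $"count")

    counts.writeStream
      .outputMode("update")
      .format("console")
      .start()
      .awaitTermination()
  }
}

The project examples that follow use the same window call in fuller pipelines: joins against JDBC tables with a custom ForeachWriter sink, streaming test harnesses, metrics reporting, and sliding windows with a start-time offset.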
Example 1
Source File: DNSstat.scala From jdbcsink with Apache License 2.0
import org.apache.spark.sql.SparkSession
import java.util.Properties
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{from_json, window}
import java.sql.{Connection, Statement, DriverManager}
import org.apache.spark.sql.ForeachWriter
import org.apache.spark.sql.Row

class JDBCSink() extends ForeachWriter[Row] {
  val driver = "com.mysql.jdbc.Driver"
  var connection: Connection = _
  var statement: Statement = _

  def open(partitionId: Long, version: Long): Boolean = {
    Class.forName(driver)
    connection = DriverManager.getConnection("jdbc:mysql://10.88.1.102:3306/aptwebservice", "root", "mysqladmin")
    statement = connection.createStatement
    true
  }

  def process(value: Row): Unit = {
    statement.executeUpdate("replace into DNSStat(ip,domain,time,count) values("
      + "'" + value.getString(0) + "'" + ","     // ip
      + "'" + value.getString(1) + "'" + ","     // domain
      + "'" + value.getTimestamp(2) + "'" + ","  // time
      + value.getLong(3)                         // count
      + ")")
  }

  def close(errorOrNull: Throwable): Unit = {
    connection.close
  }
}

object DNSstatJob {

  val schema: StructType = StructType(
    Seq(StructField("Vendor", StringType, true),
      StructField("Id", IntegerType, true),
      StructField("Time", LongType, true),
      StructField("Conn", StructType(Seq(
        StructField("Proto", IntegerType, true),
        StructField("Sport", IntegerType, true),
        StructField("Dport", IntegerType, true),
        StructField("Sip", StringType, true),
        StructField("Dip", StringType, true)
      )), true),
      StructField("Dns", StructType(Seq(
        StructField("Domain", StringType, true),
        StructField("IpCount", IntegerType, true),
        StructField("Ip", StringType, true)
      )), true)))

  def main(args: Array[String]) {
    val spark = SparkSession
      .builder
      .appName("DNSJob")
      .config("spark.some.config.option", "some-value")
      .getOrCreate()
    import spark.implicits._

    val connectionProperties = new Properties()
    connectionProperties.put("user", "root")
    connectionProperties.put("password", "mysqladmin")
    val bruteForceTab = spark.read
      .jdbc("jdbc:mysql://10.88.1.102:3306/aptwebservice", "DNSTab", connectionProperties)
    bruteForceTab.registerTempTable("DNSTab")

    val lines = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "10.94.1.110:9092")
      .option("subscribe", "xdr")
      //.option("startingOffsets", "earliest")
      .option("startingOffsets", "latest")
      .load()
      .select(from_json($"value".cast(StringType), schema).as("jsonData"))
    lines.registerTempTable("xdr")

    val filterDNS = spark.sql("select CAST(from_unixtime(xdr.jsonData.Time DIV 1000000) as timestamp) as time,xdr.jsonData.Conn.Sip as sip, xdr.jsonData.Dns.Domain from xdr inner join DNSTab on xdr.jsonData.Dns.domain = DNSTab.domain")

    val windowedCounts = filterDNS
      .withWatermark("time", "5 minutes")
      .groupBy(window($"time", "1 minutes", "1 minutes"), $"sip", $"domain")
      .count()
      .select($"sip", $"domain", $"window.start", $"count")

    val writer = new JDBCSink()
    val query = windowedCounts
      .writeStream
      .foreach(writer)
      .outputMode("update")
      .option("checkpointLocation", "/checkpoint/")
      .start()
    query.awaitTermination()
  }
}
Example 2
Source File: MicroBatchExecutionSuite.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.functions.{count, window}
import org.apache.spark.sql.streaming.StreamTest

class MicroBatchExecutionSuite extends StreamTest with BeforeAndAfter {

  import testImplicits._

  after {
    sqlContext.streams.active.foreach(_.stop())
  }

  test("SPARK-24156: do not plan a no-data batch again after it has already been planned") {
    val inputData = MemoryStream[Int]
    val df = inputData.toDF()
      .withColumn("eventTime", $"value".cast("timestamp"))
      .withWatermark("eventTime", "10 seconds")
      .groupBy(window($"eventTime", "5 seconds") as 'window)
      .agg(count("*") as 'count)
      .select($"window".getField("start").cast("long").as[Long], $"count".as[Long])

    testStream(df)(
      AddData(inputData, 10, 11, 12, 13, 14, 15), // Set watermark to 5
      CheckAnswer(),

      AddData(inputData, 25), // Set watermark to 15 to make MicroBatchExecution run no-data batch
      CheckAnswer((10, 5)),   // Last batch should be a no-data batch

      StopStream,
      Execute { q =>
        // Delete the last committed batch from the commit log to signify that the last batch
        // (a no-data batch) never completed
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purgeAfter(commit - 1)
      },

      // Add data before start so that MicroBatchExecution can plan a batch. It should not,
      // it should first re-run the incomplete no-data batch and then run a new batch to process
      // new data.
      AddData(inputData, 30),
      StartStream(),
      CheckNewAnswer((15, 1)), // This should not throw the error reported in SPARK-24156

      StopStream,
      Execute { q =>
        // Delete the entire commit log
        val commit = q.commitLog.getLatest().map(_._1).getOrElse(-1L)
        q.commitLog.purge(commit + 1)
      },
      AddData(inputData, 50),
      StartStream(),
      CheckNewAnswer((25, 1), (30, 1)) // This should not throw the error reported in SPARK-24156
    )
  }
}
Example 3
Source File: StreamingQueryListenerSampleJob.scala From spark-monitoring with MIT License
package com.microsoft.pnp.samplejob

import com.microsoft.pnp.logging.Log4jConfiguration
import com.microsoft.pnp.util.TryWith
import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.metrics.UserMetricsSystems
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window
import org.apache.spark.sql.types.{StringType, StructType, TimestampType}

object StreamingQueryListenerSampleJob extends Logging {

  private final val METRICS_NAMESPACE = "streamingquerylistenersamplejob"
  private final val COUNTER_NAME = "rowcounter"

  def main(args: Array[String]): Unit = {

    // Configure our logging
    TryWith(getClass.getResourceAsStream("/com/microsoft/pnp/samplejob/log4j.properties")) {
      stream => {
        Log4jConfiguration.configure(stream)
      }
    }

    logTrace("Trace message from StreamingQueryListenerSampleJob")
    logDebug("Debug message from StreamingQueryListenerSampleJob")
    logInfo("Info message from StreamingQueryListenerSampleJob")
    logWarning("Warning message from StreamingQueryListenerSampleJob")
    logError("Error message from StreamingQueryListenerSampleJob")

    val spark = SparkSession
      .builder
      .getOrCreate

    import spark.implicits._

    // this path has sample files provided by databricks for trying out purpose
    val inputPath = "/databricks-datasets/structured-streaming/events/"
    val jsonSchema = new StructType().add("time", TimestampType).add("action", StringType)

    val driverMetricsSystem = UserMetricsSystems
      .getMetricSystem(METRICS_NAMESPACE, builder => {
        builder.registerCounter(COUNTER_NAME)
      })

    driverMetricsSystem.counter(COUNTER_NAME).inc

    // Similar to definition of staticInputDF above, just using `readStream` instead of `read`
    val streamingInputDF = spark
      .readStream                       // `readStream` instead of `read` for creating streaming DataFrame
      .schema(jsonSchema)               // Set the schema of the JSON data
      .option("maxFilesPerTrigger", 1)  // Treat a sequence of files as a stream by picking one file at a time
      .json(inputPath)

    driverMetricsSystem.counter(COUNTER_NAME).inc(5)

    val streamingCountsDF = streamingInputDF
      .groupBy($"action", window($"time", "1 hour"))
      .count()

    // Is this DF actually a streaming DF?
    streamingCountsDF.isStreaming

    driverMetricsSystem.counter(COUNTER_NAME).inc(10)

    val query = streamingCountsDF
      .writeStream
      .format("memory")       // memory = store in-memory table (for testing only in Spark 2.0)
      .queryName("counts")    // counts = name of the in-memory table
      .outputMode("complete") // complete = all the counts should be in the table
      .start()
  }
}
Example 4
Source File: OilPriceFunc.scala From Mastering-Spark-for-Data-Science with MIT License
package io.gzet.geomesa

import java.text.SimpleDateFormat
import java.util.Calendar

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.expressions.Window
import org.apache.spark.sql.functions.{udf, window, last, col, lag}

object OilPriceFunc {

  // use this if the window function misbehaves due to timezone e.g. BST
  // ./spark-shell --driver-java-options "-Duser.timezone=UTC"
  // ./spark-submit --conf 'spark.driver.extraJavaOptions=-Duser.timezone=UTC'

  // define a function to reformat the date field
  def convert(date: String): String = {
    val df1 = new SimpleDateFormat("dd/MM/yyyy")
    val dt = df1.parse(date)
    val df2 = new SimpleDateFormat("yyyy-MM-dd")
    df2.format(dt)
  }

  // create and save oil price changes
  def createOilPriceDF(inputfile: String, outputfile: String, spark: SparkSession) = {

    val oilPriceDF = spark.
      read.
      option("header", "true").
      option("inferSchema", "true").
      csv(inputfile)

    val convertDateUDF = udf { (Date: String) => convert(Date) }

    val oilPriceDatedDF = oilPriceDF.withColumn("DATE", convertDateUDF(oilPriceDF("DATE")))

    // offset to start at beginning of week
    val windowDF = oilPriceDatedDF.groupBy(window(oilPriceDatedDF.col("DATE"), "7 days", "7 days", "4 days"))

    val windowLastDF = windowDF.agg(last("PRICE") as "last(PRICE)").sort("window")
    // windowLastDF.show(20, false)

    val sortedWindow = Window.orderBy("window.start")

    val lagLastCol = lag(col("last(PRICE)"), 1).over(sortedWindow)
    val lagLastColDF = windowLastDF.withColumn("lastPrev(PRICE)", lagLastCol)
    // lagLastColDF.show(20, false)

    val simplePriceChangeFunc = udf { (last: Double, prevLast: Double) =>
      var change = ((last - prevLast) compare 0).signum
      if (change == -1)
        change = 0
      change.toDouble
    }

    val findDateTwoDaysAgoUDF = udf { (date: String) =>
      val dateFormat = new SimpleDateFormat("yyyy-MM-dd")
      val cal = Calendar.getInstance
      cal.setTime(dateFormat.parse(date))
      cal.add(Calendar.DATE, -3)
      dateFormat.format(cal.getTime)
    }

    val oilPriceChangeDF = lagLastColDF.withColumn("label", simplePriceChangeFunc(
      lagLastColDF("last(PRICE)"),
      lagLastColDF("lastPrev(PRICE)")
    )).withColumn("commonFriday", findDateTwoDaysAgoUDF(lagLastColDF("window.end")))
    // oilPriceChangeDF.show(20, false)

    oilPriceChangeDF.select("label", "commonFriday").
      write.
      format("com.databricks.spark.csv").
      option("header", "true").
      //.option("codec", "org.apache.hadoop.io.compress.GzipCodec")
      save(outputfile)
  }
}
Example 5
Source File: KafkaStructuredStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0
package com.aliyun.odps.spark.examples.structuredStreaming.kafka

import java.sql.Timestamp

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.window

object KafkaStructuredStreamingDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("KafkaStreamingDemo")
      .getOrCreate()

    import spark.implicits._

    val df = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", "topic")
      .load()

    // Split the Kafka value into words, keeping the Kafka record timestamp as the event time.
    // NOTE: the original listing references wordsWithTimestamp without showing its definition;
    // this derivation is a minimal assumption to make the example self-contained.
    val wordsWithTimestamp = df
      .selectExpr("CAST(value AS STRING) AS line", "timestamp")
      .as[(String, Timestamp)]
      .flatMap { case (line, timestamp) => line.split(" ").map(word => (word, timestamp)) }
      .toDF("word", "timestamp")

    // Use OSS as the checkpoint storage
    val checkpointLocation3 = "oss://bucket/checkpoint3/"

    val windowedCountsWithWatermark = wordsWithTimestamp
      .withWatermark("timestamp", "5 seconds")
      .groupBy(
        window($"timestamp", "6 seconds", "3 seconds"),
        $"word"
      ).count()

    val query3 = windowedCountsWithWatermark.writeStream
      .outputMode("append")
      .format("console")
      .option("checkpointLocation", checkpointLocation3)
      .start()

    query3.awaitTermination()
  }
}