org.apache.spark.sql.functions.expr Scala Examples
The following examples show how to use org.apache.spark.sql.functions.expr.
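Before the full examples, here is a minimal, self-contained sketch of what expr does: it parses a SQL expression string into a Column, so the result can be used anywhere a Column is expected. This snippet is not from any of the projects below; the DataFrame and column names are invented for illustration.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.expr

object ExprQuickStart extends App {
  val spark = SparkSession.builder().master("local[*]").appName("expr-demo").getOrCreate()
  import spark.implicits._

  // A small in-memory DataFrame (hypothetical data, for illustration only)
  val df = Seq(("a", 10), ("b", 25)).toDF("name", "amount")

  // expr parses a SQL fragment into a Column
  df.select(expr("amount * 2").as("doubled")).show()

  // The same mechanism works for filter predicates
  df.filter(expr("amount > 15")).show()
}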
Example 1
Source File: StreamStreamOuterJoiner.scala (from structured-streaming-application, Apache License 2.0)
package knolx.spark

import knolx.Config._
import knolx.KnolXLogger
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.functions.{col, expr, from_json}
import org.apache.spark.sql.types.StructType

object StreamStreamOuterJoiner extends App with KnolXLogger {
  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Streaming companies Dataframe")
  val companiesDF = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", companiesTopic)
    .load()
    .select(col("value").cast("string").as("companyName"),
      col("timestamp").as("companyTradingTime"))
    .withWatermark("companyTradingTime", "10 seconds")

  companiesDF.writeStream.format("console").option("truncate", false).start()

  info("Original Streaming Dataframe")
  val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType]
  val stockStreamDF = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", stocksTopic)
    .load()
    .select(from_json(col("value").cast("string"), schema).as("value"),
      col("timestamp").as("stockInputTime"))
    .select("value.*", "stockInputTime")
    .withWatermark("stockInputTime", "10 seconds")

  info("Filtered Streaming Dataframe")
  val filteredStockStreamDF = stockStreamDF.join(
    companiesDF,
    expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds"),
    joinType = "leftOuter")

  val filteredStockStreamingQuery =
    filteredStockStreamDF.writeStream.format("console").option("truncate", false).start()

  info("Waiting for the query to terminate...")
  filteredStockStreamingQuery.awaitTermination()
  filteredStockStreamingQuery.stop()
}
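The SQL string passed to expr here does double duty: it states the equi-join key and the time-range bound that lets Spark discard old state. The same condition can be built with the Column API instead of one SQL string; the sketch below assumes the same two DataFrames and column names as the example above.

import org.apache.spark.sql.functions.{col, expr}

// Equivalent join condition built from Columns instead of a single SQL string
val joinCondition =
  col("companyName") === col("stockName") &&
  col("stockInputTime") >= col("companyTradingTime") &&
  col("stockInputTime") <= col("companyTradingTime") + expr("interval 20 seconds")

val joinedDF = stockStreamDF.join(companiesDF, joinCondition, joinType = "leftOuter")

Either form produces the same analyzed plan; expr is convenient for interval literals and for conditions kept in configuration as strings.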
Example 2
Source File: StreamStreamJoiner.scala (from structured-streaming-application, Apache License 2.0)
package knolx.spark

import knolx.Config._
import knolx.KnolXLogger
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.catalyst.ScalaReflection
import org.apache.spark.sql.functions.{col, expr, from_json}
import org.apache.spark.sql.types.StructType

object StreamStreamJoiner extends App with KnolXLogger {
  info("Creating Spark Session")
  val spark = SparkSession.builder().master(sparkMaster).appName(sparkAppName).getOrCreate()
  spark.sparkContext.setLogLevel("WARN")

  info("Streaming companies Dataframe")
  val companiesDF = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", companiesTopic)
    .load()
    .select(col("value").cast("string").as("companyName"),
      col("timestamp").as("companyTradingTime"))

  companiesDF.writeStream.format("console").option("truncate", false).start()

  info("Original Streaming Dataframe")
  val schema = ScalaReflection.schemaFor[Stock].dataType.asInstanceOf[StructType]
  val stockStreamDF = spark
    .readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", bootstrapServer)
    .option("subscribe", stocksTopic)
    .load()
    .select(from_json(col("value").cast("string"), schema).as("value"),
      col("timestamp").as("stockInputTime"))
    .select("value.*", "stockInputTime")

  info("Filtered Streaming Dataframe")
  val filteredStockStreamDF = stockStreamDF.join(
    companiesDF,
    expr("companyName = stockName AND stockInputTime >= companyTradingTime AND stockInputTime <= companyTradingTime + interval 20 seconds"))

  val filteredStockStreamingQuery =
    filteredStockStreamDF.writeStream.format("console").option("truncate", false).start()

  info("Waiting for the query to terminate...")
  filteredStockStreamingQuery.awaitTermination()
  filteredStockStreamingQuery.stop()
}
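Note that unlike Example 1, this inner join declares no watermarks, so Spark must retain all past rows from both streams as state; the watermarked variant above is what bounds that state. Both examples also derive their Kafka value schema from a Stock case class defined elsewhere in the source project. Its real fields are not shown here, but a hypothetical minimal definition that would satisfy ScalaReflection.schemaFor[Stock] looks like this:

// Hypothetical shape only; the real fields live in the
// structured-streaming-application project
case class Stock(stockName: String, price: Double)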
Example 3
Source File: Example3_7.scala (from LearningSparkV2, Apache License 2.0)
package main.scala.chapter3

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types._
import org.apache.spark.sql.functions.{col, expr}

object Example3_7 {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder
      .appName("Example-3_7")
      .getOrCreate()

    if (args.length <= 0) {
      println("usage: Example3_7 <file path to blogs.json>")
      System.exit(1)
    }

    // Get the path to the JSON file
    val jsonFile = args(0)

    // Define our schema as before
    val schema = StructType(Array(
      StructField("Id", IntegerType, false),
      StructField("First", StringType, false),
      StructField("Last", StringType, false),
      StructField("Url", StringType, false),
      StructField("Published", StringType, false),
      StructField("Hits", IntegerType, false),
      StructField("Campaigns", ArrayType(StringType), false)))

    // Create a DataFrame by reading the JSON file with a predefined schema
    val blogsDF = spark.read.schema(schema).json(jsonFile)

    // Show the DataFrame contents
    blogsDF.show(truncate = false)

    // Print the schema
    blogsDF.printSchema()
    println(blogsDF.schema)

    // Show columns and expressions
    blogsDF.select(expr("Hits") * 2).show(2)
    blogsDF.select(col("Hits") * 2).show(2)
    blogsDF.select(expr("Hits * 2")).show(2)

    // Show heavy hitters
    blogsDF.withColumn("Big Hitters", expr("Hits > 10000")).show()
  }
}
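The expr("Hits * 2") pattern in Example 3 has a common shorthand, selectExpr, which wraps each of its string arguments in expr for you. A small sketch against the same blogsDF (the output column aliases are invented for illustration):

// selectExpr accepts SQL fragments directly, including aliases and predicates
blogsDF.selectExpr("Hits * 2 AS DoubleHits", "Hits > 10000 AS BigHitter").show(2)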