org.apache.spark.sql.functions.struct Scala Examples
The following examples show how to use org.apache.spark.sql.functions.struct.
Example 1
Source File: ConcatArrowAndExplodeSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.timeseries

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util.concurrent.TimeUnit

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot }
import org.apache.spark.sql.functions.{ array, col, lit, struct }
import org.apache.spark.sql.types._

/**
 * Exercises `TimeSeriesRDD.concatArrowAndExplode` on a single input row that carries
 * an array of base rows plus two Arrow file payloads, and compares the result with a
 * hand-built expected data frame.
 */
class ConcatArrowAndExplodeSpec extends TimeSeriesSuite {

  "ConcatArrowAndExplode" should "work" in {
    val batchSize = 10

    // One input row (time = 1000) holding `batchSize` base rows as an array of structs.
    var df = spark.range(1000, 2000, 1000).toDF("time")
    val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v")))
    df = df.withColumn("base_rows", array(columns: _*))

    val allocator = new RootAllocator(Long.MaxValue)

    // First Arrow payload: a single double column v1 = i + 10.0.
    val schema1 = StructType(Seq(StructField("v1", DoubleType)))
    val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator)
    val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector]
    vector1.allocateNew()
    for (i <- 0 until batchSize) {
      vector1.set(i, i + 10.0)
    }
    vector1.setValueCount(batchSize)

    val out1 = new ByteArrayOutputStream()
    val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1))
    arrowWriter1.writeBatch()
    arrowWriter1.close()
    root1.close()

    // The "*_schema" column is a struct literal whose field names/types mirror the payload.
    df = df.withColumn("f1_schema", struct(lit(0.0).as("v1")))
    df = df.withColumn("f1_data", lit(out1.toByteArray))

    // Second Arrow payload: v2 = i + 20.0 (double) and v3 = i + 30 (long).
    val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType)))
    val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator)
    val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector]
    val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector]
    vector2.allocateNew()
    vector3.allocateNew()
    for (i <- 0 until batchSize) {
      vector2.set(i, i + 20.0)
    }
    vector2.setValueCount(batchSize)
    for (i <- 0 until batchSize) {
      vector3.set(i, i + 30L)
    }
    vector3.setValueCount(batchSize)

    val out2 = new ByteArrayOutputStream()
    val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2))
    arrowWriter2.writeBatch()
    arrowWriter2.close()
    root2.close()

    df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3")))
    df = df.withColumn("f2_data", lit(out2.toByteArray))

    var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data"))
    tsrdd.toDF.show()

    // Expected: one row per base row, with every payload column lined up by position.
    var expected = spark.range(1000, 1000 + batchSize).toDF("time")
    expected = expected.withColumn("v", col("time") - 1000.0)
    expected = expected.withColumn("v1", col("time") - 1000 + 10.0)
    expected = expected.withColumn("v2", col("time") - 1000 + 20.0)
    expected = expected.withColumn("v3", col("time") - 1000 + 30)

    val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    assertEquals(tsrdd, expectedTsrdd)
  }
}
Example 2
Source File: ServingUDFs.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.execution.streaming

// NOTE(review): the package prefixes of the next two imports were lost in extraction and
// are reconstructed here; confirm them against the project before relying on them.
import com.microsoft.ml.spark.io.http.HTTPResponseData
import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response}
import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{lit, struct, to_json, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Row}

import scala.util.Try

/** Column helpers and UDFs for turning query results into HTTP replies. */
object ServingUDFs {

  /** Serializes `c` with `to_json` and wraps the string as a response column. */
  private def jsonReply(c: Column) = string_to_response(to_json(c))

  /**
    * Builds a response column for `data`, choosing the encoding from its Spark type.
    *
    * Structs, maps, and arrays of structs/maps are sent as JSON directly; any other value
    * (including arrays of other element types) is first wrapped in a single-field struct.
    *
    * @param data   column holding the reply payload
    * @param dt     Spark data type of `data`
    * @param code   status-code column; only forwarded by the `NullType` and `StringType` branches
    * @param reason status-reason column; only forwarded by the `NullType` and `StringType` branches
    * @return a column holding the response
    */
  def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = {
    dt match {
      case NullType => empty_response(code, reason)
      case StringType => string_to_response(data, code, reason)
      case BinaryType => binary_to_response(data)
      case _: StructType => jsonReply(data)
      case _: MapType => jsonReply(data)
      case at: ArrayType => at.elementType match {
        case _: StructType => jsonReply(data)
        case _: MapType => jsonReply(data)
        case _ => jsonReply(struct(data))
      }
      case _ => jsonReply(struct(data))
    }
  }

  /**
    * Sends `reply` to the request identified by `id` on the named service.
    *
    * @param mapper      converts the reply row into an `HTTPResponseData`
    * @param serviceName name used to look up the server in `HTTPSourceStateHolder`
    * @param reply       row holding the response; may be null
    * @param id          row whose first two string fields identify the request; may be null
    * @return `true` when the reply call completed without throwing; `false` when either row
    *         is null or the call failed
    */
  private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = {
    // The original spelled the null case as `null.asInstanceOf[Boolean]`, which evaluates to false.
    reply != null && id != null && //scalastyle:ignore null
      Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply)))
        .isSuccess
  }

  /** UDF form of the reply sender: `(serviceName, reply, id) => Boolean`. */
  def sendReplyUDF: UserDefinedFunction = {
    val toData = HTTPResponseData.makeFromRowConverter
    udf(sendReplyHelper(toData) _, BooleanType)
  }

}
Example 3
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License | 5 votes |
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

// NOTE(review): the package name and several import prefixes were lost in extraction and
// are reconstructed here from the names the code uses; confirm them against the project.
package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.util.Identifiable

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]

/**
  * Transformer that combines the vector-typed input columns into one sparse vector of
  * interaction features.
  *
  * For every combination of active entries (one taken from each input vector) the indices
  * are hashed together, the values are multiplied, and the result is stored in a sparse
  * vector of size `1 << getNumBits`.
  */
class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions
  with Wrappable with ComplexParamsWritable {

  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  /**
    * Appends the interaction-feature column to `dataset`.
    *
    * @param dataset input data; columns named in `getInputCols` are read as vectors
    * @return `dataset` as a data frame with `getOutputCol` added
    */
  override def transform(dataset: Dataset[_]): DataFrame = {
    // Keep schema order, restricted to the configured input columns.
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {
      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      // Depth-first walk over the input vectors: `ns` is the position of the vector being
      // expanded, `idx`/`value` carry the hash and product accumulated so far.
      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value

          i += 1
        } else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  /**
    * Validates the input columns and appends the output column to the schema.
    *
    * @throws IllegalArgumentException if an input column is missing or is not of vector type
    */
  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)

    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType

        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  // The return type must be this class: `defaultCopy` instantiates `this.getClass`, so the
  // previously declared `VowpalWabbitFeaturizer` made the result cast fail at runtime.
  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
}
Example 4
Source File: SchemaJsonExample.scala From spark-schema-registry with Apache License 2.0 | 5 votes |
package com.hortonworks.spark.registry.examples

import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{from_json, struct, to_json}
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.types._

/**
 * Streaming example: reads JSON truck events from a Kafka topic, keeps the events with
 * more than 300 miles, and writes (driverId, truckId, miles) back to Kafka as JSON.
 *
 * Positional arguments, all optional:
 *   0. Kafka bootstrap servers (default "localhost:9092")
 *   1. input topic             (default "topic1")
 *   2. output topic            (default "topic1-out")
 *   3. checkpoint location     (default "/tmp/temporary-<random UUID>")
 */
object SchemaJsonExample {

  def main(args: Array[String]): Unit = {

    val bootstrapServers = if (args.length > 0) args(0) else "localhost:9092"
    val topic = if (args.length > 1) args(1) else "topic1"
    val outTopic = if (args.length > 2) args(2) else "topic1-out"
    val checkpointLocation =
      if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString

    val spark = SparkSession
      .builder
      .appName("SchemaExample")
      .getOrCreate()

    val messages = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)
      .load()

    import spark.implicits._

    // the schema for truck events
    val schema = StructType(Seq(
      StructField("driverId", IntegerType, nullable = false),
      StructField("truckId", IntegerType, nullable = false),
      StructField("eventTime", StringType, nullable = false),
      StructField("eventType", StringType, nullable = false),
      StructField("longitude", DoubleType, nullable = false),
      StructField("latitude", DoubleType, nullable = false),
      StructField("eventKey", StringType, nullable = false),
      StructField("correlationId", StringType, nullable = false),
      StructField("driverName", StringType, nullable = false),
      StructField("routeId", IntegerType, nullable = false),
      StructField("routeName", StringType, nullable = false),
      StructField("eventDate", StringType, nullable = false),
      StructField("miles", IntegerType, nullable = false)
    ))

    // read messages from kafka and parse it using the above schema
    val df = messages
      .select(from_json($"value".cast("string"), schema).alias("value"))

    // project (driverId, truckId, miles) for the events where miles > 300
    val filtered = df.select($"value.driverId", $"value.truckId", $"value.miles")
      .where("value.miles > 300")

    // write the output to a kafka topic serialized as a JSON string.
    // should produce events like {"driverId":14,"truckId":25,"miles":373}
    val query = filtered
      .select(to_json(struct($"*")).alias("value"))
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("topic", outTopic)
      .option("checkpointLocation", checkpointLocation)
      .trigger(Trigger.ProcessingTime(10000))
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }

}
Example 5
Source File: MapGroupsWithState.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package mapGroupsWithState

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.StringType
import spark.SparkHelper
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import radio.{ArtistAggregationState, SimpleSongAggregation, SimpleSongAggregationKafka}

/** Stateful streaming aggregation that keeps one running count per artist. */
object MapGroupsWithState extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  /**
    * Folds one event into the running state.
    *
    * @return a new state with `artistCount.count` added when the artists match,
    *         otherwise `state` unchanged
    */
  def updateArtistStateWithEvent(state: ArtistAggregationState, artistCount : SimpleSongAggregation) = {
    log.warn("MapGroupsWithState - updateArtistStateWithEvent")
    if(state.artist == artistCount.artist) {
      ArtistAggregationState(state.artist, state.count + artistCount.count)
    } else {
      state
    }
  }

  /**
    * State-update function passed to `mapGroupsWithState`.
    *
    * @param artist   grouping key
    * @param inputs   events received for this key
    * @param oldState stored state for this key; when absent a state with count 1 is used as the start
    * @return the state after all `inputs` have been applied
    */
  def updateAcrossEvents(artist:String,
                         inputs: Iterator[SimpleSongAggregation],
                         oldState: GroupState[ArtistAggregationState]): ArtistAggregationState = {

    var state: ArtistAggregationState = if (oldState.exists) oldState.get else ArtistAggregationState(artist, 1L)

    // for every rows, let's count by artist the number of broadcast, instead of counting by artist, title and radio
    for (input <- inputs) {
      state = updateArtistStateWithEvent(state, input)
      oldState.update(state)
    }

    state
  }

  /**
    * Starts a console streaming query that prints the per-artist state in update mode.
    *
    * @param ds stream of records whose `radioCount` struct holds the song aggregation
    */
  def write(ds: Dataset[SimpleSongAggregationKafka] ) = {
    // NOTE(review): `ds.select(` and the third column name were lost in extraction;
    // "radioCount.radio" is reconstructed - confirm against SimpleSongAggregation's fields.
    ds.select($"radioCount.title", $"radioCount.artist", $"radioCount.radio", $"radioCount.count")
      .as[SimpleSongAggregation]
      .groupByKey(_.artist)
      .mapGroupsWithState(GroupStateTimeout.NoTimeout)(updateAcrossEvents) //we can control what should be done with the state when no update is received after a timeout.
      .writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .queryName("mapGroupsWithState - counting artist broadcast")
      .start()
  }

}
Example 6
Source File: KafkaSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

/** Streaming sinks: one writes aggregations to Kafka, one dumps Kafka records to the console. */
object KafkaSink extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  /**
    * Starts a streaming query that publishes every row of `staticInputDS` to the Kafka
    * topic "test", with all columns of the row serialized into one JSON string column
    * named "value".
    *
    * @return the handle of the started query
    */
  def writeStream(staticInputDS: Dataset[SimpleSongAggregation]) : StreamingQuery = {
    log.warn("Writing to Kafka")

    // Whole row -> struct -> JSON string, exposed under the column name "value".
    val jsonValue = to_json(struct($"*")).cast(StringType).alias("value")

    val kafkaWriter = staticInputDS
      .select(jsonValue)
      .writeStream
      .outputMode("update")
      .format("kafka")
      .option("kafka.bootstrap.servers", KafkaService.bootstrapServers)
      .queryName("Kafka - Count number of broadcasts for a title/artist by radio")
      .option("topic", "test")

    kafkaWriter.start()
  }

  /** Starts a streaming query named "Debug Stream Kafka" that prints the input to the console. */
  def debugStream(staticKafkaInputDS: Dataset[SimpleSongAggregationKafka]) = {
    val consoleWriter = staticKafkaInputDS
      .writeStream
      .queryName("Debug Stream Kafka")
      .format("console")

    consoleWriter.start()
  }
}
Example 7
Source File: KafkaSource.scala From Spark-Structured-Streaming-Examples with Apache License 2.0 | 5 votes |
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

// NOTE(review): the object header below was lost in extraction (the body uses `log`, `spark`
// and `$"..."`, and the source ends with one unmatched closing brace). It is reconstructed
// here to mirror the sibling Kafka object in this project - confirm against the project.
/** Streaming source that reads song aggregations from Kafka. */
object KafkaSource extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  /**
    * Opens a streaming read on the topic `KafkaService.topicName` and parses each record's
    * value as JSON into the `KafkaService.radioStructureName` column.
    *
    * @param startingOption       name of the option that controls where reading starts
    * @param partitionsAndOffsets value for `startingOption`
    * @return the parsed stream, without the records whose `radioCount` is null
    */
  def read(startingOption: String = "startingOffsets", partitionsAndOffsets: String = "earliest") : Dataset[SimpleSongAggregationKafka] = {
    log.warn("Reading from Kafka")

    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", KafkaService.topicName)
      // NOTE(review): the next two option keys were stripped to "" in extraction and are
      // reconstructed from the adjacent comment and value - confirm against the project.
      .option("enable.auto.commit", false) // Cannot be set to true in Spark Structured Streaming
      .option("group.id", "Structured-Streaming-Examples")
      .option("failOnDataLoss", false) // when starting a fresh kafka (default location is temporary (/tmp) and cassandra is not (var/lib)), we have saved different offsets in Cassandra than real offsets in kafka (that contains nothing)
      .option(startingOption, partitionsAndOffsets) //this only applies when a new query is started and that resuming will always pick up from where the query left off
      .load()
      .withColumn(KafkaService.radioStructureName, // nested structure with our json
        from_json($"value".cast(StringType), KafkaService.schemaOutput) //From binary to JSON object
      ).as[SimpleSongAggregationKafka]
      .filter(_.radioCount != null) //TODO find a better way to filter bad json
  }
}