org.apache.spark.sql.functions.struct Scala Examples
The following examples show how to use org.apache.spark.sql.functions.struct.
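Before the project examples, here is a minimal, self-contained sketch of what struct does (not taken from any of the projects below; the object name and the x/y columns are illustrative): it packs existing columns into a single nested StructType column, which the examples then serialize with to_json, feed to UDFs, or collect into arrays.

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{col, struct, to_json}

object StructMinimalExample {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder.appName("StructMinimalExample").master("local[*]").getOrCreate()
    import spark.implicits._

    val df = Seq((1, 2.0), (3, 4.0)).toDF("x", "y")

    // struct(col("x"), col("y")) packs the two columns into one nested column of
    // type struct<x: int, y: double>; nested fields are then addressed with dot notation.
    val nested = df.withColumn("point", struct(col("x"), col("y")))
    nested.select($"point.x", $"point.y").show()

    // a pattern that recurs in the examples below: serialize the whole row as a JSON string
    nested.select(to_json(struct($"*")).alias("value")).show(truncate = false)

    spark.stop()
  }
}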
Example 1
Source File: ConcatArrowAndExplodeSpec.scala From flint with Apache License 2.0
package com.twosigma.flint.timeseries

import java.io.ByteArrayOutputStream
import java.nio.channels.Channels
import java.util.concurrent.TimeUnit

import com.twosigma.flint.arrow.ArrowUtils
import org.apache.arrow.memory.RootAllocator
import org.apache.arrow.vector.ipc.ArrowFileWriter
import org.apache.arrow.vector.{ BigIntVector, Float8Vector, VectorSchemaRoot }
import org.apache.spark.sql.functions.{ array, col, lit, struct }
import org.apache.spark.sql.types._

class ConcatArrowAndExplodeSpec extends TimeSeriesSuite {

  "ConcatArrowAndExplode" should "work" in {
    val batchSize = 10
    var df = spark.range(1000, 2000, 1000).toDF("time")
    val columns = (0 until batchSize).map(v => struct((df("time") + v).as("time"), lit(v.toDouble).as("v")))
    df = df.withColumn("base_rows", array(columns: _*))

    val allocator = new RootAllocator(Long.MaxValue)

    val schema1 = StructType(Seq(StructField("v1", DoubleType)))
    val root1 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema1), allocator)
    val vector1 = root1.getVector("v1").asInstanceOf[Float8Vector]
    vector1.allocateNew()

    for (i <- 0 until batchSize) {
      vector1.set(i, i + 10.0)
    }
    vector1.setValueCount(batchSize)

    val out1 = new ByteArrayOutputStream()
    val arrowWriter1 = new ArrowFileWriter(root1, null, Channels.newChannel(out1))
    arrowWriter1.writeBatch()
    arrowWriter1.close()
    root1.close()
    df = df.withColumn("f1_schema", struct(lit(0.0).as("v1")))
    df = df.withColumn("f1_data", lit(out1.toByteArray))

    val schema2 = StructType(Seq(StructField("v2", DoubleType), StructField("v3", LongType)))
    val root2 = VectorSchemaRoot.create(ArrowUtils.toArrowSchema(schema2), allocator)
    val vector2 = root2.getVector("v2").asInstanceOf[Float8Vector]
    val vector3 = root2.getVector("v3").asInstanceOf[BigIntVector]
    vector2.allocateNew()
    vector3.allocateNew()

    for (i <- 0 until batchSize) {
      vector2.set(i, i + 20.0)
    }
    vector2.setValueCount(batchSize)

    for (i <- 0 until batchSize) {
      vector3.set(i, i + 30L)
    }
    vector3.setValueCount(batchSize)

    val out2 = new ByteArrayOutputStream()
    val arrowWriter2 = new ArrowFileWriter(root2, null, Channels.newChannel(out2))
    arrowWriter2.writeBatch()
    arrowWriter2.close()
    root2.close()
    df = df.withColumn("f2_schema", struct(lit(0.0).as("v2"), lit(0L).as("v3")))
    df = df.withColumn("f2_data", lit(out2.toByteArray))

    var tsrdd = TimeSeriesRDD.fromDF(df)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    tsrdd = tsrdd.concatArrowAndExplode("base_rows", Seq("f1_schema", "f2_schema"), Seq("f1_data", "f2_data"))
    tsrdd.toDF.show()

    var expected = spark.range(1000, 1000 + batchSize).toDF("time")
    expected = expected.withColumn("v", col("time") - 1000.0)
    expected = expected.withColumn("v1", col("time") - 1000 + 10.0)
    expected = expected.withColumn("v2", col("time") - 1000 + 20.0)
    expected = expected.withColumn("v3", col("time") - 1000 + 30)

    val expectedTsrdd = TimeSeriesRDD.fromDF(expected)(isSorted = false, timeUnit = TimeUnit.NANOSECONDS)
    assertEquals(tsrdd, expectedTsrdd)
  }
}
Example 2
Source File: ServingUDFs.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package org.apache.spark.sql.execution.streaming

import com.microsoft.ml.spark.io.http.HTTPResponseData
import com.microsoft.ml.spark.io.http.HTTPSchema.{binary_to_response, empty_response, string_to_response}
import org.apache.spark.sql.execution.streaming.continuous.HTTPSourceStateHolder
import org.apache.spark.sql.expressions.UserDefinedFunction
import org.apache.spark.sql.functions.{lit, struct, to_json, udf}
import org.apache.spark.sql.types._
import org.apache.spark.sql.{Column, Row}

import scala.util.Try

object ServingUDFs {

  private def jsonReply(c: Column) = string_to_response(to_json(c))

  def makeReplyUDF(data: Column, dt: DataType, code: Column = lit(200), reason: Column = lit("Success")): Column = {
    dt match {
      case NullType => empty_response(code, reason)
      case StringType => string_to_response(data, code, reason)
      case BinaryType => binary_to_response(data)
      case _: StructType => jsonReply(data)
      case _: MapType => jsonReply(data)
      case at: ArrayType => at.elementType match {
        case _: StructType => jsonReply(data)
        case _: MapType => jsonReply(data)
        case _ => jsonReply(struct(data))
      }
      case _ => jsonReply(struct(data))
    }
  }

  private def sendReplyHelper(mapper: Row => HTTPResponseData)(serviceName: String, reply: Row, id: Row): Boolean = {
    if (Option(reply).isEmpty || Option(id).isEmpty) {
      null.asInstanceOf[Boolean] //scalastyle:ignore null
    } else {
      Try(HTTPSourceStateHolder.getServer(serviceName).replyTo(id.getString(0), id.getString(1), mapper(reply)))
        .toOption.isDefined
    }
  }

  def sendReplyUDF: UserDefinedFunction = {
    val toData = HTTPResponseData.makeFromRowConverter
    udf(sendReplyHelper(toData) _, BooleanType)
  }

}
Example 3
Source File: VowpalWabbitInteractions.scala From mmlspark with MIT License
// Copyright (C) Microsoft Corporation. All rights reserved.
// Licensed under the MIT License. See LICENSE in project root for information.

package com.microsoft.ml.spark.vw

import com.microsoft.ml.spark.core.contracts.{HasInputCols, HasOutputCol, Wrappable}
import org.apache.spark.ml.{ComplexParamsReadable, ComplexParamsWritable, Transformer}
import org.apache.spark.ml.param.ParamMap
import org.apache.spark.sql.{DataFrame, Dataset, Row}
import org.apache.spark.sql.functions.{col, struct, udf}
import org.apache.spark.ml.linalg.{Vector, Vectors}
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.types.{StructField, StructType}
import org.apache.spark.ml.linalg.SQLDataTypes.VectorType

object VowpalWabbitInteractions extends ComplexParamsReadable[VowpalWabbitInteractions]

class VowpalWabbitInteractions(override val uid: String) extends Transformer
  with HasInputCols with HasOutputCol with HasNumBits with HasSumCollisions
  with Wrappable with ComplexParamsWritable {

  def this() = this(Identifiable.randomUID("VowpalWabbitInteractions"))

  override def transform(dataset: Dataset[_]): DataFrame = {
    val fieldSubset = dataset.schema.fields
      .filter(f => getInputCols.contains(f.name))

    val mask = getMask

    val mode = udf((r: Row) => {
      // compute the final number of features
      val numElems = (0 until r.length)
        .map(r.getAs[Vector](_).numNonzeros).product

      val newIndices = new Array[Int](numElems)
      val newValues = new Array[Double](numElems)

      // build interaction features using FNV-1
      val fnvPrime = 16777619
      var i = 0

      def interact(idx: Int, value: Double, ns: Int): Unit = {
        if (ns == r.size) {
          newIndices(i) += mask & idx
          newValues(i) += value
          i += 1
        } else {
          val idx1 = idx * fnvPrime

          r.getAs[Vector](ns).foreachActive { case (idx2, value2) =>
            interact(idx1 ^ idx2, value * value2, ns + 1)
          }
        }
      }

      // start the recursion
      interact(0, 1, 0)

      val (indicesSorted, valuesSorted) = VectorUtils.sortAndDistinct(newIndices, newValues, getSumCollisions)

      Vectors.sparse(1 << getNumBits, indicesSorted, valuesSorted)
    })

    dataset.toDF.withColumn(getOutputCol, mode.apply(struct(fieldSubset.map(f => col(f.name)): _*)))
  }

  override def transformSchema(schema: StructType): StructType = {
    val fieldNames = schema.fields.map(_.name)
    for (f <- getInputCols)
      if (!fieldNames.contains(f))
        throw new IllegalArgumentException("missing input column " + f)
      else {
        val fieldType = schema.fields(schema.fieldIndex(f)).dataType

        if (fieldType != VectorType)
          throw new IllegalArgumentException("column " + f + " must be of type Vector but is " + fieldType.typeName)
      }

    schema.add(StructField(getOutputCol, VectorType, true))
  }

  // return type corrected to the class itself (the snippet declared VowpalWabbitFeaturizer here)
  override def copy(extra: ParamMap): VowpalWabbitInteractions = defaultCopy(extra)
}
Example 4
Source File: SchemaJsonExample.scala From spark-schema-registry with Apache License 2.0
package com.hortonworks.spark.registry.examples

import java.util.UUID

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.{from_json, struct, to_json}
import org.apache.spark.sql.streaming.{OutputMode, Trigger}
import org.apache.spark.sql.types._

object SchemaJsonExample {

  def main(args: Array[String]): Unit = {
    val bootstrapServers = if (args.length > 0) args(0) else "localhost:9092"
    val topic = if (args.length > 1) args(1) else "topic1"
    val outTopic = if (args.length > 2) args(2) else "topic1-out"
    val checkpointLocation =
      if (args.length > 3) args(3) else "/tmp/temporary-" + UUID.randomUUID.toString

    val spark = SparkSession
      .builder
      .appName("SchemaExample")
      .getOrCreate()

    val messages = spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("subscribe", topic)
      .load()

    import spark.implicits._

    // the schema for truck events
    val schema = StructType(Seq(
      StructField("driverId", IntegerType, nullable = false),
      StructField("truckId", IntegerType, nullable = false),
      StructField("eventTime", StringType, nullable = false),
      StructField("eventType", StringType, nullable = false),
      StructField("longitude", DoubleType, nullable = false),
      StructField("latitude", DoubleType, nullable = false),
      StructField("eventKey", StringType, nullable = false),
      StructField("correlationId", StringType, nullable = false),
      StructField("driverName", StringType, nullable = false),
      StructField("routeId", IntegerType, nullable = false),
      StructField("routeName", StringType, nullable = false),
      StructField("eventDate", StringType, nullable = false),
      StructField("miles", IntegerType, nullable = false)
    ))

    // read messages from kafka and parse them using the above schema
    val df = messages
      .select(from_json($"value".cast("string"), schema).alias("value"))

    // project (driverId, truckId, miles) for the events where miles > 300
    val filtered = df.select($"value.driverId", $"value.truckId", $"value.miles")
      .where("value.miles > 300")

    // write the output to a kafka topic serialized as a JSON string.
    // should produce events like {"driverId":14,"truckId":25,"miles":373}
    val query = filtered
      .select(to_json(struct($"*")).alias("value"))
      .writeStream
      .format("kafka")
      .option("kafka.bootstrap.servers", bootstrapServers)
      .option("topic", outTopic)
      .option("checkpointLocation", checkpointLocation)
      .trigger(Trigger.ProcessingTime(10000))
      .outputMode(OutputMode.Append())
      .start()

    query.awaitTermination()
  }
}
Example 5
Source File: MapGroupsWithState.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package mapGroupsWithState

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.StringType
import spark.SparkHelper
import org.apache.spark.sql.streaming.{GroupState, GroupStateTimeout, OutputMode}
import radio.{ArtistAggregationState, SimpleSongAggregation, SimpleSongAggregationKafka}

object MapGroupsWithState extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def updateArtistStateWithEvent(state: ArtistAggregationState, artistCount: SimpleSongAggregation) = {
    log.warn("MapGroupsWithState - updateArtistStateWithEvent")
    if (state.artist == artistCount.artist) {
      ArtistAggregationState(state.artist, state.count + artistCount.count)
    } else {
      state
    }
  }

  def updateAcrossEvents(artist: String,
                         inputs: Iterator[SimpleSongAggregation],
                         oldState: GroupState[ArtistAggregationState]): ArtistAggregationState = {
    var state: ArtistAggregationState = if (oldState.exists) oldState.get else ArtistAggregationState(artist, 1L)

    // for every row, count the number of broadcasts per artist only,
    // instead of counting per artist, title and radio
    for (input <- inputs) {
      state = updateArtistStateWithEvent(state, input)
      oldState.update(state)
    }

    state
  }

  def write(ds: Dataset[SimpleSongAggregationKafka]) = {
    ds.select($"radioCount.title", $"radioCount.artist", $"radioCount.radio", $"radioCount.count")
      .as[SimpleSongAggregation]
      .groupByKey(_.artist)
      .mapGroupsWithState(GroupStateTimeout.NoTimeout)(updateAcrossEvents)
      // we can control what should be done with the state when no update is received after a timeout
      .writeStream
      .outputMode(OutputMode.Update())
      .format("console")
      .queryName("mapGroupsWithState - counting artist broadcast")
      .start()
  }
}
Example 6
Source File: KafkaSink.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.streaming.StreamingQuery
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

object KafkaSink extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def writeStream(staticInputDS: Dataset[SimpleSongAggregation]): StreamingQuery = {
    log.warn("Writing to Kafka")
    staticInputDS
      .select(to_json(struct($"*")).cast(StringType).alias("value"))
      .writeStream
      .outputMode("update")
      .format("kafka")
      .option("kafka.bootstrap.servers", KafkaService.bootstrapServers)
      .queryName("Kafka - Count number of broadcasts for a title/artist by radio")
      .option("topic", "test")
      .start()
  }

  def debugStream(staticKafkaInputDS: Dataset[SimpleSongAggregationKafka]) = {
    staticKafkaInputDS
      .writeStream
      .queryName("Debug Stream Kafka")
      .format("console")
      .start()
  }
}
Example 7
Source File: KafkaSource.scala From Spark-Structured-Streaming-Examples with Apache License 2.0
package kafka

import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.functions.{struct, to_json, _}
import _root_.log.LazyLogger
import org.apache.spark.sql.types.{StringType, _}
import radio.{SimpleSongAggregation, SimpleSongAggregationKafka}
import spark.SparkHelper

// Note: the enclosing object, the Spark session value and the implicits import were lost when
// this snippet was flattened; they are assumed here, mirroring KafkaSink from the same project.
object KafkaSource extends LazyLogger {
  private val spark = SparkHelper.getSparkSession()

  import spark.implicits._

  def read(startingOption: String = "startingOffsets", partitionsAndOffsets: String = "earliest"): Dataset[SimpleSongAggregationKafka] = {
    log.warn("Reading from Kafka")

    spark
      .readStream
      .format("kafka")
      .option("kafka.bootstrap.servers", "localhost:9092")
      .option("subscribe", KafkaService.topicName)
      .option("enable.auto.commit", false) // cannot be set to true in Spark Structured Streaming https://spark.apache.org/docs/latest/structured-streaming-kafka-integration.html#kafka-specific-configurations
      .option("group.id", "Structured-Streaming-Examples")
      .option("failOnDataLoss", false) // when starting a fresh kafka (default location is temporary (/tmp) while cassandra is not (/var/lib)), the offsets saved in Cassandra can differ from the real offsets in kafka (which contains nothing)
      .option(startingOption, partitionsAndOffsets) // this only applies when a new query is started; resuming always picks up from where the query left off
      .load()
      .withColumn(KafkaService.radioStructureName, // nested structure with our json
        from_json($"value".cast(StringType), KafkaService.schemaOutput) // from binary to JSON object
      ).as[SimpleSongAggregationKafka]
      .filter(_.radioCount != null) // TODO find a better way to filter bad json
  }
}