org.apache.spark.sql.execution.streaming.MemoryStream Scala Examples
The following examples show how to use org.apache.spark.sql.execution.streaming.MemoryStream, an in-memory streaming source from Spark's internals that is used primarily for testing Structured Streaming queries. Each example is taken from an open-source project; the source file and license are noted above each listing.
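The examples below are full test suites, but the core MemoryStream pattern they all share is small: create the stream with an implicit SQLContext and Encoder in scope, start a streaming query over it, push data with addData, and drive processing with processAllAvailable. Here is a minimal, self-contained sketch of that pattern; the object name and query name are illustrative and not taken from any of the projects below.

import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream

object MemoryStreamSketch {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder()
      .master("local[2]")
      .appName("memory-stream-sketch")   // illustrative app name
      .getOrCreate()
    import spark.implicits._                                // provides Encoder[String]
    implicit val sqlContext: SQLContext = spark.sqlContext  // required by MemoryStream.apply

    // Create the in-memory source and a derived streaming Dataset.
    val input = MemoryStream[String]
    val doubled = input.toDS().map(s => s + " " + s)

    // Write to the built-in memory sink so results can be queried back as a table.
    val query = doubled.writeStream
      .format("memory")
      .queryName("sketch")
      .outputMode("append")
      .start()

    // Push data into the source, then block until the query has consumed it all.
    input.addData("hi", "bye")
    query.processAllAvailable()

    spark.sql("select * from sketch").show()

    query.stop()
    spark.stop()
  }
}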
Example 1
Source File: CustomSinkSuite.scala, from spark-structured-streaming-ml (Apache License 2.0)
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase

import scala.collection.mutable.ListBuffer

import org.scalatest.FunSuite
import org.apache.spark._
import org.apache.spark.sql.{Dataset, DataFrame, Encoder, SQLContext}
import org.apache.spark.sql.execution.streaming.MemoryStream

class CustomSinkSuite extends FunSuite with DataFrameSuiteBase {

  test("really simple test of the custom sink") {
    import spark.implicits._
    val input = MemoryStream[String]
    val doubled = input.toDS().map(x => x + " " + x)
    val formatName = ("com.highperformancespark.examples" +
      "structuredstreaming.CustomSinkCollectorProvider")
    val query = doubled.writeStream
      .queryName("testCustomSinkBasic")
      .format(formatName)
      .start()
    val inputData = List("hi", "holden", "bye", "pandas")
    input.addData(inputData)
    assert(query.isActive === true)
    query.processAllAvailable()
    assert(query.exception === None)
    assert(Pandas.results(0) === inputData.map(x => x + " " + x))
  }
}

object Pandas {
  val results = new ListBuffer[Seq[String]]()
}

class CustomSinkCollectorProvider extends ForeachDatasetSinkProvider {
  override def func(df: DataFrame) {
    val spark = df.sparkSession
    import spark.implicits._
    Pandas.results += df.as[String].rdd.collect()
  }
}
Example 2
Source File: StreamingKMeansSuite.scala, from spark-structured-streaming-ml (Apache License 2.0)
package com.highperformancespark.examples.structuredstreaming

import com.holdenkarau.spark.testing.DataFrameSuiteBase

import org.apache.spark.mllib.clustering.{KMeans, KMeansModel}
import org.apache.spark.ml.linalg._
import org.apache.spark.sql.{DataFrame, Dataset}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.scalatest.FunSuite
import org.apache.log4j.{Level, Logger}

case class TestRow(features: Vector)

class StreamingKMeansSuite extends FunSuite with DataFrameSuiteBase {

  override def beforeAll(): Unit = {
    super.beforeAll()
    Logger.getLogger("org").setLevel(Level.OFF)
  }

  test("streaming model with one center should converge to true center") {
    import spark.implicits._
    val k = 1
    val dim = 5
    val clusterSpread = 0.1
    val seed = 63
    // TODO: this test is very flaky. The centers do not converge for some
    // (most?) random seeds
    val (batches, trueCenters) =
      StreamingKMeansSuite.generateBatches(100, 80, k, dim, clusterSpread, seed)
    val inputStream = MemoryStream[TestRow]
    val ds = inputStream.toDS()
    val skm = new StreamingKMeans().setK(k).setRandomCenters(dim, 0.01)
    val query = skm.evilTrain(ds.toDF())
    val streamingModels = batches.map { batch =>
      inputStream.addData(batch)
      query.processAllAvailable()
      skm.getModel
    }
    // TODO: use spark's testing suite
    streamingModels.last.centers.zip(trueCenters).foreach {
      case (center, trueCenter) =>
        val centers = center.toArray.mkString(",")
        val trueCenters = trueCenter.toArray.mkString(",")
        println(s"${centers} | ${trueCenters}")
        assert(center.toArray.zip(trueCenter.toArray).forall(
          x => math.abs(x._1 - x._2) < 0.1))
    }
    query.stop()
  }

  def compareBatchAndStreaming(
      batchModel: KMeansModel,
      streamingModel: StreamingKMeansModel,
      validationData: DataFrame): Unit = {
    assert(batchModel.clusterCenters === streamingModel.centers)
    // TODO: implement prediction comparison
  }
}

object StreamingKMeansSuite {

  def generateBatches(
      numPoints: Int,
      numBatches: Int,
      k: Int,
      d: Int,
      r: Double,
      seed: Int,
      initCenters: Array[Vector] = null):
      (IndexedSeq[IndexedSeq[TestRow]], Array[Vector]) = {
    val rand = scala.util.Random
    rand.setSeed(seed)
    val centers = initCenters match {
      case null =>
        Array.fill(k)(Vectors.dense(Array.fill(d)(rand.nextGaussian())))
      case _ =>
        initCenters
    }
    val data = (0 until numBatches).map { i =>
      (0 until numPoints).map { idx =>
        val center = centers(idx % k)
        val vec = Vectors.dense(
          Array.tabulate(d)(x => center(x) + rand.nextGaussian() * r))
        TestRow(vec)
      }
    }
    (data, centers)
  }
}
Example 3
Source File: EvolvabilitySuiteBase.scala, from delta (Apache License 2.0)
package org.apache.spark.sql.delta

import java.io.File

import org.apache.spark.sql.delta.actions.{Action, FileAction, SingleAction}
import org.apache.hadoop.fs.Path

import org.apache.spark.sql.{QueryTest, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.sql.types.StructType
import org.apache.spark.util.Utils

trait EvolvabilitySuiteBase extends QueryTest with SharedSparkSession {
  import testImplicits._

  protected def testEvolvability(tablePath: String): Unit = {
    // Check we can load everything from a log checkpoint
    val deltaLog = DeltaLog.forTable(spark, new Path(tablePath))
    val path = deltaLog.dataPath.toString
    checkDatasetUnorderly(
      spark.read.format("delta").load(path).select("id", "value").as[(Int, String)],
      4 -> "d", 5 -> "e", 6 -> "f")
    assert(deltaLog.snapshot.metadata.schema === StructType.fromDDL("id INT, value STRING"))
    assert(deltaLog.snapshot.metadata.partitionSchema === StructType.fromDDL("id INT"))
    // Check we can load CheckpointMetaData
    assert(deltaLog.lastCheckpoint === Some(CheckpointMetaData(3, 6L, None)))
    // Check we can parse all `Action`s in delta files. It doesn't check correctness.
    deltaLog.getChanges(0L).toList.map(_._2.toList)
  }
}

// scalastyle:off
// NOTE: the companion object's header and its generateData method are elided in this excerpt;
// the object wrapper below is restored so the remaining definitions compile.
object EvolvabilitySuiteBase {

  def validateData(spark: SparkSession, path: String): Unit = {
    import org.apache.spark.sql.delta.util.FileNames._
    import scala.reflect.runtime.{universe => ru}
    import spark.implicits._

    val mirror = ru.runtimeMirror(this.getClass.getClassLoader)
    val tpe = ru.typeOf[Action]
    val clazz = tpe.typeSymbol.asClass
    assert(clazz.isSealed, s"${classOf[Action]} must be sealed")

    val deltaLog = DeltaLog.forTable(spark, new Path(path))
    val deltas = 0L to deltaLog.snapshot.version
    val deltaFiles = deltas.map(deltaFile(deltaLog.logPath, _)).map(_.toString)
    val actionsTypesInLog = spark.read.schema(Action.logSchema).json(deltaFiles: _*)
      .as[SingleAction]
      .collect()
      .map(_.unwrap.getClass.asInstanceOf[Class[_]])
      .toSet

    val allActionTypes = clazz.knownDirectSubclasses
      .flatMap {
        case t if t == ru.typeOf[FileAction].typeSymbol => t.asClass.knownDirectSubclasses
        case t => Set(t)
      }
      .map(t => mirror.runtimeClass(t.asClass))

    val missingTypes = allActionTypes -- actionsTypesInLog
    val unknownTypes = actionsTypesInLog -- allActionTypes
    assert(
      missingTypes.isEmpty,
      s"missing types: $missingTypes. " +
        "Please update EvolveabilitySuite.generateData to include them in the log.")
    assert(
      unknownTypes.isEmpty,
      s"unknown types: $unknownTypes. " +
        s"Please make sure they inherit ${classOf[Action]} or ${classOf[FileAction]} directly.")
  }

  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").getOrCreate()
    val path = new File(args(0))
    if (path.exists()) {
      // Don't delete automatically in case the user types a wrong path.
      // scalastyle:off throwerror
      throw new AssertionError(s"${path.getCanonicalPath} exists. Please delete it and retry.")
      // scalastyle:on throwerror
    }
    generateData(spark, path.toString) // generateData is defined in the full source file
    validateData(spark, path.toString)
  }
}
Example 4
Source File: EventMemoryStreamSpec.scala, from odsc-west-streaming-trends (GNU General Public License v3.0)
package com.twilio.open.streaming.trend.discovery

import java.nio.charset.StandardCharsets
import java.nio.file.Files

import com.twilio.open.protocol.Calls.CallEvent
import com.twilio.open.streaming.trend.discovery.config.{AppConfig, AppConfiguration}
import com.twilio.open.streaming.trend.discovery.streams.EventAggregation
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.sql.{SQLContext, SparkSession}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.Trigger
import org.scalatest.{FunSuite, Matchers}
import org.slf4j.{Logger, LoggerFactory}

import scala.concurrent.duration._

class EventMemoryStreamSpec extends FunSuite with Matchers with SparkSqlTest {

  val log: Logger = LoggerFactory.getLogger(classOf[EventAggregation])
  private val pathToTestScenarios = "src/test/resources/scenarios"

  lazy val session: SparkSession = sparkSql

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("aggregation-test-app")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.shuffle.partitions", "32")
      .set("spark.executor.cores", "4")
      .set("spark.executor.memory", "1g")
      .set("spark.ui.enabled", "false")
      .setJars(SparkContext.jarOfClass(classOf[EventAggregation]).toList)
  }

  protected val checkpointDir: String = Files.createTempDirectory(appID).toString

  def appConfigForTest(): AppConfiguration = {
    val baseConfig = AppConfig("src/test/resources/app.yaml")
    baseConfig.copy(
      checkpointPath = checkpointDir
    )
    baseConfig
  }

  test("Should aggregate call events") {
    implicit val sqlContext: SQLContext = session.sqlContext
    import session.implicits._

    val appConfig = appConfigForTest()
    val scenario = TestHelper.loadScenario[CallEvent](s"$pathToTestScenarios/pdd_events.json")
    val scenarioIter = scenario.toIterator
    scenario.nonEmpty shouldBe true

    val trendDiscoveryApp = new TrendDiscoveryApp(appConfigForTest(), session)

    val kafkaData = MemoryStream[MockKafkaDataFrame]
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)
    val eventAggregation = EventAggregation(appConfig)

    val processingStream = eventAggregation.process(kafkaData.toDF())(session)
      .writeStream
      .format("memory")
      .queryName("calleventaggs")
      .outputMode(eventAggregation.outputMode)
      .trigger(processingTimeTrigger)
      .start()

    // 22 events
    kafkaData.addData(scenarioIter.take(11).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    kafkaData.addData(scenarioIter.take(10).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    kafkaData.addData(scenarioIter.take(1).map(TestHelper.asMockKafkaDataFrame))
    processingStream.processAllAvailable()

    val df = session.sql("select * from calleventaggs")
    df.printSchema()
    df.show

    val res = session
      .sql("select avg(stats.p99) from calleventaggs")
      .collect()
      .map { r => r.getAs[Double](0) }
      .head

    DiscoveryUtils.round(res) shouldEqual 7.56

    processingStream.stop()
  }
}
Example 5
Source File: SparkIngressSpec.scala, from cloudflow (Apache License 2.0)
package cloudflow.spark

import scala.collection.immutable.Seq

import org.apache.spark.sql.Dataset
import org.apache.spark.sql.streaming.OutputMode
import org.apache.spark.sql.execution.streaming.MemoryStream

import cloudflow.streamlets.StreamletShape
import cloudflow.streamlets.avro._
import cloudflow.spark.avro._
import cloudflow.spark.testkit._
import cloudflow.spark.sql.SQLImplicits._

class SparkIngressSpec extends SparkScalaTestSupport {

  "SparkIngress" should {
    "produce elements to its outlet" in {

      val testKit = SparkStreamletTestkit(session)

      val instance = new MySparkIngress()

      // setup outlet tap on outlet port
      val out: SparkOutletTap[Data] = testKit.outletAsTap[Data](instance.out)

      val run = testKit.run(instance, Seq.empty, Seq(out))

      // get processed rows from the run
      run.totalRows must be(10)

      // get data from outlet tap
      val results = out.asCollection(session)

      // assert
      results must contain(Data(1, "name1"))
    }
  }
}

// create sparkStreamlet
class MySparkIngress extends SparkStreamlet {
  val out = AvroOutlet[Data]("out", d ⇒ d.id.toString)
  val shape = StreamletShape(out)

  override def createLogic() = new SparkStreamletLogic {
    private def process: Dataset[Data] = {
      implicit val sqlCtx = session.sqlContext
      val data = (1 to 10).map(i ⇒ Data(i, s"name$i"))
      val m = MemoryStream[Data]
      m.addData(data)
      m.toDF.as[Data]
    }

    override def buildStreamingQueries = {
      val outStream: Dataset[Data] = process
      require(outStream.isStreaming, "The Dataset created by an Ingress must be a Streaming Dataset")
      val query = writeStream(outStream, out, OutputMode.Append)
      StreamletQueryExecution(query)
    }
  }
}
Example 6
Source File: TestSparkStreamletContext.scala, from cloudflow (Apache License 2.0)
package cloudflow.spark
package testkit

import java.nio.file.attribute.FileAttribute

import com.typesafe.config._

import scala.reflect.runtime.universe._
import scala.concurrent.duration._

import org.apache.spark.sql.{ Dataset, Encoder, SparkSession }
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{ OutputMode, StreamingQuery, Trigger }

import cloudflow.streamlets._
import org.apache.spark.sql.catalyst.InternalRow

class TestSparkStreamletContext(override val streamletRef: String,
                                session: SparkSession,
                                inletTaps: Seq[SparkInletTap[_]],
                                outletTaps: Seq[SparkOutletTap[_]],
                                override val config: Config = ConfigFactory.empty)
    extends SparkStreamletContext(
      StreamletDefinition("appId", "appVersion", streamletRef, "streamletClass", List(), List(), config),
      session) {

  val ProcessingTimeInterval = 1500.milliseconds

  override def readStream[In](inPort: CodecInlet[In])(implicit encoder: Encoder[In],
                                                      typeTag: TypeTag[In]): Dataset[In] =
    inletTaps
      .find(_.portName == inPort.name)
      .map(_.instream.asInstanceOf[MemoryStream[In]].toDF.as[In])
      .getOrElse(throw TestContextException(inPort.name,
        s"Bad test context, could not find source for inlet ${inPort.name}"))

  override def writeStream[Out](stream: Dataset[Out],
                                outPort: CodecOutlet[Out],
                                outputMode: OutputMode)(implicit encoder: Encoder[Out],
                                                        typeTag: TypeTag[Out]): StreamingQuery = {
    // RateSource can only work with a microBatch query because it contains no data at time zero.
    // Trigger.Once requires data at start to work.
    val trigger = if (isRateSource(stream)) {
      Trigger.ProcessingTime(ProcessingTimeInterval)
    } else {
      Trigger.Once()
    }
    val streamingQuery = outletTaps
      .find(_.portName == outPort.name)
      .map { outletTap ⇒
        stream.writeStream
          .outputMode(outputMode)
          .format("memory")
          .trigger(trigger)
          .queryName(outletTap.queryName)
          .start()
      }
      .getOrElse(throw TestContextException(outPort.name,
        s"Bad test context, could not find destination for outlet ${outPort.name}"))
    streamingQuery
  }

  override def checkpointDir(dirName: String): String = {
    val fileAttributes: Array[FileAttribute[_]] = Array()
    val tmpDir = java.nio.file.Files.createTempDirectory("spark-test", fileAttributes: _*)
    tmpDir.toFile.getAbsolutePath
  }

  private def isRateSource(stream: Dataset[_]): Boolean = {
    import org.apache.spark.sql.execution.command.ExplainCommand
    val explain = ExplainCommand(stream.queryExecution.logical, true)
    val res = session.sessionState.executePlan(explain).executedPlan.executeCollect()
    res.exists((row: InternalRow) =>
      row.getString(0).contains("org.apache.spark.sql.execution.streaming.sources.RateStreamProvider"))
  }
}

case class TestContextException(portName: String, msg: String) extends RuntimeException(msg)
Example 7
Source File: StreamingTestHelper.scala, from spark-acid (Apache License 2.0)
package com.qubole.spark.hiveacid.streaming

import java.io.{File, IOException}
import java.util.UUID

import com.qubole.spark.hiveacid.TestHelper

import org.apache.spark.network.util.JavaUtils
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.streaming.{OutputMode, StreamingQuery}

import org.scalatest.concurrent.TimeLimits
import org.scalatest.time.SpanSugar

class StreamingTestHelper extends TestHelper with TimeLimits {

  import StreamingTestHelper._

  def runStreaming(tableName: String,
                   outputMode: OutputMode,
                   cols: Seq[String],
                   inputRange: Range,
                   options: List[(String, String)] = List.empty): Unit = {

    val inputData = MemoryStream[Int]
    val ds = inputData.toDS()

    val checkpointDir = createCheckpointDir(namePrefix = "stream.checkpoint").getCanonicalPath

    var query: StreamingQuery = null

    try {
      // Starting streaming query
      val writerDf =
        ds.map(i => (i*100, i*10, i))
          .toDF(cols:_*)
          .writeStream
          .format("HiveAcid")
          .option("table", tableName)
          .outputMode(outputMode)
          .option("checkpointLocation", checkpointDir)
          //.start()

      query = options.map { option =>
        writerDf.option(option._1, option._2)
      }.lastOption.getOrElse(writerDf).start()

      // Adding data for streaming query
      inputData.addData(inputRange)
      failAfter(STREAMING_TIMEOUT) {
        query.processAllAvailable()
      }
    } finally {
      if (query != null) {
        // Terminating streaming query
        query.stop()
        deleteCheckpointDir(checkpointDir)
      }
    }
  }

  def deleteCheckpointDir(fileStr: String): Unit = {
    val file = new File(fileStr)
    if (file != null) {
      JavaUtils.deleteRecursively(file)
    }
  }

  def createCheckpointDir(root: String = System.getProperty("java.io.tmpdir"),
                          namePrefix: String = "spark"): File = {

    var attempts = 0
    val maxAttempts = MAX_DIR_CREATION_ATTEMPTS
    var dir: File = null
    while (dir == null) {
      attempts += 1
      if (attempts > maxAttempts) {
        throw new IOException("Failed to create a temp directory (under " + root + ") after " +
          maxAttempts + " attempts!")
      }
      try {
        dir = new File(root, namePrefix + "-" + UUID.randomUUID.toString)
        if (dir.exists() || !dir.mkdirs()) {
          dir = null
        }
      } catch {
        case e: SecurityException => dir = null
      }
    }
    dir.getCanonicalFile
  }
}

object StreamingTestHelper extends TestHelper with SpanSugar {

  val MAX_DIR_CREATION_ATTEMPTS = 10
  val STREAMING_TIMEOUT = 60.seconds
}
Example 8
Source File: TransactionsFlowUnitTest.scala, from kafka-examples (Apache License 2.0)
package com.cloudera.streaming.refapp

import java.sql.Timestamp

import org.scalatest.BeforeAndAfter

import org.apache.spark.sql.execution.streaming.MemoryStream

class TransactionsFlowUnitTest extends UnitTestBase with BeforeAndAfter {
  import testImplicits._

  var transactionsFromStream: MemoryStream[Transaction] = _
  var transactionsFlow: TransactionsFlow = _

  before {
    transactionsFromStream = MemoryStream[Transaction]
    transactionsFlow = new TransactionsFlow(
      spark,
      statesFromCluster,
      customersFromCluster,
      vendorsFromCluster,
      transactionsFromStream = transactionsFromStream
        .toDF.withColumn("timestamp", $"event_timestamp".cast("timestamp")))
  }

  test("Valid records are written to the validTransactions output") {
    val validTransaction = Transaction(
      transaction_id = "1",
      customer_id = Some(1),
      vendor_id = Some(1),
      event_state = Some("CREATED"),
      event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
      price = Some("100"),
      card_type = Some("Credit"))

    testStream(transactionsFlow.validTransactions.select(
      'transaction_id, 'customer_id, 'vendor_id, 'event_state, 'event_timestamp, 'price, 'card_type)) (
      AddData(transactionsFromStream, validTransaction),
      CheckAnswer(validTransaction)
    )
  }

  test("Invalid records are written to the invalidTransactions output") {
    // Note: transactionsFlow.validTransactions and invalidTransactions contain the fields that
    // we used for internal calculations, e.g. for validation.
    // It enables us to check the internal calculations.
    testStream(transactionsFlow.invalidTransactions.select('transaction_id, 'valid_card_type)) (
      AddData(transactionsFromStream, Transaction(
        transaction_id = "2",
        customer_id = Some(1),
        vendor_id = Some(1),
        event_state = Some("CREATED"),
        event_timestamp = Timestamp.valueOf("2018-11-12 09:42:00"),
        price = Some("100"),
        card_type = Some("Invalid"))),
      CheckAnswer(("2", false))
    )
  }
}
Example 9
Source File: StreamingPredictionsSpec.scala, from odsc-east-realish-predictions (Apache License 2.0)
package com.twilio.open.odsc.realish

import java.sql.Timestamp
import java.time.Instant
import java.util.{Random, UUID}

import org.apache.spark.SparkConf
import org.apache.spark.sql.{Encoders, SQLContext, SparkSession}
import org.scalatest.{FunSuite, Matchers}
import org.apache.spark.sql.execution.streaming.MemoryStream
import org.apache.spark.sql.functions._
import org.apache.spark.sql.streaming.{OutputMode, Trigger}

import scala.concurrent.duration._

class StreamingPredictionsSpec extends FunSuite with Matchers with SharedSparkSql {

  override def conf: SparkConf = {
    new SparkConf()
      .setMaster("local[*]")
      .setAppName("odsc-spark-utils")
      .set("spark.ui.enabled", "false")
      .set("spark.app.id", appID)
      .set("spark.driver.host", "localhost")
      .set("spark.sql.session.timeZone", "UTC")
  }

  final val notRandomRandom = {
    val generator = new Random
    generator.setSeed(100L)
    generator
  }

  test("should stream in some mock data for fun") {
    implicit val spark: SparkSession = sparkSql
    import spark.implicits._
    implicit val sqlContext: SQLContext = spark.sqlContext

    implicit val metricEncoder = Encoders.product[Metric]
    val metricData = MemoryStream[Metric]

    val startingInstant = Instant.now()

    val backingData = (1 to 10000).map(offset => {
      val metric = if (offset % 2 == 0) "loss_percentage" else "connect_duration"
      val nextLoss = notRandomRandom.nextDouble() * notRandomRandom.nextInt(100)
      Metric(
        Timestamp.from(startingInstant.minusSeconds(offset)),
        UUID.randomUUID().toString,
        metric,
        value = if (metric == "loss_percentage") nextLoss
                else notRandomRandom.nextDouble() * notRandomRandom.nextInt(240),
        countryCode = if (offset % 8 == 0) "US" else "BR",
        callDirection = if (metric == "loss_percentage") "inbound" else "outbound"
      )
    })
    val processingTimeTrigger = Trigger.ProcessingTime(2.seconds)

    val streamingQuery = metricData.toDF()
      .withWatermark("timestamp", "2 hours")
      .groupBy(col("metric"), col("countryCode"), window($"timestamp", "5 minutes"))
      .agg(
        min("value") as "min",
        avg("value") as "mean",
        max("value") as "max",
        count("*") as "total"
      )
      .writeStream
      .format("memory")
      .queryName("datastream")
      .outputMode(OutputMode.Append())
      .trigger(processingTimeTrigger)
      .start()

    metricData.addData(backingData)
    streamingQuery.processAllAvailable()

    spark.sql("select * from datastream").show(20, false)

    val checkChange = spark.sql("select * from datastream")
      .groupBy("metric", "countryCode")
      .agg(
        sum("total") as "total",
        avg("mean") as "mean"
      )

    checkChange.show(20, false)

    // now can do interesting things with minor back tracking...

    streamingQuery.stop()
  }
}