org.apache.spark.streaming.dstream.DStream Scala Examples
The following examples show how to use org.apache.spark.streaming.dstream.DStream.
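Before the examples, a minimal sketch of typical DStream usage may help frame them: a StreamingContext produces an input DStream, transformations such as flatMap/map/reduceByKey derive new DStreams, and an output operation plus start()/awaitTermination() triggers execution. This is an illustrative sketch only; the application name, host, port, and batch interval are placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

object DStreamWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DStreamWordCount")
    val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches

    // Input DStream from a TCP source (host/port are placeholders)
    val lines: DStream[String] = ssc.socketTextStream("localhost", 9999)

    // Classic word count over each batch
    val counts: DStream[(String, Int)] =
      lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    counts.print()         // output operation
    ssc.start()            // start the streaming computation
    ssc.awaitTermination()
  }
}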
Example 1
Source File: SocketTextSource.scala From spark-cep with Apache License 2.0

package org.apache.spark.sql.streaming.sources

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider}
import org.apache.spark.sql.streaming.StreamPlan
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

class SocketTextSource extends SchemaRelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation = {
    require(parameters.contains("host") &&
      parameters.contains("port") &&
      parameters.contains("messageToRow"))
    val messageToRow = {
      try {
        val clz = Class.forName(parameters("messageToRow"))
        clz.newInstance().asInstanceOf[MessageToRowConverter]
      } catch {
        case e: Exception => sys.error(s"Failed to load class : ${e.toString}")
      }
    }
    new SocketTextRelation(
      parameters("host"),
      parameters("port").toInt,
      messageToRow,
      schema,
      sqlContext)
  }
}

case class SocketTextRelation(
    host: String,
    port: Int,
    messageToRowConverter: MessageToRowConverter,
    val schema: StructType,
    @transient val sqlContext: SQLContext)
  extends StreamBaseRelation with StreamPlan {

  // Currently only support Kafka with String messages
  @transient private val socketStream = streamSqlContext.streamingContext.socketTextStream(
    host, port)

  @transient val stream: DStream[InternalRow] =
    socketStream.map(messageToRowConverter.toRow(_, schema))
}
Example 2
Source File: TestOutputStream.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 3
Source File: JavaTestUtils.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming

import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.reflect.ClassTag
import java.util.{List => JList}
import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStreamLike, JavaDStream, JavaStreamingContext}
import org.apache.spark.streaming._
import java.util.ArrayList
import collection.JavaConversions._
import org.apache.spark.api.java.JavaRDDLike
import org.apache.spark.streaming.dstream.DStream

  def runStreamsWithPartitions[V](ssc: JavaStreamingContext, numBatches: Int,
      numExpectedOutput: Int): JList[JList[JList[V]]] = {
    implicit val cm: ClassTag[V] =
      implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
    val res = runStreamsWithPartitions[V](ssc.ssc, numBatches, numExpectedOutput)
    val out = new ArrayList[JList[JList[V]]]()
    res.map { entry =>
      val lists = entry.map(new ArrayList[V](_))
      out.append(new ArrayList[JList[V]](lists))
    }
    out
  }
}

object JavaTestUtils extends JavaTestBase {
  override def maxWaitTimeMillis = 20000
}

object JavaCheckpointTestUtils extends JavaTestBase {
  override def actuallyWait = true
}
Example 4
Source File: TestOutputStream.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 5
Source File: WeatherDataStream.scala From spark-scala with Creative Commons Zero v1.0 Universal

package com.supergloo

import com.killrweather.data.Weather.RawWeatherData
import kafka.serializer.StringDecoder
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils

    parsedWeatherStream.map { weather =>
      (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip)
    }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip)
  }

  def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = {
    val parsedWeatherStream = rawWeatherStream.map(_._2.split(","))
      .map(RawWeatherData(_))
    parsedWeatherStream
  }
}
Example 6
Source File: JavaTestUtils.scala From iolap with Apache License 2.0

package org.apache.spark.streaming

import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.reflect.ClassTag
import java.util.{List => JList}
import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStreamLike, JavaDStream, JavaStreamingContext}
import org.apache.spark.streaming._
import java.util.ArrayList
import collection.JavaConversions._
import org.apache.spark.api.java.JavaRDDLike
import org.apache.spark.streaming.dstream.DStream

  def runStreamsWithPartitions[V](ssc: JavaStreamingContext, numBatches: Int,
      numExpectedOutput: Int): JList[JList[JList[V]]] = {
    implicit val cm: ClassTag[V] =
      implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
    val res = runStreamsWithPartitions[V](ssc.ssc, numBatches, numExpectedOutput)
    val out = new ArrayList[JList[JList[V]]]()
    res.map { entry =>
      val lists = entry.map(new ArrayList[V](_))
      out.append(new ArrayList[JList[V]](lists))
    }
    out
  }
}

object JavaTestUtils extends JavaTestBase {
  override def maxWaitTimeMillis = 20000
}

object JavaCheckpointTestUtils extends JavaTestBase {
  override def actuallyWait = true
}
Example 7
Source File: MQTTUtils.scala From iolap with Apache License 2.0

package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream}
import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream}

object MQTTUtils {
  def createStream(
      jssc: JavaStreamingContext,
      brokerUrl: String,
      topic: String,
      storageLevel: StorageLevel
    ): JavaReceiverInputDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, brokerUrl, topic, storageLevel)
  }
}
Example 8
Source File: TestOutputStream.scala From iolap with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 9
Source File: TestOutputStream.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 10
Source File: StreamPlan.scala From spark-cep with Apache License 2.0

package org.apache.spark.sql.streaming

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming] object StreamPlan {
  val currentContext = new ThreadLocal[StreamSQLContext]()
}

trait StreamPlan {
  protected var validTime: Time = null

  def streamSqlContext = StreamPlan.currentContext.get()

  def stream: DStream[InternalRow]

  def setValidTime(time: Time): Unit = {
    validTime = time
  }
}
Example 11
Source File: TestOutputStream.scala From BigDatalog with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 12
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0

package org.apache.spark.sql.streaming

import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming] case class PhysicalDStream(output: Seq[Attribute],
    @transient stream: DStream[InternalRow])
  extends SparkPlan with StreamPlan {

  def children = Nil

  override def doExecute() = {
    assert(validTime != null)
    Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime))
      .asInstanceOf[Option[RDD[InternalRow]]]
      .getOrElse(new EmptyRDD[InternalRow](sparkContext))
  }
}
Example 13
Source File: SavingStream.scala From cuesheet with Apache License 2.0

package com.kakao.cuesheet.convert

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}
import scala.reflect.runtime.universe.TypeTag

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}

  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }
}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
}
Example 14
Source File: ScalaCheckStreamingTest.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.streaming

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.execute.{AsResult, Result}
import org.scalacheck.{Prop, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration}
import org.apache.spark.streaming.dstream.DStream
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.prop.tl.DStreamTLProperty
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._

@RunWith(classOf[JUnitRunner])
class ScalaCheckStreamingTest
  extends org.specs2.Specification
  with DStreamTLProperty
  with org.specs2.matcher.ResultMatchers
  with ScalaCheck {

  override def sparkMaster : String = "local[5]"
  override def batchDuration = Duration(350)
  override def defaultParallelism = 4

  def is =
    sequential ^ s2"""
    Simple properties for Spark Streaming
      - where the first property is a success $prop1
      - where a simple property for DStream.count is a success ${countProp(_.count)}
      - where a faulty implementation of the DStream.count is detected ${countProp(faultyCount) must beFailing}
    """

  def prop1 = {
    val batchSize = 30
    val numBatches = 10
    val dsgenSeqSeq1 = {
      val zeroSeqSeq = Gen.listOfN(numBatches, Gen.listOfN(batchSize, 0))
      val oneSeqSeq = Gen.listOfN(numBatches, Gen.listOfN(batchSize, 1))
      Gen.oneOf(zeroSeqSeq, oneSeqSeq)
    }
    type U = (RDD[Int], RDD[Int])

    forAllDStream[Int, Int](
      "inputDStream" |: dsgenSeqSeq1)(
      (inputDs : DStream[Int]) => {
        val transformedDs = inputDs.map(_+1)
        transformedDs
      })(always ((u : U) => {
        val (inputBatch, transBatch) = u
        inputBatch.count === batchSize and
        inputBatch.count === transBatch.count and
        (inputBatch.intersection(transBatch).isEmpty should beTrue) and
        ( inputBatch should foreachRecord(_ == 0) or
          (inputBatch should foreachRecord(_ == 1))
        )
      }) during numBatches
    )}.set(minTestsOk = 10).verbose

  def faultyCount(ds : DStream[Double]) : DStream[Long] =
    ds.count.transform(_.map(_ - 1))

  def countProp(testSubject : DStream[Double] => DStream[Long]) = {
    type U = (RDD[Double], RDD[Long])
    val numBatches = 10
    forAllDStream[Double, Long](
      Gen.listOfN(numBatches, Gen.listOfN(30, arbitrary[Double])))(
      testSubject
      )(always ((u : U) => {
        val (inputBatch, transBatch) = u
        transBatch.count === 1 and
        inputBatch.count === transBatch.first
      }) during numBatches
    )}.set(minTestsOk = 10).verbose
}
Example 15
Source File: SimpleStreamingFormulas.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.simple

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}
import org.scalacheck.Gen
import es.ucm.fdi.sscheck.gen.PDStream
import es.ucm.fdi.sscheck.gen.Batch

@RunWith(classOf[JUnitRunner])
class SimpleStreamingFormulas
  extends org.specs2.Specification
  with DStreamTLProperty
  with org.specs2.ScalaCheck {

  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(50)
  override def defaultParallelism = 4

  def is =
    sequential ^ s2"""
    Simple demo Specs2 example for ScalaCheck properties with temporal
    formulas on Spark Streaming programs
      - Given a stream of integers
        When we filter out negative numbers
        Then we get only numbers greater or equal to zero $filterOutNegativeGetGeqZero
      - where time increments for each batch $timeIncreasesMonotonically
    """

  def filterOutNegativeGetGeqZero = {
    type U = (RDD[Int], RDD[Int])
    val numBatches = 10
    val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Int]), numBatches)
    val formula = always(nowTime[U]{ (letter, time) =>
      val (_input, output) = letter
      output should foreachRecord {_ >= 0}
    }) during numBatches

    forAllDStream(
      gen)(
      _.filter{ x => !(x < 0)})(
      formula)
  }.set(minTestsOk = 50).verbose

  def timeIncreasesMonotonically = {
    type U = (RDD[Int], RDD[Int])
    val numBatches = 10
    val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Int]))

    val formula = always(nextTime[U]{ (letter, time) =>
      nowTime[U]{ (nextLetter, nextTime) =>
        time.millis <= nextTime.millis
      }
    }) during numBatches-1

    forAllDStream(
      gen)(
      identity[DStream[Int]])(
      formula)
  }.set(minTestsOk = 10).verbose
}
Example 16
Source File: StreamingFormulaDemo2.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.demo

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.Specification
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.DStream._
import scalaz.syntax.std.boolean._
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}
import es.ucm.fdi.sscheck.gen.BatchGenConversions._
import es.ucm.fdi.sscheck.gen.PDStreamGenConversions._
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._

@RunWith(classOf[JUnitRunner])
class StreamingFormulaDemo2
  extends Specification
  with DStreamTLProperty
  with ResultMatchers
  with ScalaCheck {

  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(300)
  override def defaultParallelism = 3
  override def enableCheckpointing = true

  def is =
    sequential ^ s2"""
    Check process to persistently detect and ban bad users
      - where a stateful implementation extracts the banned users correctly ${checkExtractBannedUsersList(listBannedUsers)}
      - where a trivial implementation ${checkExtractBannedUsersList(statelessListBannedUsers) must beFailing}
    """

  type UserId = Long

  def listBannedUsers(ds : DStream[(UserId, Boolean)]) : DStream[UserId] =
    ds.updateStateByKey((flags : Seq[Boolean], maybeFlagged : Option[Unit]) =>
      maybeFlagged match {
        case Some(_) => maybeFlagged
        case None => flags.contains(false) option {()}
      }
    ).transform(_.keys)

  def statelessListBannedUsers(ds : DStream[(UserId, Boolean)]) : DStream[UserId] =
    ds.map(_._1)

  def checkExtractBannedUsersList(testSubject : DStream[(UserId, Boolean)] => DStream[UserId]) = {
    val batchSize = 20
    val (headTimeout, tailTimeout, nestedTimeout) = (10, 10, 5)
    val (badId, ids) = (15L, Gen.choose(1L, 50L))
    val goodBatch = BatchGen.ofN(batchSize, ids.map((_, true)))
    val badBatch = goodBatch + BatchGen.ofN(1, (badId, false))
    val gen = BatchGen.until(goodBatch, badBatch, headTimeout) ++
              BatchGen.always(Gen.oneOf(goodBatch, badBatch), tailTimeout)

    type U = (RDD[(UserId, Boolean)], RDD[UserId])
    val (inBatch, outBatch) = ((_ : U)._1, (_ : U)._2)

    val formula = {
      val badInput = at(inBatch)(_ should existsRecord(_ == (badId, false)))
      val allGoodInputs = at(inBatch)(_ should foreachRecord(_._2 == true))
      val noIdBanned = at(outBatch)(_.isEmpty)
      val badIdBanned = at(outBatch)(_ should existsRecord(_ == badId))

      ( ( allGoodInputs and noIdBanned ) until badIdBanned on headTimeout ) and
      ( always { badInput ==> (always(badIdBanned) during nestedTimeout) } during tailTimeout )
    }

    forAllDStream(
      gen)(
      testSubject)(
      formula)
  }.set(minTestsOk = 10).verbose
}
Example 17
Source File: StreamingFormulaDemo1.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.demo

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.Specification
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}

@RunWith(classOf[JUnitRunner])
class StreamingFormulaDemo1
  extends Specification
  with DStreamTLProperty
  with ResultMatchers
  with ScalaCheck {

  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(150)
  override def defaultParallelism = 4

  def is =
    sequential ^ s2"""
    Simple demo Specs2 example for ScalaCheck properties with temporal
    formulas on Spark Streaming programs
      - where a simple property for DStream.count is a success ${countForallAlwaysProp(_.count)}
      - where a faulty implementation of the DStream.count is detected ${countForallAlwaysProp(faultyCount) must beFailing}
    """

  def faultyCount(ds : DStream[Double]) : DStream[Long] =
    ds.count.transform(_.map(_ - 1))

  def countForallAlwaysProp(testSubject : DStream[Double] => DStream[Long]) = {
    type U = (RDD[Double], RDD[Long])
    val (inBatch, transBatch) = ((_ : U)._1, (_ : U)._2)
    val numBatches = 10
    val formula : Formula[U] = always { (u : U) =>
      transBatch(u).count === 1 and
      inBatch(u).count === transBatch(u).first
    } during numBatches

    val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Double]), numBatches)

    forAllDStream(
      gen)(
      testSubject)(
      formula)
  }.set(minTestsOk = 10).verbose
}
Example 18
Source File: TestOutputStream.scala From sparkoscope with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 19
Source File: StreamingActionBase.scala From spark-testing-base with Apache License 2.0

package com.holdenkarau.spark.testing

import org.apache.spark.streaming.TestStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener}
import org.apache.spark.streaming.util.TestManualClock
import org.scalatest.Suite

import scala.reflect.ClassTag

  def runAction[U: ClassTag](input: Seq[Seq[U]], operation: DStream[U] => Unit) {
    val numBatches_ = input.size
    withStreamingContext(setupStream[U](input, operation)) { ssc =>
      runActionStream(ssc, numBatches_)
    }
  }

  private def withStreamingContext(outputStreamSSC: TestStreamingContext)
      (block: TestStreamingContext => Unit): Unit = {
    try {
      block(outputStreamSSC)
    } finally {
      try {
        outputStreamSSC.stop(stopSparkContext = false)
      } catch {
        case e: Throwable =>
          logError("Error stopping StreamingContext", e)
      }
    }
  }

  private def setupStream[U: ClassTag](input: Seq[Seq[U]],
      operation: DStream[U] => Any): TestStreamingContext = {
    // Create TestStreamingContext
    val ssc = new TestStreamingContext(sc, batchDuration)
    ssc.addStreamingListener(batchCountListener)
    if (checkpointDir != null) {
      ssc.checkpoint(checkpointDir)
    }

    // Setup the stream computation
    val inputStream = createTestInputStream(sc, ssc, input)
    operation(inputStream)
    ssc
  }

  private def runActionStream(ssc: TestStreamingContext, numBatches: Int) {
    assert(numBatches > 0, "Number of batches to run stream computation is zero")
    batchCountListener.batchCount = 0

    // Start computation
    ssc.start()

    // Advance manual clock
    val clock = ssc.getScheduler().clock.asInstanceOf[TestManualClock]
    logInfo("Manual clock before advancing = " + clock.currentTime())
    if (actuallyWait) {
      for (i <- 1 to numBatches) {
        logInfo("Actually waiting for " + batchDuration)
        clock.addToTime(batchDuration.milliseconds)
        Thread.sleep(batchDuration.milliseconds)
      }
    } else {
      clock.addToTime(numBatches * batchDuration.milliseconds)
    }
    logInfo("Manual clock after advancing = " + clock.currentTime())

    // wait for expected number of batches to execute
    val startTime = System.currentTimeMillis()
    while (batchCountListener.batchCount < numBatches &&
      System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
      logInfo(s"batches: run = ${batchCountListener.batchCount} " +
        s"target = ${numBatches}")
      ssc.awaitTerminationOrTimeout(50)
    }
    val timeTaken = System.currentTimeMillis() - startTime

    logInfo("Output generated in " + timeTaken + " milliseconds")
    Thread.sleep(100) // Give some time for the forgetting old RDDs to complete
  }
}

class BatchCountListener extends StreamingListener {
  var batchCount = 0

  override def onBatchCompleted(
      batchCompleted: StreamingListenerBatchCompleted): Unit = {
    batchCount = batchCount + 1
  }
}
Example 20
Source File: RabbitMQDistributedInput.scala From sparta with Apache License 2.0

package com.stratio.sparta.plugin.input.rabbitmq

import java.io.{Serializable => JSerializable}

import com.stratio.sparta.plugin.input.rabbitmq.handler.MessageHandler
import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.rabbitmq.RabbitMQUtils._
import org.apache.spark.streaming.rabbitmq.distributed.RabbitMQDistributedKey
import org.apache.spark.streaming.rabbitmq.models.ExchangeAndRouting

import scala.language.implicitConversions
import scala.util.Try

object RabbitMQDistributedInput {
  //Keys from UI
  val DistributedPropertyKey = "distributedProperties"
  val QueuePropertyKey = "distributedQueue"
  val ExchangeNamePropertyKey = "distributedExchangeName"
  val ExchangeTypePropertyKey = "distributedExchangeType"
  val RoutingKeysPropertyKey = "distributedRoutingKeys"
  val HostPropertyKey = "hosts"

  //Default values
  val QueueDefaultValue = "queue"
  val HostDefaultValue = "localhost"
}

class RabbitMQDistributedInput(properties: Map[String, JSerializable])
  extends Input(properties) with RabbitMQGenericProps {

  import RabbitMQDistributedInput._

  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {
    val messageHandler = MessageHandler(properties).handler
    val params = propsWithStorageLevel(sparkStorageLevel)
    createDistributedStream(ssc, getKeys(params), params, messageHandler)
  }

  def getKeys(rabbitMQParams: Map[String, String]): Seq[RabbitMQDistributedKey] = {
    val items = Try(properties.getMapFromJsoneyString(DistributedPropertyKey))
      .getOrElse(Seq.empty[Map[String, String]])
    for (item <- items) yield getKey(item, rabbitMQParams)
  }

  def getKey(params: Map[String, String], rabbitMQParams: Map[String, String]): RabbitMQDistributedKey = {
    val exchangeAndRouting = ExchangeAndRouting(
      params.get(ExchangeNamePropertyKey).notBlank,
      params.get(ExchangeTypePropertyKey).notBlank,
      params.get(RoutingKeysPropertyKey).notBlank
    )
    val hosts = HostPropertyKey -> params.get(HostPropertyKey).notBlankWithDefault(HostDefaultValue)
    val queueName = params.get(QueuePropertyKey).notBlankWithDefault(QueueDefaultValue)

    RabbitMQDistributedKey(
      queueName,
      exchangeAndRouting,
      rabbitMQParams + hosts
    )
  }
}
Example 21
Source File: WordCount.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.UserVisitParser
import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StateSpec, State}

class WordCount() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig) = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    // Project Line to UserVisit, the output means "[IP, [Start Time, Count]]"
    val parsedLine: DStream[(String, (Long, Int))] = lines.map(line => {
      val userVisit = UserVisitParser.parse(line._2)
      (userVisit.getIp, (line._1, 1))
    })

    // Define state mapping function
    val mappingFunc = (ip: String, one: Option[(Long, Int)], state: State[Int]) => {
      if (!one.isDefined) {
        throw new Exception("input value is not defined. It should not happen as we don't use timeout function.")
      }
      val sum = one.get._2 + state.getOption.getOrElse(0)
      state.update(sum)
      (ip, one.get._1)
    }

    val wordCount = parsedLine.mapWithState(StateSpec.function(mappingFunc))

    wordCount.foreachRDD(rdd => rdd.foreachPartition(partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach { case (word, inTime) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if (config.debugMode) println(word + ": " + inTime + ", " + outTime)
      }
    }))
  }
}
Example 22
Source File: SolRSupport.scala From Taxi360 with Apache License 2.0

package com.cloudera.sa.taxi360.streaming.ingestion.solr

import java.net.{ConnectException, SocketException}
import java.util

import org.apache.solr.client.solrj.impl.CloudSolrServer
import org.apache.solr.client.solrj.request.UpdateRequest
import org.apache.solr.common.{SolrException, SolrInputDocument}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream

object SolRSupport {

  def indexDStreamOfDocs(zkHost:String,
                         collection:String,
                         batchSize:Int,
                         docDStream:DStream[SolrInputDocument]): Unit = {
    docDStream.foreachRDD(docRdd => {
      indexDoc(zkHost, collection, batchSize, docRdd)
    })
  }

  def indexDoc(zkHost:String,
               collection:String,
               batchSize:Int,
               docRdd:RDD[SolrInputDocument]): Unit = {
    docRdd.foreachPartition(it => {
      val solrServer = CloudSolRServerBuilder.build(zkHost)

      val batch = new util.ArrayList[SolrInputDocument]()

      while (it.hasNext) {
        val inputDoc = it.next()
        batch.add(inputDoc)
        if (batch.size() >= batchSize)
          sendBatchToSolr(solrServer, collection, batch)
      }
      if (!batch.isEmpty())
        sendBatchToSolr(solrServer, collection, batch)
    })
  }

  def sendBatchToSolr( solrServer: CloudSolrServer,
                       collection:String,
                       batch:util.Collection[SolrInputDocument]) {
    val req = new UpdateRequest()
    req.setParam("collection", collection)

    req.add(batch)

    try {
      solrServer.request(req)
    } catch {
      case e:Exception => {
        if (shouldRetry(e)) {
          try {
            Thread.sleep(2000)
          } catch {
            case e1: InterruptedException => {
              Thread.interrupted()
            }
          }

          try {
            solrServer.request(req)
          } catch {
            case e1: Exception => {
              if (e1.isInstanceOf[RuntimeException]) {
                throw e1.asInstanceOf[RuntimeException]
              } else {
                throw new RuntimeException(e1)
              }
            }
          }
        } else {
          if (e.isInstanceOf[RuntimeException]) {
            throw e.asInstanceOf[RuntimeException]
          } else {
            throw new RuntimeException(e)
          }
        }
      }
    } finally {
      batch.clear()
    }
  }

  def shouldRetry( exc:Exception): Boolean = {
    val rootCause = SolrException.getRootCause(exc)
    rootCause.isInstanceOf[ConnectException] ||
      rootCause.isInstanceOf[SocketException]
  }
}
Example 23
Source File: KafkaStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0

package com.aliyun.odps.spark.examples.streaming.kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaStreamingDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("KafkaStreamingDemo")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    // Use OSS as the checkpoint storage
    ssc.checkpoint("oss://bucket/checkpointDir/")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.1.1:9200,192.168.1.2:9200,192.168.1.3:9200",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Set("event_topic")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )

    val dstream = recordDstream.map(f => (f.key(), f.value()))
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    val wordAndOneDstream: DStream[(String, Int)] = wordsDStream.map((_, 1))
    val result: DStream[(String, Int)] = wordAndOneDstream.reduceByKey(_ + _)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 24
Source File: Kafka2OdpsDemo.scala From MaxCompute-Spark with Apache License 2.0

package com.aliyun.odps.spark.examples.streaming.kafka

import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object Kafka2OdpsDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Use OSS as the checkpoint storage and change this to a valid OSS path. For OSS access details, see
    // https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E
    ssc.checkpoint("oss://bucket/checkpointdir")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Create the Kafka DStream
    val topics = Set("test")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    val dstream = recordDstream.map(f => (f.key(), f.value()))

    // Parse the Kafka data and write it to ODPS
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    wordsDStream.foreachRDD(rdd => {
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      rdd.toDF("id").write.mode("append").saveAsTable("test_table")
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 25
Source File: ClusteringEvaluator.scala From streamDM with Apache License 2.0

package org.apache.spark.streamdm.evaluation

import math._

import org.apache.spark.streamdm.core._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.rdd.RDD

object ClusteringEvaluationUtil {

  def computeAllCentroids(input: RDD[(Example,Double)]): RDD[(Double,Example, Int)] =
    input.map{case (e,c) => (c,e)}.map{case (c,e) => (c,Array(e))}.
      reduceByKey((x,y) => x++y).map{case (c,e) => {
        val clSize = e.length
        val clSum = e.foldLeft(new Example(new NullInstance))(
          (a,x) => a.in match {
            case NullInstance() => new Example(x.in.map(x=>x))
            case _ => new Example(a.in.add(x.in))
          })
        if(clSize>1)
          (c,new Example(clSum.in.map(x=>x/clSize)),clSize)
        else
          (c,clSum,1)
      }}
}
Example 26
Source File: SWNearestNeighbors.scala From streamDM with Apache License 2.0

package org.apache.spark.streamdm.outlier

import com.github.javacliparser.{FlagOption, IntOption}
import org.apache.spark.internal.Logging
import org.apache.spark.streamdm.core.Example
import org.apache.spark.streamdm.core.specification.ExampleSpecification
import org.apache.spark.streaming.dstream.DStream

import scala.collection.mutable.Queue

  def outlierness(example: Example): Double = {
    val distances = window.map(p => p.in.distanceTo(example.in))

    if(!distances.isEmpty) {
      val aggDistance = distances.reduce((d1, d2) => (d1 + d2)) / distances.size

      if(debug)
        logInfo("outlierness, %f, {%s}, %s, %d".format(
          aggDistance,
          example.in.getFeatureIndexArray().map(ins => ins._1).mkString(";"),
          example.out.getFeatureIndexArray().map(ins => ins._1).mkString(" "),
          distances.size))

      aggDistance
    } else {
      0.0
    }
  }
}
Example 27
Source File: StreamingJob.scala From confluent-platform-spark-streaming with Apache License 2.0

package example

import com.typesafe.config.ConfigFactory
import io.confluent.kafka.serializers.KafkaAvroDecoder
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkContext, SparkConf}

object StreamingJob extends App {

  // Get job configuration
  val config = ConfigFactory.load()

  Logger.getLogger("example").setLevel(Level.toLevel(config.getString("loglevel")))
  private val logger = Logger.getLogger(getClass)

  // Spark config and contexts
  val sparkMaster = config.getString("spark.master")
  val sparkConf = new SparkConf()
    .setMaster(sparkMaster)
    .setAppName("StreamingExample")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  val sc = new SparkContext(sparkConf)

  val batchInterval = config.getInt("spark.batch.interval")
  val ssc = new StreamingContext(sc, Seconds(batchInterval))

  // Create Kafka stream
  val groupId = config.getString("kafka.group.id")
  val topic = config.getString("topic")
  val kafkaParams = Map(
    "bootstrap.servers" -> config.getString("kafka.bootstrap.servers"),
    "schema.registry.url" -> config.getString("kafka.schema.registry.url"),
    "group.id" -> groupId
  )

  @transient val kafkaStream: DStream[(String, Object)] =
    KafkaUtils.createDirectStream[String, Object, StringDecoder, KafkaAvroDecoder](
      ssc, kafkaParams, Set(topic)
    )

  // Load JSON strings into DataFrame
  kafkaStream.foreachRDD { rdd =>
    // Get the singleton instance of SQLContext
    val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
    import sqlContext.implicits._

    val topicValueStrings = rdd.map(_._2.toString)
    val df = sqlContext.read.json(topicValueStrings)

    df.printSchema()
    println("DataFrame count: " + df.count())
    df.take(1).foreach(println)
  }

  ssc.start()
  ssc.awaitTermination()
}
Example 28
Source File: Repartition.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.dstream.DStream

class Repartition() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.repartition(config.coreNumber).foreachRDD(rdd => rdd.foreachPartition(partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach { case (inTime, content) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if (config.debugMode) {
          println("Event: " + inTime + ", " + outTime)
        }
      }
    }))
  }
}
Example 29
Source File: Identity.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.dstream.DStream

class Identity() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.foreachRDD(rdd => rdd.foreachPartition(partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach { case (inTime, content) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if (config.debugMode) {
          println("Event: " + inTime + ", " + outTime)
        }
      }
    }))
  }
}
Example 30
Source File: FixWindow.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.UserVisitParser
import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream

class FixWindow(duration: Long, slideStep: Long) extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.window(Duration(duration), Duration(slideStep)).map {
      case (inTime, line) => {
        val uv = UserVisitParser.parse(line)
        (uv.getIp, (inTime, 1))
      }
    }.reduceByKey((value, result) => {
      // maintain the min time of this window and count record number
      (Math.min(value._1, result._1), value._2 + result._2)
    }).foreachRDD(rdd => rdd.foreachPartition(results => {
      // report back to kafka
      val reporter = new KafkaReporter(reportTopic, brokerList)
      val outTime = System.currentTimeMillis()

      results.foreach(res => {
        (1 to (res._2._2)).foreach { _ =>
          reporter.report(res._2._1, outTime)
          if (config.debugMode) {
            println("Event: " + res._2._1 + ", " + outTime)
          }
        }
      })
    }))
  }
}
Example 31
Source File: FlumeInput.scala From sparta with Apache License 2.0

package com.stratio.sparta.plugin.input.flume

import java.io.Serializable
import java.net.InetSocketAddress

import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.FlumeUtils

class FlumeInput(properties: Map[String, Serializable]) extends Input(properties) {

  val DEFAULT_FLUME_PORT = 11999
  val DEFAULT_ENABLE_DECOMPRESSION = false
  val DEFAULT_MAXBATCHSIZE = 1000
  val DEFAULT_PARALLELISM = 5

  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {
    if (properties.getString("type").equalsIgnoreCase("pull")) {
      FlumeUtils.createPollingStream(
        ssc,
        getAddresses,
        storageLevel(sparkStorageLevel),
        maxBatchSize,
        parallelism
      ).map(data => Row(data.event.getBody.array))
    } else {
      // push
      FlumeUtils.createStream(
        ssc,
        properties.getString("hostname"),
        properties.getString("port").toInt,
        storageLevel(sparkStorageLevel),
        enableDecompression
      ).map(data => Row(data.event.getBody.array))
    }
  }

  private def getAddresses: Seq[InetSocketAddress] =
    properties.getMapFromJsoneyString("addresses")
      .map(values => (values.get("host"), values.get("port")))
      .map {
        case (Some(address), None) =>
          new InetSocketAddress(address, DEFAULT_FLUME_PORT)
        case (Some(address), Some(port)) =>
          new InetSocketAddress(address, port.toInt)
        case _ =>
          throw new IllegalStateException(s"Invalid configuration value for addresses : ${properties.get("addresses")}")
      }

  private def enableDecompression: Boolean =
    properties.hasKey("enableDecompression") match {
      case true => properties.getBoolean("enableDecompression")
      case false => DEFAULT_ENABLE_DECOMPRESSION
    }

  private def parallelism: Int = {
    properties.hasKey("parallelism") match {
      case true => properties.getString("parallelism").toInt
      case false => DEFAULT_PARALLELISM
    }
  }

  private def maxBatchSize: Int =
    properties.hasKey("maxBatchSize") match {
      case true => properties.getString("maxBatchSize").toInt
      case false => DEFAULT_MAXBATCHSIZE
    }
}
Example 32
Source File: TwitterPopularTagsTest.scala From apache-spark-test with Apache License 2.0

package com.github.dnvriend.spark.streaming.twitter

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.Tweet
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.{ DStream, ReceiverInputDStream }
import org.apache.spark.streaming.twitter.TwitterUtils
import org.scalatest.Ignore
import pprint.Config.Colors.PPrintConfig
import pprint._
import twitter4j.Status

// see: https://dev.twitter.com/streaming/overview
// see: https://dev.twitter.com/streaming/public
// see: https://support.twitter.com/articles/20174643
// see: https://github.com/apache/bahir/blob/master/streaming-twitter/examples/src/main/scala/org/apache/spark/examples/streaming/twitter/TwitterPopularTags.scala
// see: http://blog.originate.com/blog/2014/06/15/idiomatic-scala-your-options-do-not-match/

@Ignore
class TwitterPopularTagsTest extends TestSpec {

  it should "find popular tags" in withStreamingContext(2, await = true) { spark => ssc =>
    //    val filters = Array("#scala", "#akka", "#spark", "@scala", "@akka", "@spark")
    val filters = Array("#summercamp", "#akka", "#scala", "#fastdata", "#spark", "#hadoop")
    val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)

    val msgs: DStream[Tweet] = stream
      .map(Tweet(_))

    msgs.foreachRDD { rdd =>
      rdd.take(10).foreach(pprint.pprintln)
    }

    val hashTags: DStream[String] = stream
      .filter(_.getLang == "en")
      .flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 = hashTags
      .map((_, 1))
      .reduceByKeyAndWindow(_ + _, Seconds(60))
      .map { case (topic, count) => (count, topic) }
      .transform(_.sortByKey(ascending = false))

    val topCounts10 = hashTags
      .map((_, 1))
      .reduceByKeyAndWindow(_ + _, Seconds(10))
      .map { case (topic, count) => (count, topic) }
      .transform(_.sortByKey(false))

    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    ssc.start()
  }
}
Example 33
Source File: KafkaFlowExample.scala From kafka-scala-api with Apache License 2.0

package com.example.flow

import org.apache.spark.streaming.dstream.DStream._
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.joda.time.DateTime
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._

import scala.util.Try

case class Purchase(item_id: String, amount: BigDecimal, time: Long)
case class Key(item_id: String, time: DateTime)
case class Summary(item_id: String, time: DateTime, total: BigDecimal)

object KafkaFlowExample {
  implicit val formats = DefaultFormats

  def extract(message: String): Option[(Key, BigDecimal)] = {
    for {
      parsed <- Try(parse(message)).toOption
      purchase <- parsed.extractOpt[Purchase]
    } yield {
      val datetime = new DateTime(purchase.time)
      val roundedTime = datetime.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)
      Key(purchase.item_id, roundedTime) -> purchase.amount
    }
  }

  def transformStream(stream: InputDStream[String]): DStream[Summary] = {
    stream
      .flatMap(extract)
      .reduceByKey(_ + _)
      .map { case (key, amount) =>
        Summary(key.item_id, key.time, amount)
      }
  }
}
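For orientation, a hedged sketch of how `transformStream` above might be wired into a driver program: it only requires an InputDStream[String], so a socket source stands in here for the Kafka consumer used in the original project. The host, port, and batch interval are placeholders.

import com.example.flow.KafkaFlowExample
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaFlowExampleApp {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("purchase-flow"), Seconds(10))
    val input = ssc.socketTextStream("localhost", 9999) // placeholder source emitting JSON purchases
    KafkaFlowExample.transformStream(input).print()     // hourly purchase summaries per item
    ssc.start()
    ssc.awaitTermination()
  }
}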
Example 34
Source File: batchStream.scala From Clustering4Ever with Apache License 2.0

package org.clustering4ever.spark.clustering.batchstream

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.rdd.RDD

import org.clustering4ever.spark.streamclustering.{Prototype, PointObj}

class BatchStream(
    var voisinage: Int,
    var decayFactor: Double,
    var lambdaAge : Double,
    var nbNodesToAdd: Int,
    var minWeight: Double,
    var maxAge: Int,
    var alphaErr: Double,
    var d: Double) extends Serializable {

  def this() = this(voisinage = 0, decayFactor = 0.9, lambdaAge = 1.2, nbNodesToAdd = 3,
    minWeight = 1, maxAge = 250, alphaErr = 0.5, d = 0.99)

  var model: BatchStreamModel = new BatchStreamModel()

  def getModel: BatchStreamModel = model

  // Initializing the model.
  def initModelObj(txt: RDD[Array[Double]], dim: Int): BatchStream = {
    val nodes2 = txt.take(2)
    val node1 = nodes2(0)
    val node2 = nodes2(1)
    model.init2NodesObj(node1, node2, dim, 1)
    this
  }

  // Training on the model.
  def trainOnObj(data: DStream[PointObj], gstream: BatchStream, dirSortie: String, dim: Int, nbWind: Int) = {
    val timeUpdates = ArrayBuffer[Long](0L)
    var kk = 1
    data.foreachRDD{ rdd =>
      if ( rdd.count() > 0 ) {
        val initialTimeUpdate = System.currentTimeMillis()
        println("\n<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>--BatchStream--(batch: " + kk + " )..." +
          " rdd.count: " + rdd.count() + " \n")

        // Update model without var affectation
        model = model.updateObj(rdd, gstream, kk, dim)

        timeUpdates += (timeUpdates(timeUpdates.size - 1) + (System.currentTimeMillis() - initialTimeUpdate))
        if (timeUpdates.length > 100) timeUpdates.remove(0)

        if ( (kk == 1) | (kk == nbWind / 9) | (kk == 2 * nbWind / 9) | (kk == 3 * nbWind / 9) |
          (kk == 4 * nbWind / 9) | (kk == 5 * nbWind / 9) | (kk == 6 * nbWind / 9) |
          (kk == 7 * nbWind / 9) | (kk == 8 * nbWind / 9) |
          (kk > (8 * nbWind / 9) + 10 & kk % 10 == 0) | (kk >= nbWind - 2) ) {
          rdd.context.parallelize(model.toStringProto).saveAsTextFile(dirSortie+"/Prototypes-"+kk)
          rdd.context.parallelize(model.toStringOutdatedProto).saveAsTextFile(dirSortie+"/OutdatedProtos-"+kk)
          rdd.context.parallelize(model.edges).saveAsTextFile(dirSortie+"/Edges-"+kk)
          rdd.context.parallelize(model.clusterWeights).saveAsTextFile(dirSortie+"/Weights-"+kk)
          rdd.context.parallelize(timeUpdates).saveAsTextFile(dirSortie+"/timeUpdates-"+kk)
        }

        kk += 1
      }
      else println("-- BatchStream: empty rdd -- rdd.count : "+rdd.count())
    }

    model
  }
}
Example 35
Source File: MetricImplicits.scala From Mastering-Spark-for-Data-Science with MIT License

package io.gzet.timeseries.timely

import java.io.PrintStream
import java.net.Socket
import java.nio.charset.StandardCharsets

import io.gzet.timeseries.SimpleConfig
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{Logging, Partitioner}

object MetricImplicits extends Logging with SimpleConfig {

  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  class MetricPartitioner(partitions: Int) extends Partitioner {
    require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

    override def numPartitions: Int = partitions

    override def getPartition(key: Any): Int = {
      val k = key.asInstanceOf[MetricKey]
      nonNegativeMod(k.metricName.hashCode, partitions)
    }
  }

  implicit class Metrics(rdd: RDD[Metric]) {

    val partitions = rdd.partitions.length
    val partitioner = new MetricPartitioner(partitions)

    def publish() = {
      val sSortedMetricRDD = rdd filter { metric =>
        metric.tags.nonEmpty
      } map { metric =>
        (MetricKey(metric.name, metric.time), metric)
      } repartitionAndSortWithinPartitions partitioner

      sSortedMetricRDD.values foreachPartition { it: Iterator[Metric] =>
        val sock = new Socket(timelyHost, timelyPort)
        val writer = new PrintStream(sock.getOutputStream, true, StandardCharsets.UTF_8.name)

        it foreach { metric =>
          writer.println(metric.toPut)
        }

        writer.flush()
      }
    }
  }

  implicit class MetricStream(stream: DStream[Metric]) {
    def publish() = {
      stream foreachRDD {
        rdd => rdd.publish()
      }
    }
  }
}

case class Metric(name: String, time: Long, value: Double, tags: Map[String, String], viz: Option[String] = None) {
  def toPut = {
    val vizMap = if(viz.isDefined) List("viz" -> viz.get) else List[(String, String)]()
    val strTags = vizMap.union(tags.toList).map({ case (k, v) =>
      s"$k=$v"
    }).mkString(" ")
    s"put $name $time $value $strTags"
  }
}

case class MetricKey(metricName: String, metricTime: Long)

object MetricKey {
  implicit def orderingByMetricDate[A <: MetricKey] : Ordering[A] = {
    Ordering.by(fk => (fk.metricName, fk.metricTime))
  }
}
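As a follow-up, a hedged sketch of how the implicit publish() extension above might be invoked from a driver; the metric names, tag values, and the Timely endpoint resolved through SimpleConfig are assumptions for illustration only.

import io.gzet.timeseries.timely.Metric
import io.gzet.timeseries.timely.MetricImplicits._
import org.apache.spark.{SparkConf, SparkContext}

object PublishExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("timely-publish"))
    val now = System.currentTimeMillis()
    // Hypothetical metrics; tags must be non-empty or publish() filters them out
    val metrics = sc.parallelize(Seq(
      Metric("cpu.load", now, 0.75, Map("host" -> "node1")),
      Metric("cpu.load", now, 0.42, Map("host" -> "node2"))
    ))
    metrics.publish() // partitions by metric name and writes "put ..." lines to the Timely socket
  }
}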
Example 36
Source File: GdeltTagger.scala From Mastering-Spark-for-Data-Science with MIT License

package io.gzet.tagging.gdelt

import java.text.SimpleDateFormat
import java.util.Date

import com.typesafe.config.ConfigFactory
import io.gzet.tagging.classifier.Classifier
import io.gzet.tagging.html.HtmlHandler
import io.gzet.tagging.html.HtmlHandler.Content
import org.apache.spark.Accumulator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.LongAccumulator
import org.elasticsearch.spark._

class GdeltTagger() extends Serializable {

  val config = ConfigFactory.load().getConfig("io.gzet.kappa")
  val isoSdf = "yyyy-MM-dd HH:mm:ss"
  val esIndex = config.getString("gdeltIndex")
  val vectorSize = config.getInt("vectorSize")
  val minProba = config.getDouble("minProba")

  def predict(gdeltStream: DStream[String], batchId: LongAccumulator) = {

    // Extract HTML content
    val gdeltContent = fetchHtmlContent(gdeltStream)

    // Predict each RDD
    gdeltContent foreachRDD { batch =>

      batch.cache()
      val count = batch.count()

      if (count > 0) {

        if (Classifier.model.isDefined) {
          val labels = Classifier.model.get.labels

          // Predict HashTags using latest Twitter model
          val textRdd = batch.map(_.body.get)
          val predictions = Classifier.predictProbabilities(textRdd)
          val taggedGdelt = batch.zip(predictions) map { case (content, probabilities) =>
            val validLabels = probabilities filter { case (label, probability) =>
              probability > minProba
            }

            val labels = validLabels.toSeq
              .sortBy(_._2)
              .reverse
              .map(_._1)

            (content, labels)
          }

          // Saving articles to Elasticsearch
          taggedGdelt map { case (content, hashTags) =>
            gdeltToJson(content, hashTags.toArray)
          } saveToEs esIndex

        } else {

          // Saving articles to Elasticsearch
          batch map { content =>
            gdeltToJson(content, Array())
          } saveToEs esIndex
        }
      }

      batch.unpersist(blocking = false)
    }
  }

  private def gdeltToJson(content: Content, hashTags: Array[String]) = {
    val sdf = new SimpleDateFormat(isoSdf)
    Map(
      "time" -> sdf.format(new Date()),
      "body" -> content.body.get,
      "url" -> content.url,
      "tags" -> hashTags,
      "title" -> content.title
    )
  }

  private def fetchHtmlContent(urlStream: DStream[String]) = {
    urlStream.map(_ -> 1).groupByKey().map(_._1) mapPartitions { urls =>
      val sdf = new SimpleDateFormat(isoSdf)
      val htmlHandler = new HtmlHandler()
      val goose = htmlHandler.getGooseScraper
      urls map { url =>
        htmlHandler.fetchUrl(goose, url, sdf)
      }
    } filter { content =>
      content.isDefined &&
        content.get.body.isDefined &&
        content.get.body.get.length > 255
    } map { content =>
      content.get
    }
  }
}
Example 37
Source File: KappaTagging.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.tagging import com.typesafe.config.ConfigFactory import io.gzet.tagging.gdelt.GdeltTagger import io.gzet.tagging.twitter.TwitterHIS import org.apache.spark.SparkConf import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import twitter4j.Status import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder object KappaTagging { final val config = ConfigFactory.load().getConfig("io.gzet.kappa") final val esNodes = config.getString("esNodes") final val batchSize = config.getInt("batchSize") def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("GDELT Kappa tagging") val ssc = new StreamingContext(sparkConf, Seconds(batchSize)) val sc = ssc.sparkContext // Create a counter that can be shared accross batches val batchId = sc.longAccumulator("GZET") val twitterStream = createTwitterStream(ssc, Array[String]()) val twitterProcessor = new TwitterHIS() twitterProcessor.train(twitterStream, batchId) val gdeltStream = createGdeltStream(ssc) val gdeltProcessor = new GdeltTagger() gdeltProcessor.predict(gdeltStream, batchId) ssc.start() ssc.awaitTermination() } private def createTwitterStream(ssc: StreamingContext, filters: Array[String]): DStream[Status] = { TwitterUtils.createStream( ssc, getTwitterConfiguration, filters ) } private def getTwitterConfiguration = { val builder = new ConfigurationBuilder() builder.setOAuthConsumerKey(config.getString("apiKey")) builder.setOAuthConsumerSecret(config.getString("apiSecret")) builder.setOAuthAccessToken(config.getString("tokenKey")) builder.setOAuthAccessTokenSecret(config.getString("tokenSecret")) val configuration = builder.build() Some(new OAuthAuthorization(configuration)) } private def createGdeltStream(ssc: StreamingContext) = { val topics = Map( config.getString("kafkaTopic") -> config.getInt("kafkaTopicPartition") ) KafkaUtils.createStream( ssc, config.getString("zkQuorum"), config.getString("kafkaGroupId"), topics ).map(_._2) } }
Example 38
Source File: TrendingHashtags.scala From dataproc-pubsub-spark-streaming with Apache License 2.0 | 5 votes |
package demo import java.nio.charset.StandardCharsets import com.google.cloud.datastore._ import demo.DataStoreConverter.saveRDDtoDataStore import demo.HashTagsStreaming.processTrendingHashTags import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.pubsub.{PubsubUtils, SparkGCPCredentials} import org.apache.spark.streaming.{Seconds, StreamingContext} object TrendingHashtags { def createContext(projectID: String, windowLength: String, slidingInterval: String, checkpointDirectory: String) : StreamingContext = { // [START stream_setup] val sparkConf = new SparkConf().setAppName("TrendingHashtags") val ssc = new StreamingContext(sparkConf, Seconds(slidingInterval.toInt)) // Set the checkpoint directory val yarnTags = sparkConf.get("spark.yarn.tags") val jobId = yarnTags.split(",").filter(_.startsWith("dataproc_job")).head ssc.checkpoint(checkpointDirectory + '/' + jobId) // Create stream val messagesStream: DStream[String] = PubsubUtils .createStream( ssc, projectID, None, "tweets-subscription", // Cloud Pub/Sub subscription for incoming tweets SparkGCPCredentials.builder.build(), StorageLevel.MEMORY_AND_DISK_SER_2) .map(message => new String(message.getData(), StandardCharsets.UTF_8)) // [END stream_setup] //process the stream processTrendingHashTags(messagesStream, windowLength.toInt, slidingInterval.toInt, 10, //decoupled handler that saves each separate result for processed to datastore saveRDDtoDataStore(_, windowLength.toInt) ) ssc } def main(args: Array[String]): Unit = { if (args.length != 5) { System.err.println( """ | Usage: TrendingHashtags <projectID> <windowLength> <slidingInterval> <totalRunningTime> | | <projectID>: ID of Google Cloud project | <windowLength>: The duration of the window, in seconds | <slidingInterval>: The interval at which the window calculation is performed, in seconds | <totalRunningTime>: Total running time for the application, in minutes. If 0, runs indefinitely until termination. | <checkpointDirectory>: Directory used to store RDD checkpoint data | """.stripMargin) System.exit(1) } val Seq(projectID, windowLength, slidingInterval, totalRunningTime, checkpointDirectory) = args.toSeq // Create Spark context val ssc = StreamingContext.getOrCreate(checkpointDirectory, () => createContext(projectID, windowLength, slidingInterval, checkpointDirectory)) // Start streaming until we receive an explicit termination ssc.start() if (totalRunningTime.toInt == 0) { ssc.awaitTermination() } else { ssc.awaitTerminationOrTimeout(1000 * 60 * totalRunningTime.toInt) } } }
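The fault-tolerance hinge in this example is StreamingContext.getOrCreate combined with a checkpoint directory. A stripped-down sketch of that recovery pattern, with a socket source standing in for Cloud Pub/Sub (paths, host and port are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointRecoverySketch extends App {
  val checkpointDir = "/tmp/streaming-checkpoint"

  def createContext(): StreamingContext = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("checkpoint-sketch").setMaster("local[2]"), Seconds(10))
    ssc.checkpoint(checkpointDir)
    // All DStream wiring must live inside the factory so the graph can be
    // rebuilt from the checkpoint after a driver restart
    ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split("\\s+"))
      .countByValue()
      .print()
    ssc
  }

  // Recovers the streaming graph from the checkpoint if one exists, otherwise builds it fresh
  val ssc = StreamingContext.getOrCreate(checkpointDir, () => createContext())
  ssc.start()
  ssc.awaitTermination()
}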
Example 39
Source File: HashTagsStreaming.scala From dataproc-pubsub-spark-streaming with Apache License 2.0 | 5 votes |
package demo import org.apache.spark.rdd.RDD import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.dstream.DStream object HashTagsStreaming { case class Popularity(tag: String, amount: Int) // [START extract] private[demo] def extractTrendingTags(input: RDD[String]): RDD[Popularity] = input.flatMap(_.split("\\s+")) // Split on any white character .filter(_.startsWith("#")) // Keep only the hashtags // Remove punctuation, force to lowercase .map(_.replaceAll("[,.!?:;]", "").toLowerCase) // Remove the first # .map(_.replaceFirst("^#", "")) .filter(!_.isEmpty) // Remove any non-words .map((_, 1)) // Create word count pairs .reduceByKey(_ + _) // Count occurrences .map(r => Popularity(r._1, r._2)) // Sort hashtags by descending number of occurrences .sortBy(r => (-r.amount, r.tag), ascending = true) // [END extract] def processTrendingHashTags(input: DStream[String], windowLength: Int, slidingInterval: Int, n: Int, handler: Array[Popularity] => Unit): Unit = { val sortedHashtags: DStream[Popularity] = input .window(Seconds(windowLength), Seconds(slidingInterval)) //create a window .transform(extractTrendingTags(_)) //apply transformation sortedHashtags.foreachRDD(rdd => { handler(rdd.take(n)) //take top N hashtags and save to external source }) } }
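Because extractTrendingTags is a plain RDD transformation, it can be sanity-checked in a batch job before wiring it into a window. A sketch (kept in package demo so the private[demo] method is visible; the sample tweets are made up):

package demo

import org.apache.spark.{SparkConf, SparkContext}

object ExtractTagsCheck extends App {
  val sc = new SparkContext(new SparkConf().setAppName("tags-check").setMaster("local[2]"))
  val tweets = sc.parallelize(Seq("Loving #Spark!", "#spark and #Scala", "no tags here"))
  // Expected: Popularity(spark,2) then Popularity(scala,1), ordered by descending count
  HashTagsStreaming.extractTrendingTags(tweets).collect().foreach(println)
  sc.stop()
}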
Example 40
Source File: package.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd} import org.apache.spark.streaming.dstream.DStream import org.infinispan.client.hotrod.RemoteCacheManager import org.infinispan.spark.config.ConnectorConfiguration import org.infinispan.spark.rdd.RemoteCacheManagerBuilder package object stream { implicit class InfinispanDStream[K, V](stream: DStream[(K, V)]) { private def getCacheManager(configuration: ConnectorConfiguration): RemoteCacheManager = { val rcm = RemoteCacheManagerBuilder.create(configuration) stream.context.sparkContext.addSparkListener(new SparkListener { override def onJobEnd(jobEnd: SparkListenerJobEnd) = rcm.stop() }) rcm } def writeToInfinispan(configuration: ConnectorConfiguration) = { val rcm = getCacheManager(configuration) val cache = getCache(configuration, rcm) val topologyConfig = getCacheTopology(cache.getCacheTopologyInfo) configuration.setServerList(topologyConfig) stream.foreachRDD(_.writeToInfinispan(configuration)) } } }
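A usage sketch for writeToInfinispan. Only the setServerList setter shown in the listing is used; the no-arg ConnectorConfiguration constructor and the server address are assumptions, and cache selection is left to the connector's defaults:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.stream._

object InfinispanStreamSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("ispn-sketch").setMaster("local[2]"), Seconds(5))

  // Key each line by its hash, keep the raw line as the value
  val pairs = ssc.socketTextStream("localhost", 9999).map(line => line.hashCode -> line)

  val cfg = new ConnectorConfiguration()   // assumed no-arg constructor
  cfg.setServerList("127.0.0.1:11222")     // assumed local Hot Rod endpoint

  // InfinispanDStream from the package object above adds writeToInfinispan to DStream[(K, V)]
  pairs.writeToInfinispan(cfg)

  ssc.start()
  ssc.awaitTermination()
}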
Example 41
Source File: DStreamKafkaWriter.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import org.apache.kafka.clients.producer.{Callback, ProducerRecord} import org.apache.spark.streaming.dstream.DStream import scala.reflect.ClassTag class DStreamKafkaWriter[T: ClassTag](@transient private val dStream: DStream[T]) extends KafkaWriter[T] with Serializable { override def writeToKafka[K, V]( producerConfig: Map[String, Object], transformFunc: T => ProducerRecord[K, V], callback: Option[Callback] = None ): Unit = dStream.foreachRDD { rdd => val rddWriter = new RDDKafkaWriter[T](rdd) rddWriter.writeToKafka(producerConfig, transformFunc, callback) } }
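Sketch of calling writeToKafka on a DStream. The import that exposes the enrichment is not shown in the listing, so this sketch sits in the library's own package (as the spec in Example 50 below does); the broker address and topic are placeholders:

package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DStreamKafkaWriterSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("skr-sketch").setMaster("local[2]"), Seconds(5))

  val producerConfig = Map[String, Object](
    "bootstrap.servers" -> "localhost:9092",
    "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer",
    "value.serializer" -> "org.apache.kafka.common.serialization.StringSerializer"
  )

  // Each line of the socket stream becomes one Kafka record on "demo-topic"
  ssc.socketTextStream("localhost", 9999)
    .writeToKafka(producerConfig, s => new ProducerRecord[String, String]("demo-topic", s))

  ssc.start()
  ssc.awaitTermination()
}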
Example 42
Source File: HttpInputDStreamAsync.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import com.ning.http.client.AsyncCompletionHandler import com.ning.http.client.AsyncHttpClient import com.ning.http.client.Response class HttpInputDStreamAsync( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiverAsync(storageLevel, url) } } class HttpReceiverAsync( storageLevel: StorageLevel, url: String) extends Receiver[String](storageLevel) with Logging { var asyncHttpClient: AsyncHttpClient = _ def onStop() { asyncHttpClient.close() logInfo("Disconnected from Http Server") } def onStart() { asyncHttpClient = new AsyncHttpClient() asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { override def onCompleted(response: Response): Response = { store(response.getResponseBody) return response } override def onThrowable(t: Throwable) { restart("Error! Problems while connecting", t) } }); logInfo("Http Connection initiated") } } object HttpUtilsAsync { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String): DStream[String] = { new HttpInputDStreamAsync(ssc, storageLevel, url) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url) } }
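Sketch of consuming the async HTTP receiver through HttpUtilsAsync.createStream (the URL and the per-batch processing are illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apress.prospark.HttpUtilsAsync

object HttpAsyncSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("http-async").setMaster("local[2]"), Seconds(10))

  // storageLevel keeps its default; only the mandatory arguments are passed
  val responses = HttpUtilsAsync.createStream(ssc, url = "http://localhost:8080/feed")

  responses.map(_.length).print()   // report response payload sizes per batch

  ssc.start()
  ssc.awaitTermination()
}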
Example 43
Source File: HivemallStreamingOps.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import scala.reflect.ClassTag import org.apache.spark.ml.feature.HmLabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.apache.spark.streaming.dstream.DStream final class HivemallStreamingOps(ds: DStream[HmLabeledPoint]) { def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) : DStream[Row] = { ds.transform[Row] { rdd: RDD[HmLabeledPoint] => f(sqlContext.createDataFrame(rdd)).rdd } } } object HivemallStreamingOps { implicit def dataFrameToHivemallStreamingOps(ds: DStream[HmLabeledPoint]) : HivemallStreamingOps = { new HivemallStreamingOps(ds) } }
Example 44
Source File: StreamingTask.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.sparkstress import java.util.concurrent.TimeUnit import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming._ import com.datastax.sparkstress.RowGenerator.PerfRowGenerator import com.datastax.sparkstress.RowTypes._ import com.datastax.sparkstress.SparkStressImplicits._ import com.datastax.sparkstress.StressTask._ import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{StreamingContext, _} import scala.reflect.ClassTag abstract class StreamingTask[rowType]( val config: Config, val ss: SparkSession) (implicit ct:ClassTag[rowType]) extends StressTask { val ssc = new StreamingContext(ss.sparkContext, Seconds(config.streamingBatchIntervalSeconds)) val opsPerBatch = (config.numReceivers * config.receiverThroughputPerBatch) val estimatedReqRuntime: Long = ((config.totalOps / opsPerBatch) * config.streamingBatchIntervalSeconds) + 10 val terminationTime: Long = { if (config.terminationTimeMinutes == 0) { estimatedReqRuntime } else { val newTerminationTime: Long = TimeUnit.MINUTES.toSeconds(config.terminationTimeMinutes) if (estimatedReqRuntime <= newTerminationTime) { println(s"Using the estimated runtime (${estimatedReqRuntime} secs}) required to stream ${config.totalOps} since it is <= the requested runtime (${newTerminationTime} secs).") estimatedReqRuntime } else { println(s"Converting requested runtime of ${config.terminationTimeMinutes} min to ${newTerminationTime} secs.") newTerminationTime } } } def setupCQL() = { val cc = CassandraConnector(ss.sparkContext.getConf) cc.withSessionDo { session => if (config.deleteKeyspace) { println(s"Destroying Keyspace") session.execute(s"DROP KEYSPACE IF EXISTS ${config.keyspace}") } val kscql = getKeyspaceCql(config.keyspace, getLocalDC(cc), config.replicationFactor) val tbcql = getTableCql(config.table) println( s"""Running the following create statements\n$kscql\n${tbcql.mkString("\n")}""") session.execute(kscql) session.execute(s"USE ${config.keyspace}") for (cql <- tbcql) session.execute(cql) } printf("Done Setting up CQL Keyspace/Table\n") } def getTableCql(tbName: String): Seq[String] override def getGenerator: RowGenerator[PerfRowClass] = generator override def dstreamOps(dstream: DStream[PerfRowClass]): Unit = dstream.saveToCassandra(config.keyspace, config.table) }
Example 45
Source File: SparkStreamAdapterExample.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.example.spark //#spark-stream-adapter import com.rbmhtechnology.eventuate._ import com.rbmhtechnology.eventuate.adapter.spark.SparkStreamAdapter import org.apache.spark._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.DStream //# import akka.actor._ import com.rbmhtechnology.eventuate.log.EventLogWriter import com.rbmhtechnology.eventuate.log.leveldb.LeveldbEventLog import scala.collection.immutable._ import scala.io.Source object SparkStreamAdapterExample extends App { implicit val system: ActorSystem = ActorSystem(ReplicationConnection.DefaultRemoteSystemName) val logName: String = "L" val endpoint: ReplicationEndpoint = new ReplicationEndpoint(id = "1", logNames = Set(logName), logFactory = logId => LeveldbEventLog.props(logId), connections = Set()) val log: ActorRef = endpoint.logs(logName) val writer: EventLogWriter = new EventLogWriter("writer", log) endpoint.activate() //#spark-stream-adapter val sparkConfig = new SparkConf(true) .setAppName("adapter") .setMaster("local[4]") val sparkContext = new SparkContext(sparkConfig) val sparkStreamingContext = new StreamingContext(sparkContext, Seconds(1)) // Create an Eventuate Spark stream adapter val sparkStreamAdapter = new SparkStreamAdapter( sparkStreamingContext, system.settings.config) // Create a DStream from event log L by connecting to its replication endpoint val stream: DStream[DurableEvent] = sparkStreamAdapter.eventStream( id = "s1", host = "127.0.0.1", port = 2552, logName = "L", fromSequenceNr = 1L, storageLevel = StorageLevel.MEMORY_ONLY) // For processing in strict event storage order, use repartition(1) stream.repartition(1).foreachRDD(rdd => rdd.foreach(println)) // Start event stream processing sparkStreamingContext.start() //# // Generate new events from stdin val lines = Source.stdin.getLines() def prompt(): Unit = { if (lines.hasNext) lines.next() match { case "exit" => sparkStreamingContext.stop(stopSparkContext = true) system.terminate() case line => writer.write(Seq(line)) prompt() } } prompt() }
Example 46
Source File: MSNBCStreamingAdvanced.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingAdvanced extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999) val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": ")) .map(kv => (kv(0).toInt, kv(1))) val countIds = events.map(e => (e._1, 1)) val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _) def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = { Some(runningCount.getOrElse(0) + newValues.sum) } val runningCounts = countIds.updateStateByKey[Int](updateFunction _) val duration = Seconds(20) val slide = Seconds(10) val rawSequences: DStream[(Int, String)] = events .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide) val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2) .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
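Note that updateStateByKey requires a checkpoint directory to be set before the context starts. A minimal, self-contained sketch of the running-count piece with checkpointing enabled (host, port and path are placeholders; the input format mirrors the "<id>: <page>" lines assumed above):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object RunningCountSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("running-count").setMaster("local[2]"), Seconds(10))
  ssc.checkpoint("/tmp/msnbc-checkpoint")   // required by stateful operations

  val countIds = ssc.socketTextStream("localhost", 9999)
    .map(_.split(": "))
    .map(kv => (kv(0).toInt, 1))

  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] =
    Some(runningCount.getOrElse(0) + newValues.sum)

  countIds.updateStateByKey[Int](updateFunction _).print()

  ssc.start()
  ssc.awaitTermination()
}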
Example 47
Source File: MSNBCStreamingExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingExample extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999) val sequences: DStream[Array[Array[Int]]] = rawSequences .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 48
Source File: PointDStreamExtensionsSpec.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.extensions import com.holdenkarau.spark.testing.StreamingActionBase import com.pygmalios.reactiveinflux.spark._ import com.pygmalios.reactiveinflux._ import org.apache.spark.streaming.dstream.DStream import org.junit.runner.RunWith import org.scalatest.BeforeAndAfterAll import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class PointDStreamExtensionsSpec extends StreamingActionBase with BeforeAndAfterAll { import PointRDDExtensionsSpec._ override def beforeAll: Unit = { super.beforeAll withInflux(_.create()) } override def afterAll: Unit = { withInflux(_.drop()) super.afterAll } test("write single point to Influx") { val points = List(point1) // Execute runAction(Seq(points), (dstream: DStream[Point]) => dstream.saveToInflux()) // Assert val result = withInflux( _.query(Query(s"SELECT * FROM $measurement1")).result.singleSeries) assert(result.rows.size == 1) val row = result.rows.head assert(row.time == point1.time) assert(row.values.size == 5) } }
Example 49
Source File: StreamingExample.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples import com.pygmalios.reactiveinflux._ import com.pygmalios.reactiveinflux.spark._ import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.joda.time.DateTime import scala.concurrent.duration._ object StreamingExample extends App { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Example") val ssc = new StreamingContext(conf, Seconds(1)) val point1 = Point( time = DateTime.now(), measurement = "measurement1", tags = Map( "tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"), fields = Map( "fieldKey1" -> "fieldValue1", "fieldKey2" -> 10.7) ) // Provide settings for reactiveinflux implicit val params = ReactiveInfluxDbName("example") implicit val awaitAtMost = 1.second // Create DStream of Influx points val queue = new scala.collection.mutable.Queue[RDD[Point]] val queueStream: DStream[Point] = ssc.queueStream(queue) // Add single RDD with a single Influx point to the DStream queue.enqueue(ssc.sparkContext.parallelize(Seq(point1))) // Save DStream to Influx queueStream.saveToInflux() // Start Spark streaming ssc.start() ssc.awaitTermination() }
Example 50
Source File: DStreamKafkaWriterSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import org.apache.kafka.clients.producer._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import scala.collection.mutable import scala.concurrent.duration._ class DStreamKafkaWriterSpec extends SKRSpec { "a DStreamKafkaWriter" when { "given a dstream" should { "write its content to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s) ) val results = collect(ssc, localTopic) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { results shouldBe msgs } } "trigger a given callback for every write to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s), Some(new Callback with Serializable { override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = { SKRSpec.callbackTriggerCount.incrementAndGet() } }) ) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { SKRSpec.callbackTriggerCount.get() shouldBe msgs.size } } } } private def createDStream(seq: Seq[String]): DStream[String] = { val q = mutable.Queue.empty[RDD[String]] q.enqueue(ssc.sparkContext.makeRDD(seq)) ssc.queueStream(q) } }
Example 51
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.Timer import java.util.TimerTask import scala.reflect.ClassTag import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.CloseableHttpClient import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver class HttpInputDStream( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String, interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiver(storageLevel, url, interval) } } class HttpReceiver( storageLevel: StorageLevel, url: String, interval: Long) extends Receiver[String](storageLevel) with Logging { var httpClient: CloseableHttpClient = _ var trigger: Timer = _ def onStop() { httpClient.close() logInfo("Disconnected from Http Server") } def onStart() { httpClient = HttpClients.createDefault() trigger = new Timer() trigger.scheduleAtFixedRate(new TimerTask { def run() = doGet() }, 0, interval * 1000) logInfo("Http Receiver initiated") } def doGet() { logInfo("Fetching data from Http source") val response = httpClient.execute(new HttpGet(url)) try { val content = EntityUtils.toString(response.getEntity()) store(content) } catch { case e: Exception => restart("Error! Problems while connecting", e) } finally { response.close() } } } object HttpUtils { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String, interval: Long): DStream[String] = { new HttpInputDStream(ssc, storageLevel, url, interval) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String, interval: Long): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url, interval) } }
Example 52
Source File: SolRSupport.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.solr import java.net.{ConnectException, SocketException} import java.util import org.apache.solr.client.solrj.impl.CloudSolrServer import org.apache.solr.client.solrj.request.UpdateRequest import org.apache.solr.common.{SolrException, SolrInputDocument} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream object SolRSupport { def indexDStreamOfDocs(zkHost:String, collection:String, batchSize:Int, docDStream:DStream[SolrInputDocument]): Unit ={ docDStream.foreachRDD(docRdd => { indexDoc(zkHost, collection, batchSize, docRdd) }) } def indexDoc(zkHost:String, collection:String, batchSize:Int, docRdd:RDD[SolrInputDocument]): Unit = { docRdd.foreachPartition(it => { val solrServer = CloudSolRServerBuilder.build(zkHost) val batch = new util.ArrayList[SolrInputDocument]() while (it.hasNext) { val inputDoc = it.next() batch.add(inputDoc) if (batch.size() >= batchSize) sendBatchToSolr(solrServer, collection, batch) } if (!batch.isEmpty()) sendBatchToSolr(solrServer, collection, batch) }) } def sendBatchToSolr( solrServer: CloudSolrServer, collection:String, batch:util.Collection[SolrInputDocument]) { val req = new UpdateRequest() req.setParam("collection", collection) req.add(batch) try { solrServer.request(req) } catch { case e:Exception => { if (shouldRetry(e)) { try { Thread.sleep(2000) } catch { case e1: InterruptedException => { Thread.interrupted() } } try { solrServer.request(req) } catch { case e1: Exception => { if (e1.isInstanceOf[RuntimeException]) { throw e1.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e1) } } } } else { if (e.isInstanceOf[RuntimeException]) { throw e.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e) } } } } finally { batch.clear() } } def shouldRetry( exc:Exception): Boolean = { val rootCause = SolrException.getRootCause(exc) rootCause.isInstanceOf[ConnectException] || rootCause.isInstanceOf[SocketException] } }
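A wiring sketch for indexDStreamOfDocs (the ZooKeeper connect string, collection name and document fields are placeholders):

import com.hadooparchitecturebook.taxi360.streaming.ingestion.solr.SolRSupport
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SolrIngestSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("solr-ingest").setMaster("local[2]"), Seconds(5))

  // Turn each incoming line into a SolrInputDocument
  val docs = ssc.socketTextStream("localhost", 9999).map { line =>
    val doc = new SolrInputDocument()
    doc.addField("id", line.hashCode.toString)
    doc.addField("text", line)
    doc
  }

  SolRSupport.indexDStreamOfDocs("zk1:2181/solr", "taxi-collection", 100, docs)

  ssc.start()
  ssc.awaitTermination()
}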
Example 53
Source File: StreamingUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package com.cloudera.sa.spark.unittest.streaming import org.apache.spark.rdd.RDD import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} import scala.collection.mutable.Queue class StreamingUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{ @transient var sc: SparkContext = null @transient var ssc: StreamingContext = null override def beforeAll(): Unit = { val envMap = Map[String,String](("Xmx", "512m")) val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sparkConfig.set("spark.io.compression.codec", "lzf") sc = new SparkContext("local[2]", "unit test", sparkConfig) ssc = new StreamingContext(sc, Milliseconds(200)) } override def afterAll(): Unit = { sc.stop() } test("Streaming word count") { val firstBatchRDD = sc.parallelize(Seq("a", "b", "c")) val secondBatchRDD = sc.parallelize(Seq("a", "e")) val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f")) val forthBatchRDD = sc.parallelize(Seq("a", "e")) val queue = new Queue[RDD[String]] queue.+=(firstBatchRDD) queue.+=(secondBatchRDD) queue.+=(thirdBatchRDD) queue.+=(forthBatchRDD) println(queue) val startTime = System.currentTimeMillis() val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1)) //ssc.queueStream(queue) dstream.checkpoint(Seconds(100)) val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _) val streamTotals = batchTotals.updateStateByKey( (seq:Seq[Int], opt:Option[Int]) => { if (!seq.isEmpty) { val totalCountForNew = seq.reduce(_ + _) if (opt.isEmpty) { Option(totalCountForNew) } else { Option(opt.get + totalCountForNew) } } else { opt } }) streamTotals.foreachRDD(rdd => { }) ssc.checkpoint("./tmp") ssc.start() ssc.awaitTerminationOrTimeout(2000) val endTime = System.currentTimeMillis() val rddList = streamTotals.slice(new Time(startTime), new Time(endTime)) rddList(0).collect().foreach(println) assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1) rddList(1).collect().foreach(println) assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(2).collect().foreach(println) assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(3).collect().foreach(println) assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2 == 3) } }
Example 54
Source File: DStreamFunctions.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming import com.basho.riak.spark.rdd.BucketDef import com.basho.riak.spark.rdd.connector.RiakConnector import com.basho.riak.spark.writer.{RiakWriter, WritableToRiak, WriteConf, WriteDataMapperFactory} import com.basho.riak.spark.writer.ts.RowDef import org.apache.spark.SparkContext import org.apache.spark.sql.Row import org.apache.spark.streaming.dstream.DStream class DStreamFunctions[T](dstream: DStream[T]) extends WritableToRiak[T] { override def sparkContext: SparkContext = dstream.context.sparkContext override def saveToRiak(bucketName: String, bucketType: String = BucketDef.DefaultBucketType, writeConf: WriteConf = WriteConf(sparkContext.getConf) )(implicit connector: RiakConnector, vwf: WriteDataMapperFactory[T, (String, Any)] ): Unit = { val writer = RiakWriter[T](connector, bucketType, bucketName, writeConf) dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, writer.write _)) } override def saveToRiakTS(bucketName: String, bucketType: String = BucketDef.DefaultBucketType, writeConf: WriteConf = WriteConf(sparkContext.getConf) )(implicit evidence: <:<[T, Row], connector: RiakConnector, vwf: WriteDataMapperFactory[T, RowDef] ): Unit = { val tsWriter = RiakWriter.tsWriter[T](connector, bucketType, bucketName, writeConf) dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, tsWriter.write _)) } }
Example 55
Source File: SocketTextStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver} import org.apache.spark.streaming.{Seconds, StreamingContext} class SocketTextStream extends ConfigurableStreamingStop { override val authorEmail: String = "[email protected]" override val description: String = "Receive text data from socket" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override var batchDuration: Int = _ var hostname:String =_ var port:String=_ //var schema:String=_ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String] //schema=MapUtil.get(map,key="schema").asInstanceOf[String] val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true) //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor //descriptor = schema :: descriptor descriptor = batchDuration :: descriptor descriptor } //TODO: change icon override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/SocketTextStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession](); val socketDF = spark .readStream .format("socket") .option("host",hostname) .option("port",port) .load() out.write(socketDF) } override def getDStream(ssc: StreamingContext): DStream[String] = { val dstream = ssc.socketTextStream(hostname,Integer.parseInt(port)) dstream.asInstanceOf[DStream[String]] } }
Example 56
Source File: KafkaStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.kafka.common.serialization.StringDeserializer import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka010.KafkaUtils import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe class KafkaStream extends ConfigurableStreamingStop{ override var batchDuration: Int = _ override val authorEmail: String = "[email protected]" override val description: String = "Read data from kafka" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var brokers:String = _ var groupId:String = _ var topics:Array[String] = _ override def setProperties(map: Map[String, Any]): Unit = { brokers=MapUtil.get(map,key="brokers").asInstanceOf[String] groupId=MapUtil.get(map,key="groupId").asInstanceOf[String] topics=MapUtil.get(map,key="topics").asInstanceOf[String].split(",").map(x => x.trim) val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val brokers = new PropertyDescriptor().name("brokers").displayName("brokers").description("kafka brokers, seperated by ','").defaultValue("").required(true) val groupId = new PropertyDescriptor().name("groupId").displayName("groupId").description("kafka consumer group").defaultValue("group").required(true) val topics = new PropertyDescriptor().name("topics").displayName("topics").description("kafka topics").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = brokers :: descriptor descriptor = groupId :: descriptor descriptor = topics :: descriptor descriptor = batchDuration :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/KafkaStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def getDStream(ssc: StreamingContext): DStream[String] = { val kafkaParams = Map[String, Object]( "bootstrap.servers" -> brokers, "key.deserializer" -> classOf[StringDeserializer], "value.deserializer" -> classOf[StringDeserializer], "group.id" -> groupId, "auto.offset.reset" -> "latest", "enable.auto.commit" -> (false:java.lang.Boolean) ) val stream = KafkaUtils.createDirectStream[String,String]( ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams) ) stream.map(record => record.key() + "," + record.value()) //stream.asInstanceOf[DStream[ConsumerRecord]] } override def initialize(ctx: ProcessContext): Unit = {} override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {} }
Example 57
Source File: TextFileStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream class TextFileStream extends ConfigurableStreamingStop{ override var batchDuration: Int = _ override val authorEmail: String = "[email protected]" override val description: String = "Get text file streaming data" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var directory:String =_ override def setProperties(map: Map[String, Any]): Unit = { directory=MapUtil.get(map,key="directory").asInstanceOf[String] val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val directory = new PropertyDescriptor().name("directory").displayName("directory").description("HDFS directory to monitor for new file. Files must be written to the monitored directory by \"moving\" them from another location within the same file system ").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = directory :: descriptor descriptor = batchDuration :: descriptor descriptor } //TODO: change icon override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/TextFileStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def getDStream(ssc: StreamingContext): DStream[String] = { val dstream = ssc.textFileStream(directory) dstream } override def initialize(ctx: ProcessContext): Unit = {} override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {} }
Example 58
Source File: SocketTextStreamByWindow.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.DStream class SocketTextStreamByWindow extends ConfigurableStreamingStop { override val authorEmail: String = "[email protected]" override val description: String = "Receive text data from socket by window" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override var batchDuration: Int = _ var hostname:String =_ var port:String=_ var windowDuration:Int = _ var slideDuration:Int = _ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String] windowDuration=MapUtil.get(map,key="windowDuration").asInstanceOf[String].toInt slideDuration=MapUtil.get(map,key="slideDuration").asInstanceOf[String].toInt val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data ").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) val windowDuration = new PropertyDescriptor().name("windowDuration").displayName("windowDuration").description("the window duration, the unit is seconds").defaultValue("").required(true) val slideDuration = new PropertyDescriptor().name("slideDuration").displayName("slideDuration").description("the slide duration, the unit is seconds").defaultValue("").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor descriptor = batchDuration :: descriptor descriptor = windowDuration :: descriptor descriptor = slideDuration :: descriptor descriptor } //TODO: change icon override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/SocketTextStreamByWindow.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { } override def getDStream(ssc: StreamingContext): DStream[String] = { val dstream = ssc.socketTextStream(hostname,Integer.parseInt(port)) dstream.window(Seconds(windowDuration),Seconds(slideDuration)) //dstream.reduceByWindow(_ + _,Seconds(windowDuration),Seconds(slideDuration)) } }
Example 59
Source File: FlumeStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.flume._ class FlumeStream extends ConfigurableStreamingStop{ override var batchDuration: Int = _ override val authorEmail: String = "[email protected]" override val description: String = "Get data from flume" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var hostname:String =_ var port:Int=_ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String].toInt val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("hostname of the slave machine to which the flume data will be sent, the hostName must be one of the cluster worker node").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port of the slave machine to which the flume data will be sent, the port should be greater than 10000").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor descriptor = batchDuration :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/FlumeStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def getDStream(ssc: StreamingContext): DStream[String] = { val flumeStream = FlumeUtils.createStream(ssc, hostname, port) flumeStream.map(e => new String(e.event.getBody.array(), "UTF-8")) } override def initialize(ctx: ProcessContext): Unit = {} override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {} }
Example 60
Source File: TestOutputStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.{IOException, ObjectInputStream} import java.util.concurrent.ConcurrentLinkedQueue import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.{DStream, ForEachDStream} import org.apache.spark.util.Utils class TestOutputStream[T: ClassTag](parent: DStream[T], val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]()) extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { val collected = rdd.collect() output.add(collected) }, false) { // This is to clear the output buffer every time it is read from a checkpoint @throws(classOf[IOException]) private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { ois.defaultReadObject() output.clear() } }
Example 61
Source File: CubeWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer import java.sql.{Date, Timestamp} import akka.event.slf4j.SLF4JLogging import com.stratio.sparta.driver.factory.SparkContextFactory import com.stratio.sparta.driver.step.Cube import com.stratio.sparta.sdk.pipeline.aggregation.cube.{DimensionValue, DimensionValuesTime, MeasuresValues} import com.stratio.sparta.sdk.pipeline.output.Output import com.stratio.sparta.sdk.pipeline.schema.TypeOp import org.apache.spark.sql._ import org.apache.spark.streaming.dstream.DStream object CubeWriterHelper extends SLF4JLogging { def writeCube(cube: Cube, outputs: Seq[Output], stream: DStream[(DimensionValuesTime, MeasuresValues)]): Unit = { stream.map { case (dimensionValuesTime, measuresValues) => toRow(cube, dimensionValuesTime, measuresValues) }.foreachRDD(rdd => { if (!rdd.isEmpty()) { val sparkSession = SparkContextFactory.sparkSessionInstance val cubeDf = sparkSession.createDataFrame(rdd, cube.schema) val extraOptions = Map(Output.TableNameKey -> cube.name) val cubeAutoCalculatedFieldsDf = WriterHelper.write(cubeDf, cube.writerOptions, extraOptions, outputs) TriggerWriterHelper.writeTriggers(cubeAutoCalculatedFieldsDf, cube.triggers, cube.name, outputs) } else log.debug("Empty event received") }) } private[driver] def toRow(cube: Cube, dimensionValuesT: DimensionValuesTime, measures: MeasuresValues): Row = { val measuresSorted = measuresValuesSorted(measures.values) val rowValues = dimensionValuesT.timeConfig match { case None => val dimensionValues = dimensionsValuesSorted(dimensionValuesT.dimensionValues) dimensionValues ++ measuresSorted case Some(timeConfig) => val timeValue = Seq(timeFromDateType(timeConfig.eventTime, cube.dateType)) val dimFilteredByTime = filterDimensionsByTime(dimensionValuesT.dimensionValues, timeConfig.timeDimension) val dimensionValues = dimensionsValuesSorted(dimFilteredByTime) ++ timeValue val measuresValuesWithTime = measuresSorted dimensionValues ++ measuresValuesWithTime } Row.fromSeq(rowValues) } private[driver] def dimensionsValuesSorted(dimensionValues: Seq[DimensionValue]): Seq[Any] = dimensionValues.sorted.map(dimVal => dimVal.value) private[driver] def measuresValuesSorted(measures: Map[String, Option[Any]]): Seq[Any] = measures.toSeq.sortWith(_._1 < _._1).map(measure => measure._2.getOrElse(null)) private[driver] def filterDimensionsByTime(dimensionValues: Seq[DimensionValue], timeDimension: String): Seq[DimensionValue] = dimensionValues.filter(dimensionValue => dimensionValue.dimension.name != timeDimension) private[driver] def timeFromDateType(time: Long, dateType: TypeOp.Value): Any = { dateType match { case TypeOp.Date | TypeOp.DateTime => new Date(time) case TypeOp.Long => time case TypeOp.Timestamp => new Timestamp(time) case _ => time.toString } } }
Example 62
Source File: TwitterJsonInput.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.input.twitter import java.io.{Serializable => JSerializable} import com.google.gson.Gson import com.stratio.sparta.sdk.pipeline.input.Input import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._ import org.apache.spark.sql.Row import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.twitter.TwitterUtils import twitter4j.TwitterFactory import twitter4j.conf.ConfigurationBuilder import scala.util.{Failure, Success, Try} class TwitterJsonInput(properties: Map[String, JSerializable]) extends Input(properties) { System.setProperty("twitter4j.oauth.consumerKey", properties.getString("consumerKey")) System.setProperty("twitter4j.oauth.consumerSecret", properties.getString("consumerSecret")) System.setProperty("twitter4j.oauth.accessToken", properties.getString("accessToken")) System.setProperty("twitter4j.oauth.accessTokenSecret", properties.getString("accessTokenSecret")) val cb = new ConfigurationBuilder() val tf = new TwitterFactory(cb.build()) val twitterApi = tf.getInstance() val trends = twitterApi.getPlaceTrends(1).getTrends.map(trend => trend.getName) val terms: Option[Seq[String]] = Try(properties.getString("termsOfSearch")) match { case Success("") => None case Success(t: String) => Some(t.split(",").toSeq) case Failure(_) => None } val search = terms.getOrElse(trends.toSeq) def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = { TwitterUtils.createStream(ssc, None, search, storageLevel(sparkStorageLevel)) .map(stream => { val gson = new Gson() Row(gson.toJson(stream)) } ) } }
Example 63
Source File: SocketInput.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.input.socket import java.io.{Serializable => JSerializable} import com.stratio.sparta.sdk.pipeline.input.Input import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._ import org.apache.spark.sql.Row import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream class SocketInput(properties: Map[String, JSerializable]) extends Input(properties) { private val hostname : String = properties.getString("hostname") private val port : Int = properties.getInt("port") def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = { ssc.socketTextStream( hostname, port, storageLevel(sparkStorageLevel)) .map(data => Row(data)) } }
Example 64
Source File: InputStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage import com.stratio.sparta.sdk.pipeline.input.Input import com.stratio.sparta.serving.core.constants.AppConstant import com.stratio.sparta.serving.core.models.policy.PhaseEnum import com.stratio.sparta.serving.core.utils.ReflectionUtils import org.apache.spark.sql.Row import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream trait InputStage extends BaseStage { this: ErrorPersistor => def inputStreamStage(ssc: StreamingContext, input: Input): DStream[Row] = { val errorMessage = s"Something gone wrong creating the input stream for: ${policy.input.get.name}." val okMessage = s"Stream for Input: ${policy.input.get.name} created correctly." generalTransformation(PhaseEnum.InputStream, okMessage, errorMessage) { require(policy.storageLevel.isDefined, "You need to define the storage level") input.initStream(ssc, policy.storageLevel.get) } } def createInput(ssc: StreamingContext, refUtils: ReflectionUtils): Input = { val errorMessage = s"Something gone wrong creating the input: ${policy.input.get.name}. Please re-check the policy." val okMessage = s"Input: ${policy.input.get.name} created correctly." generalTransformation(PhaseEnum.Input, okMessage, errorMessage) { require(policy.input.isDefined, "You need at least one input in your policy") val classType = policy.input.get.configuration.getOrElse(AppConstant.CustomTypeKey, policy.input.get.`type`).toString refUtils.tryToInstantiate[Input](classType + Input.ClassSuffix, (c) => refUtils.instantiateParameterizable[Input](c, policy.input.get.configuration)) } } }
Example 65
Source File: RawDataStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage import com.stratio.sparta.driver.step.RawData import com.stratio.sparta.driver.writer.{RawDataWriterHelper, WriterOptions} import com.stratio.sparta.sdk.pipeline.output.{Output, SaveModeEnum} import com.stratio.sparta.serving.core.models.policy.{PhaseEnum, RawDataModel} import org.apache.spark.sql.Row import org.apache.spark.streaming.dstream.DStream trait RawDataStage extends BaseStage { this: ErrorPersistor => def saveRawData(rawModel: Option[RawDataModel], input: DStream[Row], outputs: Seq[Output]): Unit = if (rawModel.isDefined) { val rawData = rawDataStage() RawDataWriterHelper.writeRawData(rawData, outputs, input) } private[driver] def rawDataStage(): RawData = { val errorMessage = s"Something gone wrong saving the raw data. Please re-check the policy." val okMessage = s"RawData: created correctly." generalTransformation(PhaseEnum.RawData, okMessage, errorMessage) { require(policy.rawData.isDefined, "You need a raw data stage defined in your policy") require(policy.rawData.get.writer.tableName.isDefined, "You need a table name defined in your raw data stage") createRawData(policy.rawData.get) } } private[driver] def createRawData(rawDataModel: RawDataModel): RawData = { val okMessage = s"RawData created correctly." val errorMessage = s"Something gone wrong creating the RawData. Please re-check the policy." generalTransformation(PhaseEnum.RawData, okMessage, errorMessage) { RawData( rawDataModel.dataField, rawDataModel.timeField, WriterOptions( rawDataModel.writer.outputs, SaveModeEnum.Append, rawDataModel.writer.tableName, getAutoCalculatedFields(rawDataModel.writer.autoCalculatedFields), rawDataModel.writer.partitionBy, rawDataModel.writer.primaryKey ), rawDataModel.configuration) } } }
Example 66
Source File: ParserStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage import java.io.Serializable import akka.event.slf4j.SLF4JLogging import com.stratio.sparta.driver.writer.{TransformationsWriterHelper, WriterOptions} import com.stratio.sparta.sdk.pipeline.output.Output import com.stratio.sparta.sdk.pipeline.transformation.Parser import com.stratio.sparta.serving.core.constants.AppConstant import com.stratio.sparta.serving.core.models.policy.{PhaseEnum, TransformationModel} import com.stratio.sparta.serving.core.utils.ReflectionUtils import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.dstream.DStream import scala.util.{Failure, Success, Try} trait ParserStage extends BaseStage { this: ErrorPersistor => def parserStage(refUtils: ReflectionUtils, schemas: Map[String, StructType]): (Seq[Parser], Option[WriterOptions]) = (policy.transformations.get.transformationsPipe.map(parser => createParser(parser, refUtils, schemas)), policy.transformations.get.writer.map(writer => WriterOptions( writer.outputs, writer.saveMode, writer.tableName, getAutoCalculatedFields(writer.autoCalculatedFields), writer.partitionBy, writer.primaryKey ))) private[driver] def createParser(model: TransformationModel, refUtils: ReflectionUtils, schemas: Map[String, StructType]): Parser = { val classType = model.configuration.getOrElse(AppConstant.CustomTypeKey, model.`type`).toString val errorMessage = s"Something gone wrong creating the parser: $classType. Please re-check the policy." val okMessage = s"Parser: $classType created correctly." generalTransformation(PhaseEnum.Parser, okMessage, errorMessage) { val outputFieldsNames = model.outputFieldsTransformed.map(_.name) val schema = schemas.getOrElse(model.order.toString, throw new Exception("Can not find transformation schema")) refUtils.tryToInstantiate[Parser](classType + Parser.ClassSuffix, (c) => c.getDeclaredConstructor( classOf[Integer], classOf[Option[String]], classOf[Seq[String]], classOf[StructType], classOf[Map[String, Serializable]]) .newInstance(model.order, model.inputField, outputFieldsNames, schema, model.configuration) .asInstanceOf[Parser]) } } } object ParserStage extends SLF4JLogging { def executeParsers(row: Row, parsers: Seq[Parser]): Seq[Row] = if (parsers.size == 1) parseEvent(row, parsers.head) else parseEvent(row, parsers.head).flatMap(eventParsed => executeParsers(eventParsed, parsers.drop(1))) def parseEvent(row: Row, parser: Parser): Seq[Row] = Try { parser.parse(row) } match { case Success(eventParsed) => eventParsed case Failure(exception) => val error = s"Failure[Parser]: ${row.mkString(",")} | Message: ${exception.getLocalizedMessage}" + s" | Parser: ${parser.getClass.getSimpleName}" log.error(error, exception) Seq.empty[Row] } def applyParsers(input: DStream[Row], parsers: Seq[Parser], schema: StructType, outputs: Seq[Output], writerOptions: Option[WriterOptions]): DStream[Row] = { val transformedData = if (parsers.isEmpty) input else input.flatMap(row => executeParsers(row, parsers)) writerOptions.foreach(options => TransformationsWriterHelper.writeTransformations(transformedData, schema, outputs, options)) transformedData } }
Example 67
Source File: TriggerStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage

import com.stratio.sparta.driver.step.Trigger
import com.stratio.sparta.driver.writer.{TriggerWriterHelper, WriterOptions}
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import com.stratio.sparta.serving.core.models.policy.PhaseEnum
import com.stratio.sparta.serving.core.models.policy.trigger.TriggerModel
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Milliseconds
import org.apache.spark.streaming.dstream.DStream

trait TriggerStage extends BaseStage {
  this: ErrorPersistor =>

  def triggersStreamStage(initSchema: StructType,
                          inputData: DStream[Row],
                          outputs: Seq[Output],
                          window: Long): Unit = {
    val triggersStage = triggerStage(policy.streamTriggers)
    val errorMessage = s"Something went wrong executing the triggers stream for: ${policy.input.get.name}."
    val okMessage = s"Triggers Stream executed correctly."

    generalTransformation(PhaseEnum.TriggerStream, okMessage, errorMessage) {
      triggersStage
        .groupBy(trigger => (trigger.overLast, trigger.computeEvery))
        .foreach { case ((overLast, computeEvery), triggers) =>
          val groupedData = (overLast, computeEvery) match {
            case (None, None) => inputData
            case (Some(overL), Some(computeE))
              if (AggregationTime.parseValueToMilliSeconds(overL) == window) &&
                (AggregationTime.parseValueToMilliSeconds(computeE) == window) => inputData
            case _ => inputData.window(
              Milliseconds(overLast.fold(window) { over => AggregationTime.parseValueToMilliSeconds(over) }),
              Milliseconds(computeEvery.fold(window) { computeEvery =>
                AggregationTime.parseValueToMilliSeconds(computeEvery)
              }))
          }
          TriggerWriterHelper.writeStream(triggers, streamTemporalTable(policy.streamTemporalTable),
            outputs, groupedData, initSchema)
        }
    }
  }

  def triggerStage(triggers: Seq[TriggerModel]): Seq[Trigger] =
    triggers.map(trigger => createTrigger(trigger))

  private[driver] def createTrigger(trigger: TriggerModel): Trigger = {
    val okMessage = s"Trigger: ${trigger.name} created correctly."
    val errorMessage = s"Something went wrong creating the trigger: ${trigger.name}. Please re-check the policy."
    generalTransformation(PhaseEnum.Trigger, okMessage, errorMessage) {
      Trigger(
        trigger.name,
        trigger.sql,
        trigger.overLast,
        trigger.computeEvery,
        WriterOptions(
          trigger.writer.outputs,
          trigger.writer.saveMode,
          trigger.writer.tableName,
          getAutoCalculatedFields(trigger.writer.autoCalculatedFields),
          trigger.writer.primaryKey,
          trigger.writer.partitionBy
        ),
        trigger.configuration)
    }
  }

  private[driver] def streamTemporalTable(policyTableName: Option[String]): String =
    policyTableName.flatMap(tableName => if (tableName.nonEmpty) Some(tableName) else None)
      .getOrElse("stream")
}
Example 68
Source File: CubeOperations.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.step

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{DimensionValue, DimensionValuesTime, InputFields, TimeConfig}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.streaming.dstream.DStream
import org.joda.time.DateTime

import scala.util.{Failure, Success, Try}

// Note: this listing is an excerpt; the enclosing class declaration is omitted,
// and `cube`, `UpdatedValues`, and `log` are members of that class.
def extractDimensionsAggregations(inputStream: DStream[Row]): DStream[(DimensionValuesTime, InputFields)] = {
  inputStream.mapPartitions(rows => rows.flatMap(row =>
    Try {
      val dimensionValues = for {
        dimension <- cube.dimensions
        value = row.get(cube.initSchema.fieldIndex(dimension.field))
        (precision, dimValue) = dimension.dimensionType.precisionValue(dimension.precisionKey, value)
      } yield DimensionValue(dimension, TypeOp.transformValueByTypeOp(precision.typeOp, dimValue))

      cube.expiringDataConfig match {
        case None =>
          (DimensionValuesTime(cube.name, dimensionValues), InputFields(row, UpdatedValues))
        case Some(expiringDataConfig) =>
          val eventTime = extractEventTime(dimensionValues)
          val timeDimension = expiringDataConfig.timeDimension
          (DimensionValuesTime(cube.name, dimensionValues, Option(TimeConfig(eventTime, timeDimension))),
            InputFields(row, UpdatedValues))
      }
    } match {
      case Success(dimensionValuesTime) => Some(dimensionValuesTime)
      case Failure(exception) =>
        val error = s"Failure[Aggregations]: ${row.toString} | ${exception.getLocalizedMessage}"
        log.error(error, exception)
        None
    }), true)
}

private[driver] def extractEventTime(dimensionValues: Seq[DimensionValue]) = {
  val timeDimension = cube.expiringDataConfig.get.timeDimension
  val dimensionsDates =
    dimensionValues.filter(dimensionValue => dimensionValue.dimension.name == timeDimension)

  if (dimensionsDates.isEmpty) getDate
  else AggregationTime.getMillisFromSerializable(dimensionsDates.head.value)
}

private[driver] def getDate: Long = {
  val checkpointGranularity = cube.expiringDataConfig.get.granularity

  AggregationTime.truncateDate(DateTime.now(), checkpointGranularity)
}
Example 69
Source File: TriggerWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import akka.event.slf4j.SLF4JLogging
import org.apache.spark.sql.{DataFrame, Row}
import com.stratio.sparta.driver.exception.DriverException
import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.schema.SchemaHelper
import com.stratio.sparta.driver.step.Trigger
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

import scala.util.{Failure, Success, Try}

object TriggerWriterHelper extends SLF4JLogging {

  def writeStream(triggers: Seq[Trigger],
                  inputTableName: String,
                  outputs: Seq[Output],
                  streamData: DStream[Row],
                  schema: StructType): Unit = {
    streamData.foreachRDD(rdd => {
      val parsedDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, schema)

      writeTriggers(parsedDataFrame, triggers, inputTableName, outputs)
    })
  }

  //scalastyle:off
  def writeTriggers(dataFrame: DataFrame,
                    triggers: Seq[Trigger],
                    inputTableName: String,
                    outputs: Seq[Output]): Unit = {
    val sparkSession = dataFrame.sparkSession

    if (triggers.nonEmpty && isCorrectTableName(inputTableName)) {
      if (!sparkSession.catalog.tableExists(inputTableName)) {
        dataFrame.createOrReplaceTempView(inputTableName)
        log.debug(s"Registering temporal table in Spark with name: $inputTableName")
      }
      val tempTables = triggers.flatMap(trigger => {
        log.debug(s"Executing query in Spark: ${trigger.sql}")

        val queryDf = Try(sparkSession.sql(trigger.sql)) match {
          case Success(sqlResult) => sqlResult
          case Failure(exception: org.apache.spark.sql.AnalysisException) =>
            log.warn(s"Warning running analysis in Catalyst in the query ${trigger.sql} in trigger ${trigger.name}",
              exception.message)
            throw DriverException(exception.getMessage, exception)
          case Failure(exception) =>
            log.warn(s"Warning running sql in the query ${trigger.sql} in trigger ${trigger.name}",
              exception.getMessage)
            throw DriverException(exception.getMessage, exception)
        }
        val extraOptions = Map(Output.TableNameKey -> trigger.name)

        if (!queryDf.rdd.isEmpty()) {
          val autoCalculatedFieldsDf = WriterHelper.write(queryDf, trigger.writerOptions, extraOptions, outputs)

          if (isCorrectTableName(trigger.name) && !sparkSession.catalog.tableExists(trigger.name)) {
            autoCalculatedFieldsDf.createOrReplaceTempView(trigger.name)
            log.debug(s"Registering temporal table in Spark with name: ${trigger.name}")
          } else
            log.warn(s"The trigger ${trigger.name} has an incorrect name; it cannot be registered as a temporal table")
          Option(trigger.name)
        } else None
      })

      tempTables.foreach(tableName =>
        if (isCorrectTableName(tableName) && sparkSession.catalog.tableExists(tableName)) {
          sparkSession.catalog.dropTempView(tableName)
          log.debug(s"Dropping temporal table in Spark with name: $tableName")
        } else log.debug(s"Impossible to drop table in Spark with name: $tableName"))

      if (isCorrectTableName(inputTableName) && sparkSession.catalog.tableExists(inputTableName)) {
        sparkSession.catalog.dropTempView(inputTableName)
        log.debug(s"Dropping temporal table in Spark with name: $inputTableName")
      } else log.debug(s"Impossible to drop table in Spark: $inputTableName")
    } else {
      if (triggers.nonEmpty && !isCorrectTableName(inputTableName))
        log.warn(s"Incorrect table name $inputTableName: the triggers may have errors and may not have been executed")
    }
  }
  //scalastyle:on

  private[driver] def isCorrectTableName(tableName: String): Boolean =
    tableName.nonEmpty && tableName != "" &&
      tableName.toLowerCase != "select" &&
      tableName.toLowerCase != "project" &&
      !tableName.contains("-") && !tableName.contains("*") && !tableName.contains("/")
}
Example 70
Source File: RawDataWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.step.RawData
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.dstream.DStream

object RawDataWriterHelper {

  def writeRawData(rawData: RawData, outputs: Seq[Output], input: DStream[Row]): Unit = {
    val RawSchema = StructType(Seq(
      StructField(rawData.timeField, TimestampType, nullable = false),
      StructField(rawData.dataField, StringType, nullable = true)))
    val eventTime = AggregationTime.millisToTimeStamp(System.currentTimeMillis())

    input.map(row => Row.merge(Row(eventTime), row))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          val rawDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, RawSchema)

          WriterHelper.write(rawDataFrame, rawData.writerOptions, Map.empty[String, String], outputs)
        }
      })
  }
}
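A hedged usage sketch for the helper above; rawDataStep, outputs, and rowStream are placeholders assumed to be built elsewhere in the driver, not part of this file. The helper prepends an ingestion timestamp to every row and writes each non-empty batch with the two-field schema defined above.

  // One call wires the raw-data step into the pipeline; writing happens per batch via foreachRDD.
  RawDataWriterHelper.writeRawData(rawDataStep, outputs, rowStream)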
Example 71
Source File: StreamingTestMethod.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.test

import java.io.Serializable

import scala.language.implicitConversions
import scala.math.pow

import com.twitter.chill.MeatLocker
import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
import org.apache.commons.math3.stat.inference.TTest

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.StatCounter

// Note: this listing is an excerpt; the StreamingTestMethod trait and the
// WelchTTest/StudentTTest implementations referenced below are omitted.
private[stat] object StreamingTestMethod {
  // Note: after new `StreamingTestMethod`s are implemented, please update this map.
  private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map(
    "welch" -> WelchTTest,
    "student" -> StudentTTest)

  def getTestMethodFromName(method: String): StreamingTestMethod =
    TEST_NAME_TO_OBJECT.get(method) match {
      case Some(test) => test
      case None =>
        throw new IllegalArgumentException(
          "Unrecognized method name. Supported streaming test methods: "
            + TEST_NAME_TO_OBJECT.keys.mkString(", "))
    }
}
Example 72
Source File: TransformationsWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

object TransformationsWriterHelper {

  def writeTransformations(input: DStream[Row],
                           inputSchema: StructType,
                           outputs: Seq[Output],
                           writerOptions: WriterOptions): Unit = {
    input.foreachRDD(rdd =>
      if (!rdd.isEmpty()) {
        val transformationsDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, inputSchema)

        WriterHelper.write(transformationsDataFrame, writerOptions, Map.empty[String, String], outputs)
      }
    )
  }
}
Example 73
Source File: L3-DStreamAggregation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditAggregationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditAggregationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val recCount = comments.count()

    val recCountValue = comments.countByValue()

    val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString))
      .flatMap(body => body.split(" "))
      .map(word => 1)
      .reduce(_ + _)

    ssc.start()
    ssc.awaitTermination()
  }
}
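Note that recCount, recCountValue, and totalWords are lazy DStreams, and Spark Streaming requires at least one output operation before the job has anything to execute; the example registers none. A minimal addition (a sketch, not part of the original listing) placed before ssc.start() would surface the results:

    recCount.print()       // number of records in each batch
    recCountValue.print()  // (record, occurrences) pairs per batch
    totalWords.print()     // total word count of the "body" fields per batch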
Example 74
Source File: L3-DStreamWindowAndAction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditWindowAndActionApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditWindowAndActionApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)

    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString)
    val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5))
    val windowedCounts = windowedRecs.countByValue()

    windowedCounts.print(10)
    windowedCounts.saveAsObjectFiles("subreddit", "obj")
    windowedCounts.saveAsTextFiles("subreddit", "txt")

    globalCount.saveAsHadoopFiles("subreddit", "hadoop",
      classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]])
    globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop",
      classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]])

    comments.foreachRDD(rdd => {
      LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count()))
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
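The window(Seconds(5), Seconds(5)) followed by countByValue() pair can also be expressed with a single windowed operator. The lines below are an alternative sketch, not part of the original example; they rely on the checkpoint directory the example already configures.

    // Equivalent windowed count in one call: 5-second window, sliding every 5 seconds.
    val windowedCountsAlt = distinctSubreddits.countByValueAndWindow(Seconds(5), Seconds(5))
    windowedCountsAlt.print(10)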
Example 75
Source File: L3-DStreamVariation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditVariationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditVariationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val merged = comments.union(comments)

    val repartitionedComments = comments.repartition(4)

    val rddMin = comments.glom().map(arr =>
      arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt)))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 76
Source File: L3-DStreamKeyValue.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditKeyValueApp {
  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>")
      System.exit(1)
    }
    val Seq(appName, inputPath, inputPathPopular) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)
    val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .groupByKey()
      .map(r => (r._2.sum, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .reduceByKey(_ + _)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString,
      (parse(rec) \ "body").values.toString.split(" ").length))
      .combineByKey(
        (v) => (v, 1),
        (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1),
        (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1,
          accCombine1._2 + accCombine2._2),
        new HashPartitioner(ssc.sparkContext.defaultParallelism))
      .map({ case (k, v) => (k, v._1 / v._2.toFloat) })
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubreddit2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2)

    val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubredditCo2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)

    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    ssc.start()
    ssc.awaitTermination()
  }
}
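join and cogroup pair up only the RDDs produced in the same micro-batch, so a comment and its matching entry in the popular feed must land in the same one-second batch to join. One way to widen that match window, sketched below as an assumption rather than part of the original example, is to window one side before joining:

    // Join comments seen in the last 60 seconds against the current batch of the popular feed.
    val windowedJoin = keyedBySubreddit.window(Seconds(60)).join(keyedBySubreddit2)
    windowedJoin.print(5)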
Example 77
Source File: L3-DStreamMapping.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditMappingApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditMappingApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    val tsKey = "created_utc"
    val secs = 1000L

    val keyedByDay = comments.map(rec => {
      val ts = (parse(rec) \ tsKey).values
      (sdf.format(new Date(ts.toString.toLong * secs)), rec)
    })

    val keyedByDayPart = comments.mapPartitions(iter => {
      var ret = List[(String, String)]()
      while (iter.hasNext) {
        val rec = iter.next
        val ts = (parse(rec) \ tsKey).values
        ret.::=((sdf.format(new Date(ts.toString.toLong * secs)), rec))
      }
      ret.iterator
    })

    val wordTokens = comments.map(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val wordTokensFlat = comments.flatMap(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val filterSubreddit = comments.filter(rec =>
      (parse(rec) \ "subreddit").values.toString.equals("AskReddit"))

    val sortedByAuthor = comments.transform(rdd =>
      (rdd.sortBy(rec => (parse(rec) \ "author").values.toString)))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 78
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)
    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }
  }
}

object HttpUtils {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String,
    interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
}
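A hedged usage sketch for the helper object above; the URL and polling interval are placeholders, and ssc is assumed to be an existing StreamingContext rather than anything defined in the original file.

    // Poll the placeholder URL every 10 seconds and print the first lines of each batch.
    val feed: DStream[String] =
      HttpUtils.createStream(ssc, StorageLevel.MEMORY_AND_DISK_SER_2, "http://localhost:8080/feed", 10L)
    feed.print()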
Example 79
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions

object ReferrerApp {
  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>")
      System.exit(1)
    }
    val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.externalBlockStore.url", tachyonUrl)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val clickstream = ssc.socketTextStream(hostname, port.toInt)
      .map(rec => rec.split("\\t"))
      .persist(StorageLevel.OFF_HEAP)

    val topRefStream = clickstream
      .map(rec => {
        var prev_title = rec(3)
        if (!prev_title.startsWith("other")) {
          prev_title = "wikipedia"
        }
        (prev_title, 1)
      })

    val topSparkStream = clickstream
      .filter(rec => rec(4).equals("Apache_Spark"))
      .map(rec => (rec(3), 1))

    saveTopKeys(topRefStream, outputPathTop)
    saveTopKeys(topSparkStream, outputPathSpark)

    ssc.start()
    ssc.awaitTermination()
  }

  def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) {
    clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0)))
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rec => rec.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)
  }
}