org.apache.spark.streaming.dstream.DStream Scala Examples
The following examples show how to use org.apache.spark.streaming.dstream.DStream.
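Before the examples, a minimal sketch of typical DStream usage may help frame them: a StreamingContext produces an input DStream, transformations such as flatMap/map/reduceByKey derive new DStreams, and an output operation plus start()/awaitTermination() triggers execution. This is an illustrative sketch only; the application name, host, port, and batch interval are placeholders.

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.DStream

object DStreamWordCount {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("DStreamWordCount")
    val ssc = new StreamingContext(conf, Seconds(5)) // 5-second micro-batches

    // Input DStream from a TCP source (host/port are placeholders)
    val lines: DStream[String] = ssc.socketTextStream("localhost", 9999)

    // Classic word count over each batch
    val counts: DStream[(String, Int)] =
      lines.flatMap(_.split(" ")).map((_, 1)).reduceByKey(_ + _)

    counts.print()         // output operation
    ssc.start()            // start the streaming computation
    ssc.awaitTermination()
  }
}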
Example 1
Source File: SocketTextSource.scala From spark-cep with Apache License 2.0

package org.apache.spark.sql.streaming.sources

import org.apache.spark.sql.SQLContext
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.sources.{BaseRelation, SchemaRelationProvider}
import org.apache.spark.sql.streaming.StreamPlan
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

class SocketTextSource extends SchemaRelationProvider {
  override def createRelation(
      sqlContext: SQLContext,
      parameters: Map[String, String],
      schema: StructType): BaseRelation = {
    require(parameters.contains("host") &&
      parameters.contains("port") &&
      parameters.contains("messageToRow"))
    val messageToRow = {
      try {
        val clz = Class.forName(parameters("messageToRow"))
        clz.newInstance().asInstanceOf[MessageToRowConverter]
      } catch {
        case e: Exception => sys.error(s"Failed to load class : ${e.toString}")
      }
    }
    new SocketTextRelation(
      parameters("host"),
      parameters("port").toInt,
      messageToRow,
      schema,
      sqlContext)
  }
}

case class SocketTextRelation(
    host: String,
    port: Int,
    messageToRowConverter: MessageToRowConverter,
    val schema: StructType,
    @transient val sqlContext: SQLContext)
  extends StreamBaseRelation with StreamPlan {

  // Currently only support Kafka with String messages
  @transient private val socketStream = streamSqlContext.streamingContext.socketTextStream(
    host, port)

  @transient val stream: DStream[InternalRow] =
    socketStream.map(messageToRowConverter.toRow(_, schema))
}
Example 2
Source File: TestOutputStream.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 3
Source File: JavaTestUtils.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming

import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.reflect.ClassTag
import java.util.{List => JList}
import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStreamLike, JavaDStream, JavaStreamingContext}
import org.apache.spark.streaming._
import java.util.ArrayList
import collection.JavaConversions._
import org.apache.spark.api.java.JavaRDDLike
import org.apache.spark.streaming.dstream.DStream

  def runStreamsWithPartitions[V](ssc: JavaStreamingContext, numBatches: Int,
      numExpectedOutput: Int): JList[JList[JList[V]]] = {
    implicit val cm: ClassTag[V] =
      implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
    val res = runStreamsWithPartitions[V](ssc.ssc, numBatches, numExpectedOutput)
    val out = new ArrayList[JList[JList[V]]]()
    res.map { entry =>
      val lists = entry.map(new ArrayList[V](_))
      out.append(new ArrayList[JList[V]](lists))
    }
    out
  }
}

object JavaTestUtils extends JavaTestBase {
  override def maxWaitTimeMillis = 20000
}

object JavaCheckpointTestUtils extends JavaTestBase {
  override def actuallyWait = true
}
Example 4
Source File: TestOutputStream.scala From spark1.52 with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 5
Source File: WeatherDataStream.scala From spark-scala with Creative Commons Zero v1.0 Universal

package com.supergloo

import com.killrweather.data.Weather.RawWeatherData
import kafka.serializer.StringDecoder
import org.apache.log4j.Logger
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka.KafkaUtils

    parsedWeatherStream.map { weather =>
      (weather.wsid, weather.year, weather.month, weather.day, weather.oneHourPrecip)
    }.saveToCassandra(CassandraKeyspace, CassandraTableDailyPrecip)
  }

  def ingestStream(rawWeatherStream: InputDStream[(String, String)]): DStream[RawWeatherData] = {
    val parsedWeatherStream = rawWeatherStream.map(_._2.split(","))
      .map(RawWeatherData(_))
    parsedWeatherStream
  }
}
Example 6
Source File: JavaTestUtils.scala From iolap with Apache License 2.0

package org.apache.spark.streaming

import scala.collection.mutable.{SynchronizedBuffer, ArrayBuffer}
import scala.reflect.ClassTag
import java.util.{List => JList}
import org.apache.spark.streaming.api.java.{JavaPairDStream, JavaDStreamLike, JavaDStream, JavaStreamingContext}
import org.apache.spark.streaming._
import java.util.ArrayList
import collection.JavaConversions._
import org.apache.spark.api.java.JavaRDDLike
import org.apache.spark.streaming.dstream.DStream

  def runStreamsWithPartitions[V](ssc: JavaStreamingContext, numBatches: Int,
      numExpectedOutput: Int): JList[JList[JList[V]]] = {
    implicit val cm: ClassTag[V] =
      implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[V]]
    val res = runStreamsWithPartitions[V](ssc.ssc, numBatches, numExpectedOutput)
    val out = new ArrayList[JList[JList[V]]]()
    res.map { entry =>
      val lists = entry.map(new ArrayList[V](_))
      out.append(new ArrayList[JList[V]](lists))
    }
    out
  }
}

object JavaTestUtils extends JavaTestBase {
  override def maxWaitTimeMillis = 20000
}

object JavaCheckpointTestUtils extends JavaTestBase {
  override def actuallyWait = true
}
Example 7
Source File: MQTTUtils.scala From iolap with Apache License 2.0

package org.apache.spark.streaming.mqtt

import scala.reflect.ClassTag

import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.{JavaReceiverInputDStream, JavaStreamingContext, JavaDStream}
import org.apache.spark.streaming.dstream.{ReceiverInputDStream, DStream}

object MQTTUtils {
  def createStream(
      jssc: JavaStreamingContext,
      brokerUrl: String,
      topic: String,
      storageLevel: StorageLevel
    ): JavaReceiverInputDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, brokerUrl, topic, storageLevel)
  }
}
Example 8
Source File: TestOutputStream.scala From iolap with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 9
Source File: TestOutputStream.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 10
Source File: StreamPlan.scala From spark-cep with Apache License 2.0

package org.apache.spark.sql.streaming

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming] object StreamPlan {
  val currentContext = new ThreadLocal[StreamSQLContext]()
}

trait StreamPlan {
  protected var validTime: Time = null

  def streamSqlContext = StreamPlan.currentContext.get()

  def stream: DStream[InternalRow]

  def setValidTime(time: Time): Unit = {
    validTime = time
  }
}
Example 11
Source File: TestOutputStream.scala From BigDatalog with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ArrayBuffer[Seq[T]] = ArrayBuffer[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output += collected
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 12
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0

package org.apache.spark.sql.streaming

import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming] case class PhysicalDStream(output: Seq[Attribute],
    @transient stream: DStream[InternalRow])
  extends SparkPlan with StreamPlan {

  def children = Nil

  override def doExecute() = {
    assert(validTime != null)
    Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime))
      .asInstanceOf[Option[RDD[InternalRow]]]
      .getOrElse(new EmptyRDD[InternalRow](sparkContext))
  }
}
Example 13
Source File: SavingStream.scala From cuesheet with Apache License 2.0

package com.kakao.cuesheet.convert

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

import java.util.concurrent.{Future => JFuture}
import scala.reflect.runtime.universe.TypeTag

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}

  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String, dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String, dateColumn: String = "date", hourColumn: String = "hour", minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }
}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType) extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
}
Example 14
Source File: ScalaCheckStreamingTest.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.streaming

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.execute.{AsResult, Result}
import org.scalacheck.{Prop, Gen}
import org.scalacheck.Arbitrary.arbitrary
import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration}
import org.apache.spark.streaming.dstream.DStream
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.prop.tl.DStreamTLProperty
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._

@RunWith(classOf[JUnitRunner])
class ScalaCheckStreamingTest
  extends org.specs2.Specification
  with DStreamTLProperty
  with org.specs2.matcher.ResultMatchers
  with ScalaCheck {

  override def sparkMaster : String = "local[5]"
  override def batchDuration = Duration(350)
  override def defaultParallelism = 4

  def is =
    sequential ^ s2"""
    Simple properties for Spark Streaming
      - where the first property is a success $prop1
      - where a simple property for DStream.count is a success ${countProp(_.count)}
      - where a faulty implementation of the DStream.count is detected ${countProp(faultyCount) must beFailing}
    """

  def prop1 = {
    val batchSize = 30
    val numBatches = 10
    val dsgenSeqSeq1 = {
      val zeroSeqSeq = Gen.listOfN(numBatches, Gen.listOfN(batchSize, 0))
      val oneSeqSeq = Gen.listOfN(numBatches, Gen.listOfN(batchSize, 1))
      Gen.oneOf(zeroSeqSeq, oneSeqSeq)
    }
    type U = (RDD[Int], RDD[Int])

    forAllDStream[Int, Int](
      "inputDStream" |: dsgenSeqSeq1)(
      (inputDs : DStream[Int]) => {
        val transformedDs = inputDs.map(_+1)
        transformedDs
      })(always ((u : U) => {
        val (inputBatch, transBatch) = u
        inputBatch.count === batchSize and
        inputBatch.count === transBatch.count and
        (inputBatch.intersection(transBatch).isEmpty should beTrue) and
        ( inputBatch should foreachRecord(_ == 0) or
          (inputBatch should foreachRecord(_ == 1))
        )
      }) during numBatches
    )}.set(minTestsOk = 10).verbose

  def faultyCount(ds : DStream[Double]) : DStream[Long] =
    ds.count.transform(_.map(_ - 1))

  def countProp(testSubject : DStream[Double] => DStream[Long]) = {
    type U = (RDD[Double], RDD[Long])
    val numBatches = 10
    forAllDStream[Double, Long](
      Gen.listOfN(numBatches, Gen.listOfN(30, arbitrary[Double])))(
      testSubject
      )(always ((u : U) => {
        val (inputBatch, transBatch) = u
        transBatch.count === 1 and
        inputBatch.count === transBatch.first
      }) during numBatches
    )}.set(minTestsOk = 10).verbose
}
Example 15
Source File: SimpleStreamingFormulas.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.simple

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}
import org.scalacheck.Gen
import es.ucm.fdi.sscheck.gen.PDStream
import es.ucm.fdi.sscheck.gen.Batch

@RunWith(classOf[JUnitRunner])
class SimpleStreamingFormulas
  extends org.specs2.Specification
  with DStreamTLProperty
  with org.specs2.ScalaCheck {

  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(50)
  override def defaultParallelism = 4

  def is =
    sequential ^ s2"""
    Simple demo Specs2 example for ScalaCheck properties with temporal
    formulas on Spark Streaming programs
      - Given a stream of integers
        When we filter out negative numbers
        Then we get only numbers greater or equal to zero $filterOutNegativeGetGeqZero
      - where time increments for each batch $timeIncreasesMonotonically
    """

  def filterOutNegativeGetGeqZero = {
    type U = (RDD[Int], RDD[Int])
    val numBatches = 10
    val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Int]), numBatches)
    val formula = always(nowTime[U]{ (letter, time) =>
      val (_input, output) = letter
      output should foreachRecord {_ >= 0}
    }) during numBatches

    forAllDStream(
      gen)(
      _.filter{ x => !(x < 0)})(
      formula)
  }.set(minTestsOk = 50).verbose

  def timeIncreasesMonotonically = {
    type U = (RDD[Int], RDD[Int])
    val numBatches = 10
    val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Int]))

    val formula = always(nextTime[U]{ (letter, time) =>
      nowTime[U]{ (nextLetter, nextTime) =>
        time.millis <= nextTime.millis
      }
    }) during numBatches-1

    forAllDStream(
      gen)(
      identity[DStream[Int]])(
      formula)
  }.set(minTestsOk = 10).verbose
}
Example 16
Source File: StreamingFormulaDemo2.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.demo

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.Specification
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.scalacheck.Gen
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.DStream._
import scalaz.syntax.std.boolean._
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}
import es.ucm.fdi.sscheck.gen.BatchGenConversions._
import es.ucm.fdi.sscheck.gen.PDStreamGenConversions._
import es.ucm.fdi.sscheck.matcher.specs2.RDDMatchers._

@RunWith(classOf[JUnitRunner])
class StreamingFormulaDemo2
  extends Specification
  with DStreamTLProperty
  with ResultMatchers
  with ScalaCheck {

  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(300)
  override def defaultParallelism = 3
  override def enableCheckpointing = true

  def is =
    sequential ^ s2"""
    Check process to persistently detect and ban bad users
      - where a stateful implementation extracts the banned users correctly ${checkExtractBannedUsersList(listBannedUsers)}
      - where a trivial implementation ${checkExtractBannedUsersList(statelessListBannedUsers) must beFailing}
    """

  type UserId = Long

  def listBannedUsers(ds : DStream[(UserId, Boolean)]) : DStream[UserId] =
    ds.updateStateByKey((flags : Seq[Boolean], maybeFlagged : Option[Unit]) =>
      maybeFlagged match {
        case Some(_) => maybeFlagged
        case None => flags.contains(false) option {()}
      }
    ).transform(_.keys)

  def statelessListBannedUsers(ds : DStream[(UserId, Boolean)]) : DStream[UserId] =
    ds.map(_._1)

  def checkExtractBannedUsersList(testSubject : DStream[(UserId, Boolean)] => DStream[UserId]) = {
    val batchSize = 20
    val (headTimeout, tailTimeout, nestedTimeout) = (10, 10, 5)
    val (badId, ids) = (15L, Gen.choose(1L, 50L))
    val goodBatch = BatchGen.ofN(batchSize, ids.map((_, true)))
    val badBatch = goodBatch + BatchGen.ofN(1, (badId, false))
    val gen = BatchGen.until(goodBatch, badBatch, headTimeout) ++
              BatchGen.always(Gen.oneOf(goodBatch, badBatch), tailTimeout)

    type U = (RDD[(UserId, Boolean)], RDD[UserId])
    val (inBatch, outBatch) = ((_ : U)._1, (_ : U)._2)

    val formula = {
      val badInput = at(inBatch)(_ should existsRecord(_ == (badId, false)))
      val allGoodInputs = at(inBatch)(_ should foreachRecord(_._2 == true))
      val noIdBanned = at(outBatch)(_.isEmpty)
      val badIdBanned = at(outBatch)(_ should existsRecord(_ == badId))

      ( ( allGoodInputs and noIdBanned ) until badIdBanned on headTimeout ) and
      ( always { badInput ==> (always(badIdBanned) during nestedTimeout) } during tailTimeout )
    }

    forAllDStream(
      gen)(
      testSubject)(
      formula)
  }.set(minTestsOk = 10).verbose
}
Example 17
Source File: StreamingFormulaDemo1.scala From sscheck with Apache License 2.0

package es.ucm.fdi.sscheck.spark.demo

import org.junit.runner.RunWith
import org.specs2.runner.JUnitRunner
import org.specs2.ScalaCheck
import org.specs2.Specification
import org.specs2.matcher.ResultMatchers
import org.scalacheck.Arbitrary.arbitrary
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream
import es.ucm.fdi.sscheck.spark.streaming.SharedStreamingContextBeforeAfterEach
import es.ucm.fdi.sscheck.prop.tl.{Formula,DStreamTLProperty}
import es.ucm.fdi.sscheck.prop.tl.Formula._
import es.ucm.fdi.sscheck.gen.{PDStreamGen,BatchGen}

@RunWith(classOf[JUnitRunner])
class StreamingFormulaDemo1
  extends Specification
  with DStreamTLProperty
  with ResultMatchers
  with ScalaCheck {

  // Spark configuration
  override def sparkMaster : String = "local[*]"
  override def batchDuration = Duration(150)
  override def defaultParallelism = 4

  def is =
    sequential ^ s2"""
    Simple demo Specs2 example for ScalaCheck properties with temporal
    formulas on Spark Streaming programs
      - where a simple property for DStream.count is a success ${countForallAlwaysProp(_.count)}
      - where a faulty implementation of the DStream.count is detected ${countForallAlwaysProp(faultyCount) must beFailing}
    """

  def faultyCount(ds : DStream[Double]) : DStream[Long] =
    ds.count.transform(_.map(_ - 1))

  def countForallAlwaysProp(testSubject : DStream[Double] => DStream[Long]) = {
    type U = (RDD[Double], RDD[Long])
    val (inBatch, transBatch) = ((_ : U)._1, (_ : U)._2)
    val numBatches = 10
    val formula : Formula[U] = always { (u : U) =>
      transBatch(u).count === 1 and
      inBatch(u).count === transBatch(u).first
    } during numBatches

    val gen = BatchGen.always(BatchGen.ofNtoM(10, 50, arbitrary[Double]), numBatches)

    forAllDStream(
      gen)(
      testSubject)(
      formula)
  }.set(minTestsOk = 10).verbose
}
Example 18
Source File: TestOutputStream.scala From sparkoscope with Apache License 2.0

package org.apache.spark.streaming

import java.io.{IOException, ObjectInputStream}
import java.util.concurrent.ConcurrentLinkedQueue

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.{DStream, ForEachDStream}
import org.apache.spark.util.Utils

class TestOutputStream[T: ClassTag](parent: DStream[T],
    val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]())
  extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => {
    val collected = rdd.collect()
    output.add(collected)
  }, false) {

  // This is to clear the output buffer every time it is read from a checkpoint
  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    ois.defaultReadObject()
    output.clear()
  }
}
Example 19
Source File: StreamingActionBase.scala From spark-testing-base with Apache License 2.0

package com.holdenkarau.spark.testing

import org.apache.spark.streaming.TestStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.scheduler.{StreamingListenerBatchCompleted, StreamingListener}
import org.apache.spark.streaming.util.TestManualClock
import org.scalatest.Suite

import scala.reflect.ClassTag

  def runAction[U: ClassTag](input: Seq[Seq[U]], operation: DStream[U] => Unit) {
    val numBatches_ = input.size
    withStreamingContext(setupStream[U](input, operation)) { ssc =>
      runActionStream(ssc, numBatches_)
    }
  }

  private def withStreamingContext(outputStreamSSC: TestStreamingContext)
      (block: TestStreamingContext => Unit): Unit = {
    try {
      block(outputStreamSSC)
    } finally {
      try {
        outputStreamSSC.stop(stopSparkContext = false)
      } catch {
        case e: Throwable =>
          logError("Error stopping StreamingContext", e)
      }
    }
  }

  private def setupStream[U: ClassTag](input: Seq[Seq[U]],
      operation: DStream[U] => Any): TestStreamingContext = {
    // Create TestStreamingContext
    val ssc = new TestStreamingContext(sc, batchDuration)
    ssc.addStreamingListener(batchCountListener)
    if (checkpointDir != null) {
      ssc.checkpoint(checkpointDir)
    }

    // Setup the stream computation
    val inputStream = createTestInputStream(sc, ssc, input)
    operation(inputStream)
    ssc
  }

  private def runActionStream(ssc: TestStreamingContext, numBatches: Int) {
    assert(numBatches > 0, "Number of batches to run stream computation is zero")
    batchCountListener.batchCount = 0

    // Start computation
    ssc.start()

    // Advance manual clock
    val clock = ssc.getScheduler().clock.asInstanceOf[TestManualClock]
    logInfo("Manual clock before advancing = " + clock.currentTime())
    if (actuallyWait) {
      for (i <- 1 to numBatches) {
        logInfo("Actually waiting for " + batchDuration)
        clock.addToTime(batchDuration.milliseconds)
        Thread.sleep(batchDuration.milliseconds)
      }
    } else {
      clock.addToTime(numBatches * batchDuration.milliseconds)
    }
    logInfo("Manual clock after advancing = " + clock.currentTime())

    // wait for expected number of batches to execute
    val startTime = System.currentTimeMillis()
    while (batchCountListener.batchCount < numBatches &&
      System.currentTimeMillis() - startTime < maxWaitTimeMillis) {
      logInfo(s"batches: run = ${batchCountListener.batchCount} " +
        s"target = ${numBatches}")
      ssc.awaitTerminationOrTimeout(50)
    }
    val timeTaken = System.currentTimeMillis() - startTime

    logInfo("Output generated in " + timeTaken + " milliseconds")
    Thread.sleep(100) // Give some time for the forgetting old RDDs to complete
  }
}

class BatchCountListener extends StreamingListener {
  var batchCount = 0

  override def onBatchCompleted(
      batchCompleted: StreamingListenerBatchCompleted): Unit = {
    batchCount = batchCount + 1
  }
}
Example 20
Source File: RabbitMQDistributedInput.scala From sparta with Apache License 2.0

package com.stratio.sparta.plugin.input.rabbitmq

import java.io.{Serializable => JSerializable}

import com.stratio.sparta.plugin.input.rabbitmq.handler.MessageHandler
import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.rabbitmq.RabbitMQUtils._
import org.apache.spark.streaming.rabbitmq.distributed.RabbitMQDistributedKey
import org.apache.spark.streaming.rabbitmq.models.ExchangeAndRouting

import scala.language.implicitConversions
import scala.util.Try

object RabbitMQDistributedInput {
  //Keys from UI
  val DistributedPropertyKey = "distributedProperties"
  val QueuePropertyKey = "distributedQueue"
  val ExchangeNamePropertyKey = "distributedExchangeName"
  val ExchangeTypePropertyKey = "distributedExchangeType"
  val RoutingKeysPropertyKey = "distributedRoutingKeys"
  val HostPropertyKey = "hosts"

  //Default values
  val QueueDefaultValue = "queue"
  val HostDefaultValue = "localhost"
}

class RabbitMQDistributedInput(properties: Map[String, JSerializable])
  extends Input(properties) with RabbitMQGenericProps {

  import RabbitMQDistributedInput._

  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {
    val messageHandler = MessageHandler(properties).handler
    val params = propsWithStorageLevel(sparkStorageLevel)
    createDistributedStream(ssc, getKeys(params), params, messageHandler)
  }

  def getKeys(rabbitMQParams: Map[String, String]): Seq[RabbitMQDistributedKey] = {
    val items = Try(properties.getMapFromJsoneyString(DistributedPropertyKey))
      .getOrElse(Seq.empty[Map[String, String]])
    for (item <- items) yield getKey(item, rabbitMQParams)
  }

  def getKey(params: Map[String, String], rabbitMQParams: Map[String, String]): RabbitMQDistributedKey = {
    val exchangeAndRouting = ExchangeAndRouting(
      params.get(ExchangeNamePropertyKey).notBlank,
      params.get(ExchangeTypePropertyKey).notBlank,
      params.get(RoutingKeysPropertyKey).notBlank
    )
    val hosts = HostPropertyKey -> params.get(HostPropertyKey).notBlankWithDefault(HostDefaultValue)
    val queueName = params.get(QueuePropertyKey).notBlankWithDefault(QueueDefaultValue)

    RabbitMQDistributedKey(
      queueName,
      exchangeAndRouting,
      rabbitMQParams + hosts
    )
  }
}
Example 21
Source File: WordCount.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.UserVisitParser
import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.{StateSpec, State}

class WordCount() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig) = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    // Project Line to UserVisit, the output means "[IP, [Start Time, Count]]"
    val parsedLine: DStream[(String, (Long, Int))] = lines.map(line => {
      val userVisit = UserVisitParser.parse(line._2)
      (userVisit.getIp, (line._1, 1))
    })

    // Define state mapping function
    val mappingFunc = (ip: String, one: Option[(Long, Int)], state: State[Int]) => {
      if (!one.isDefined) {
        throw new Exception("input value is not defined. It should not happen as we don't use timeout function.")
      }
      val sum = one.get._2 + state.getOption.getOrElse(0)
      state.update(sum)
      (ip, one.get._1)
    }

    val wordCount = parsedLine.mapWithState(StateSpec.function(mappingFunc))

    wordCount.foreachRDD(rdd => rdd.foreachPartition(partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach { case (word, inTime) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if (config.debugMode) println(word + ": " + inTime + ", " + outTime)
      }
    }))
  }
}
Example 22
Source File: SolRSupport.scala From Taxi360 with Apache License 2.0

package com.cloudera.sa.taxi360.streaming.ingestion.solr

import java.net.{ConnectException, SocketException}
import java.util

import org.apache.solr.client.solrj.impl.CloudSolrServer
import org.apache.solr.client.solrj.request.UpdateRequest
import org.apache.solr.common.{SolrException, SolrInputDocument}
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream

object SolRSupport {

  def indexDStreamOfDocs(zkHost:String,
                         collection:String,
                         batchSize:Int,
                         docDStream:DStream[SolrInputDocument]): Unit = {
    docDStream.foreachRDD(docRdd => {
      indexDoc(zkHost, collection, batchSize, docRdd)
    })
  }

  def indexDoc(zkHost:String,
               collection:String,
               batchSize:Int,
               docRdd:RDD[SolrInputDocument]): Unit = {
    docRdd.foreachPartition(it => {
      val solrServer = CloudSolRServerBuilder.build(zkHost)

      val batch = new util.ArrayList[SolrInputDocument]()

      while (it.hasNext) {
        val inputDoc = it.next()
        batch.add(inputDoc)
        if (batch.size() >= batchSize)
          sendBatchToSolr(solrServer, collection, batch)
      }
      if (!batch.isEmpty())
        sendBatchToSolr(solrServer, collection, batch)
    })
  }

  def sendBatchToSolr( solrServer: CloudSolrServer,
                       collection:String,
                       batch:util.Collection[SolrInputDocument]) {
    val req = new UpdateRequest()
    req.setParam("collection", collection)

    req.add(batch)

    try {
      solrServer.request(req)
    } catch {
      case e:Exception => {
        if (shouldRetry(e)) {
          try {
            Thread.sleep(2000)
          } catch {
            case e1: InterruptedException => {
              Thread.interrupted()
            }
          }

          try {
            solrServer.request(req)
          } catch {
            case e1: Exception => {
              if (e1.isInstanceOf[RuntimeException]) {
                throw e1.asInstanceOf[RuntimeException]
              } else {
                throw new RuntimeException(e1)
              }
            }
          }
        } else {
          if (e.isInstanceOf[RuntimeException]) {
            throw e.asInstanceOf[RuntimeException]
          } else {
            throw new RuntimeException(e)
          }
        }
      }
    } finally {
      batch.clear()
    }
  }

  def shouldRetry( exc:Exception): Boolean = {
    val rootCause = SolrException.getRootCause(exc)
    rootCause.isInstanceOf[ConnectException] ||
      rootCause.isInstanceOf[SocketException]
  }
}
Example 23
Source File: KafkaStreamingDemo.scala From MaxCompute-Spark with Apache License 2.0

package com.aliyun.odps.spark.examples.streaming.kafka

import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaStreamingDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession
      .builder()
      .appName("KafkaStreamingDemo")
      .getOrCreate()

    val ssc = new StreamingContext(spark.sparkContext, Seconds(5))

    // Use OSS as the checkpoint storage
    ssc.checkpoint("oss://bucket/checkpointDir/")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "192.168.1.1:9200,192.168.1.2:9200,192.168.1.3:9200",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    val topics = Set("event_topic")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )

    val dstream = recordDstream.map(f => (f.key(), f.value()))
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    val wordAndOneDstream: DStream[(String, Int)] = wordsDStream.map((_, 1))
    val result: DStream[(String, Int)] = wordAndOneDstream.reduceByKey(_ + _)
    result.print()

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 24
Source File: Kafka2OdpsDemo.scala From MaxCompute-Spark with Apache License 2.0

package com.aliyun.odps.spark.examples.streaming.kafka

import com.aliyun.odps.spark.examples.streaming.common.SparkSessionSingleton
import org.apache.kafka.clients.consumer.ConsumerRecord
import org.apache.kafka.common.serialization.StringDeserializer
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.apache.spark.streaming.kafka010.{ConsumerStrategies, KafkaUtils, LocationStrategies}

object Kafka2OdpsDemo {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("test")
    val ssc = new StreamingContext(sparkConf, Seconds(10))

    // Use OSS as the checkpoint storage and change this to a valid OSS path. For OSS access details, see
    // https://github.com/aliyun/MaxCompute-Spark/wiki/08.-Oss-Access%E6%96%87%E6%A1%A3%E8%AF%B4%E6%98%8E
    ssc.checkpoint("oss://bucket/checkpointdir")

    // Kafka configuration parameters
    val kafkaParams = Map[String, Object](
      "bootstrap.servers" -> "localhost:9092",
      "key.deserializer" -> classOf[StringDeserializer],
      "value.deserializer" -> classOf[StringDeserializer],
      "group.id" -> "testGroupId",
      "auto.offset.reset" -> "latest",
      "enable.auto.commit" -> (false: java.lang.Boolean)
    )

    // Create the Kafka DStream
    val topics = Set("test")
    val recordDstream: InputDStream[ConsumerRecord[String, String]] =
      KafkaUtils.createDirectStream[String, String](
        ssc,
        LocationStrategies.PreferConsistent,
        ConsumerStrategies.Subscribe[String, String](topics, kafkaParams)
      )
    val dstream = recordDstream.map(f => (f.key(), f.value()))

    // Parse the Kafka data and write it to ODPS
    val data: DStream[String] = dstream.map(_._2)
    val wordsDStream: DStream[String] = data.flatMap(_.split(" "))
    wordsDStream.foreachRDD(rdd => {
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      rdd.toDF("id").write.mode("append").saveAsTable("test_table")
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 25
Source File: ClusteringEvaluator.scala From streamDM with Apache License 2.0

package org.apache.spark.streamdm.evaluation

import math._

import org.apache.spark.streamdm.core._
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.rdd.RDD

object ClusteringEvaluationUtil {

  def computeAllCentroids(input: RDD[(Example,Double)]): RDD[(Double,Example, Int)] =
    input.map{case (e,c) => (c,e)}.map{case (c,e) => (c,Array(e))}.
      reduceByKey((x,y) => x++y).map{case (c,e) => {
        val clSize = e.length
        val clSum = e.foldLeft(new Example(new NullInstance))(
          (a,x) => a.in match {
            case NullInstance() => new Example(x.in.map(x=>x))
            case _ => new Example(a.in.add(x.in))
          })
        if(clSize>1)
          (c,new Example(clSum.in.map(x=>x/clSize)),clSize)
        else
          (c,clSum,1)
      }}
}
Example 26
Source File: SWNearestNeighbors.scala From streamDM with Apache License 2.0

package org.apache.spark.streamdm.outlier

import com.github.javacliparser.{FlagOption, IntOption}
import org.apache.spark.internal.Logging
import org.apache.spark.streamdm.core.Example
import org.apache.spark.streamdm.core.specification.ExampleSpecification
import org.apache.spark.streaming.dstream.DStream

import scala.collection.mutable.Queue

  def outlierness(example: Example): Double = {
    val distances = window.map(p => p.in.distanceTo(example.in))

    if(!distances.isEmpty) {
      val aggDistance = distances.reduce((d1, d2) => (d1 + d2)) / distances.size

      if(debug)
        logInfo("outlierness, %f, {%s}, %s, %d".format(
          aggDistance,
          example.in.getFeatureIndexArray().map(ins => ins._1).mkString(";"),
          example.out.getFeatureIndexArray().map(ins => ins._1).mkString(" "),
          distances.size))

      aggDistance
    } else {
      0.0
    }
  }
}
Example 27
Source File: StreamingJob.scala From confluent-platform-spark-streaming with Apache License 2.0

package example

import com.typesafe.config.ConfigFactory
import io.confluent.kafka.serializers.KafkaAvroDecoder
import kafka.serializer.StringDecoder
import org.apache.log4j.{Level, Logger}
import org.apache.spark.sql.SQLContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.kafka.KafkaUtils
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apache.spark.{SparkContext, SparkConf}

object StreamingJob extends App {

  // Get job configuration
  val config = ConfigFactory.load()

  Logger.getLogger("example").setLevel(Level.toLevel(config.getString("loglevel")))
  private val logger = Logger.getLogger(getClass)

  // Spark config and contexts
  val sparkMaster = config.getString("spark.master")
  val sparkConf = new SparkConf()
    .setMaster(sparkMaster)
    .setAppName("StreamingExample")
    .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  val sc = new SparkContext(sparkConf)

  val batchInterval = config.getInt("spark.batch.interval")
  val ssc = new StreamingContext(sc, Seconds(batchInterval))

  // Create Kafka stream
  val groupId = config.getString("kafka.group.id")
  val topic = config.getString("topic")
  val kafkaParams = Map(
    "bootstrap.servers" -> config.getString("kafka.bootstrap.servers"),
    "schema.registry.url" -> config.getString("kafka.schema.registry.url"),
    "group.id" -> groupId
  )

  @transient val kafkaStream: DStream[(String, Object)] =
    KafkaUtils.createDirectStream[String, Object, StringDecoder, KafkaAvroDecoder](
      ssc, kafkaParams, Set(topic)
    )

  // Load JSON strings into DataFrame
  kafkaStream.foreachRDD { rdd =>
    // Get the singleton instance of SQLContext
    val sqlContext = SQLContext.getOrCreate(rdd.sparkContext)
    import sqlContext.implicits._

    val topicValueStrings = rdd.map(_._2.toString)
    val df = sqlContext.read.json(topicValueStrings)

    df.printSchema()
    println("DataFrame count: " + df.count())
    df.take(1).foreach(println)
  }

  ssc.start()
  ssc.awaitTermination()
}
Example 28
Source File: Repartition.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.dstream.DStream

class Repartition() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.repartition(config.coreNumber).foreachRDD(rdd => rdd.foreachPartition(partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach { case (inTime, content) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if (config.debugMode) {
          println("Event: " + inTime + ", " + outTime)
        }
      }
    }))
  }
}
Example 29
Source File: Identity.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.dstream.DStream

class Identity() extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.foreachRDD(rdd => rdd.foreachPartition(partLines => {
      val reporter = new KafkaReporter(reportTopic, brokerList)
      partLines.foreach { case (inTime, content) =>
        val outTime = System.currentTimeMillis()
        reporter.report(inTime, outTime)
        if (config.debugMode) {
          println("Event: " + inTime + ", " + outTime)
        }
      }
    }))
  }
}
Example 30
Source File: FixWindow.scala From Swallow with Apache License 2.0

package com.intel.hibench.sparkbench.streaming.application

import com.intel.hibench.common.streaming.UserVisitParser
import com.intel.hibench.common.streaming.metrics.KafkaReporter
import com.intel.hibench.sparkbench.streaming.util.SparkBenchConfig
import org.apache.spark.streaming.Duration
import org.apache.spark.streaming.dstream.DStream

class FixWindow(duration: Long, slideStep: Long) extends BenchBase {

  override def process(lines: DStream[(Long, String)], config: SparkBenchConfig): Unit = {
    val reportTopic = config.reporterTopic
    val brokerList = config.brokerList

    lines.window(Duration(duration), Duration(slideStep)).map {
      case (inTime, line) => {
        val uv = UserVisitParser.parse(line)
        (uv.getIp, (inTime, 1))
      }
    }.reduceByKey((value, result) => {
      // maintain the min time of this window and count record number
      (Math.min(value._1, result._1), value._2 + result._2)
    }).foreachRDD(rdd => rdd.foreachPartition(results => {
      // report back to kafka
      val reporter = new KafkaReporter(reportTopic, brokerList)
      val outTime = System.currentTimeMillis()

      results.foreach(res => {
        (1 to (res._2._2)).foreach { _ =>
          reporter.report(res._2._1, outTime)
          if (config.debugMode) {
            println("Event: " + res._2._1 + ", " + outTime)
          }
        }
      })
    }))
  }
}
Example 31
Source File: FlumeInput.scala From sparta with Apache License 2.0

package com.stratio.sparta.plugin.input.flume

import java.io.Serializable
import java.net.InetSocketAddress

import com.stratio.sparta.sdk.pipeline.input.Input
import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._
import org.apache.spark.sql.Row
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.flume.FlumeUtils

class FlumeInput(properties: Map[String, Serializable]) extends Input(properties) {

  val DEFAULT_FLUME_PORT = 11999
  val DEFAULT_ENABLE_DECOMPRESSION = false
  val DEFAULT_MAXBATCHSIZE = 1000
  val DEFAULT_PARALLELISM = 5

  def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = {
    if (properties.getString("type").equalsIgnoreCase("pull")) {
      FlumeUtils.createPollingStream(
        ssc,
        getAddresses,
        storageLevel(sparkStorageLevel),
        maxBatchSize,
        parallelism
      ).map(data => Row(data.event.getBody.array))
    } else {
      // push
      FlumeUtils.createStream(
        ssc,
        properties.getString("hostname"),
        properties.getString("port").toInt,
        storageLevel(sparkStorageLevel),
        enableDecompression
      ).map(data => Row(data.event.getBody.array))
    }
  }

  private def getAddresses: Seq[InetSocketAddress] =
    properties.getMapFromJsoneyString("addresses")
      .map(values => (values.get("host"), values.get("port")))
      .map {
        case (Some(address), None) =>
          new InetSocketAddress(address, DEFAULT_FLUME_PORT)
        case (Some(address), Some(port)) =>
          new InetSocketAddress(address, port.toInt)
        case _ =>
          throw new IllegalStateException(s"Invalid configuration value for addresses : ${properties.get("addresses")}")
      }

  private def enableDecompression: Boolean =
    properties.hasKey("enableDecompression") match {
      case true => properties.getBoolean("enableDecompression")
      case false => DEFAULT_ENABLE_DECOMPRESSION
    }

  private def parallelism: Int = {
    properties.hasKey("parallelism") match {
      case true => properties.getString("parallelism").toInt
      case false => DEFAULT_PARALLELISM
    }
  }

  private def maxBatchSize: Int =
    properties.hasKey("maxBatchSize") match {
      case true => properties.getString("maxBatchSize").toInt
      case false => DEFAULT_MAXBATCHSIZE
    }
}
Example 32
Source File: TwitterPopularTagsTest.scala From apache-spark-test with Apache License 2.0

package com.github.dnvriend.spark.streaming.twitter

import com.github.dnvriend.TestSpec
import com.github.dnvriend.spark.Tweet
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.dstream.{ DStream, ReceiverInputDStream }
import org.apache.spark.streaming.twitter.TwitterUtils
import org.scalatest.Ignore
import pprint.Config.Colors.PPrintConfig
import pprint._
import twitter4j.Status

// see: https://dev.twitter.com/streaming/overview
// see: https://dev.twitter.com/streaming/public
// see: https://support.twitter.com/articles/20174643
// see: https://github.com/apache/bahir/blob/master/streaming-twitter/examples/src/main/scala/org/apache/spark/examples/streaming/twitter/TwitterPopularTags.scala
// see: http://blog.originate.com/blog/2014/06/15/idiomatic-scala-your-options-do-not-match/

@Ignore
class TwitterPopularTagsTest extends TestSpec {

  it should "find popular tags" in withStreamingContext(2, await = true) { spark => ssc =>
    //    val filters = Array("#scala", "#akka", "#spark", "@scala", "@akka", "@spark")
    val filters = Array("#summercamp", "#akka", "#scala", "#fastdata", "#spark", "#hadoop")
    val stream: ReceiverInputDStream[Status] = TwitterUtils.createStream(ssc, None, filters)

    val msgs: DStream[Tweet] = stream
      .map(Tweet(_))

    msgs.foreachRDD { rdd =>
      rdd.take(10).foreach(pprint.pprintln)
    }

    val hashTags: DStream[String] = stream
      .filter(_.getLang == "en")
      .flatMap(status => status.getText.split(" ").filter(_.startsWith("#")))

    val topCounts60 = hashTags
      .map((_, 1))
      .reduceByKeyAndWindow(_ + _, Seconds(60))
      .map { case (topic, count) => (count, topic) }
      .transform(_.sortByKey(ascending = false))

    val topCounts10 = hashTags
      .map((_, 1))
      .reduceByKeyAndWindow(_ + _, Seconds(10))
      .map { case (topic, count) => (count, topic) }
      .transform(_.sortByKey(false))

    topCounts60.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 60 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    topCounts10.foreachRDD(rdd => {
      val topList = rdd.take(10)
      pprint.pprintln("\nPopular topics in last 10 seconds (%s total):".format(rdd.count()))
      topList.foreach { case (count, tag) => println("%s (%s tweets)".format(tag, count)) }
    })

    ssc.start()
  }
}
Example 33
Source File: KafkaFlowExample.scala From kafka-scala-api with Apache License 2.0

package com.example.flow

import org.apache.spark.streaming.dstream.DStream._
import org.apache.spark.streaming.dstream.{DStream, InputDStream}
import org.joda.time.DateTime
import org.json4s.DefaultFormats
import org.json4s.jackson.JsonMethods._

import scala.util.Try

case class Purchase(item_id: String, amount: BigDecimal, time: Long)
case class Key(item_id: String, time: DateTime)
case class Summary(item_id: String, time: DateTime, total: BigDecimal)

object KafkaFlowExample {
  implicit val formats = DefaultFormats

  def extract(message: String): Option[(Key, BigDecimal)] = {
    for {
      parsed <- Try(parse(message)).toOption
      purchase <- parsed.extractOpt[Purchase]
    } yield {
      val datetime = new DateTime(purchase.time)
      val roundedTime = datetime.withMinuteOfHour(0).withSecondOfMinute(0).withMillisOfSecond(0)
      Key(purchase.item_id, roundedTime) -> purchase.amount
    }
  }

  def transformStream(stream: InputDStream[String]): DStream[Summary] = {
    stream
      .flatMap(extract)
      .reduceByKey(_ + _)
      .map { case (key, amount) =>
        Summary(key.item_id, key.time, amount)
      }
  }
}
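For orientation, a hedged sketch of how `transformStream` above might be wired into a driver program: it only requires an InputDStream[String], so a socket source stands in here for the Kafka consumer used in the original project. The host, port, and batch interval are placeholders.

import com.example.flow.KafkaFlowExample
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object KafkaFlowExampleApp {
  def main(args: Array[String]): Unit = {
    val ssc = new StreamingContext(new SparkConf().setAppName("purchase-flow"), Seconds(10))
    val input = ssc.socketTextStream("localhost", 9999) // placeholder source emitting JSON purchases
    KafkaFlowExample.transformStream(input).print()     // hourly purchase summaries per item
    ssc.start()
    ssc.awaitTermination()
  }
}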
Example 34
Source File: batchStream.scala From Clustering4Ever with Apache License 2.0

package org.clustering4ever.spark.clustering.batchstream

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.rdd.RDD

import org.clustering4ever.spark.streamclustering.{Prototype, PointObj}

class BatchStream(
    var voisinage: Int,
    var decayFactor: Double,
    var lambdaAge : Double,
    var nbNodesToAdd: Int,
    var minWeight: Double,
    var maxAge: Int,
    var alphaErr: Double,
    var d: Double) extends Serializable {

  def this() = this(voisinage = 0, decayFactor = 0.9, lambdaAge = 1.2, nbNodesToAdd = 3,
    minWeight = 1, maxAge = 250, alphaErr = 0.5, d = 0.99)

  var model: BatchStreamModel = new BatchStreamModel()

  def getModel: BatchStreamModel = model

  // Initializing the model.
  def initModelObj(txt: RDD[Array[Double]], dim: Int): BatchStream = {
    val nodes2 = txt.take(2)
    val node1 = nodes2(0)
    val node2 = nodes2(1)
    model.init2NodesObj(node1, node2, dim, 1)
    this
  }

  // Training on the model.
  def trainOnObj(data: DStream[PointObj], gstream: BatchStream, dirSortie: String, dim: Int, nbWind: Int) = {
    val timeUpdates = ArrayBuffer[Long](0L)
    var kk = 1
    data.foreachRDD{ rdd =>
      if ( rdd.count() > 0 ) {
        val initialTimeUpdate = System.currentTimeMillis()
        println("\n<<<<<<<<<<<<<<<< >>>>>>>>>>>>>>>--BatchStream--(batch: " + kk + " )..." +
          " rdd.count: " + rdd.count() + " \n")

        // Update model without var affectation
        model = model.updateObj(rdd, gstream, kk, dim)

        timeUpdates += (timeUpdates(timeUpdates.size - 1) + (System.currentTimeMillis() - initialTimeUpdate))
        if (timeUpdates.length > 100) timeUpdates.remove(0)

        if ( (kk == 1) | (kk == nbWind / 9) | (kk == 2 * nbWind / 9) | (kk == 3 * nbWind / 9) |
          (kk == 4 * nbWind / 9) | (kk == 5 * nbWind / 9) | (kk == 6 * nbWind / 9) |
          (kk == 7 * nbWind / 9) | (kk == 8 * nbWind / 9) |
          (kk > (8 * nbWind / 9) + 10 & kk % 10 == 0) | (kk >= nbWind - 2) ) {
          rdd.context.parallelize(model.toStringProto).saveAsTextFile(dirSortie+"/Prototypes-"+kk)
          rdd.context.parallelize(model.toStringOutdatedProto).saveAsTextFile(dirSortie+"/OutdatedProtos-"+kk)
          rdd.context.parallelize(model.edges).saveAsTextFile(dirSortie+"/Edges-"+kk)
          rdd.context.parallelize(model.clusterWeights).saveAsTextFile(dirSortie+"/Weights-"+kk)
          rdd.context.parallelize(timeUpdates).saveAsTextFile(dirSortie+"/timeUpdates-"+kk)
        }

        kk += 1
      }
      else println("-- BatchStream: empty rdd -- rdd.count : "+rdd.count())
    }

    model
  }
}
Example 35
Source File: MetricImplicits.scala From Mastering-Spark-for-Data-Science with MIT License

package io.gzet.timeseries.timely

import java.io.PrintStream
import java.net.Socket
import java.nio.charset.StandardCharsets

import io.gzet.timeseries.SimpleConfig
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.{Logging, Partitioner}

object MetricImplicits extends Logging with SimpleConfig {

  def nonNegativeMod(x: Int, mod: Int): Int = {
    val rawMod = x % mod
    rawMod + (if (rawMod < 0) mod else 0)
  }

  class MetricPartitioner(partitions: Int) extends Partitioner {
    require(partitions >= 0, s"Number of partitions ($partitions) cannot be negative.")

    override def numPartitions: Int = partitions

    override def getPartition(key: Any): Int = {
      val k = key.asInstanceOf[MetricKey]
      nonNegativeMod(k.metricName.hashCode, partitions)
    }
  }

  implicit class Metrics(rdd: RDD[Metric]) {

    val partitions = rdd.partitions.length
    val partitioner = new MetricPartitioner(partitions)

    def publish() = {
      val sSortedMetricRDD = rdd filter { metric =>
        metric.tags.nonEmpty
      } map { metric =>
        (MetricKey(metric.name, metric.time), metric)
      } repartitionAndSortWithinPartitions partitioner

      sSortedMetricRDD.values foreachPartition { it: Iterator[Metric] =>
        val sock = new Socket(timelyHost, timelyPort)
        val writer = new PrintStream(sock.getOutputStream, true, StandardCharsets.UTF_8.name)

        it foreach { metric =>
          writer.println(metric.toPut)
        }

        writer.flush()
      }
    }
  }

  implicit class MetricStream(stream: DStream[Metric]) {
    def publish() = {
      stream foreachRDD {
        rdd => rdd.publish()
      }
    }
  }
}

case class Metric(name: String, time: Long, value: Double, tags: Map[String, String], viz: Option[String] = None) {
  def toPut = {
    val vizMap = if(viz.isDefined) List("viz" -> viz.get) else List[(String, String)]()
    val strTags = vizMap.union(tags.toList).map({ case (k, v) =>
      s"$k=$v"
    }).mkString(" ")
    s"put $name $time $value $strTags"
  }
}

case class MetricKey(metricName: String, metricTime: Long)

object MetricKey {
  implicit def orderingByMetricDate[A <: MetricKey] : Ordering[A] = {
    Ordering.by(fk => (fk.metricName, fk.metricTime))
  }
}
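As a follow-up, a hedged sketch of how the implicit publish() extension above might be invoked from a driver; the metric names, tag values, and the Timely endpoint resolved through SimpleConfig are assumptions for illustration only.

import io.gzet.timeseries.timely.Metric
import io.gzet.timeseries.timely.MetricImplicits._
import org.apache.spark.{SparkConf, SparkContext}

object PublishExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("timely-publish"))
    val now = System.currentTimeMillis()
    // Hypothetical metrics; tags must be non-empty or publish() filters them out
    val metrics = sc.parallelize(Seq(
      Metric("cpu.load", now, 0.75, Map("host" -> "node1")),
      Metric("cpu.load", now, 0.42, Map("host" -> "node2"))
    ))
    metrics.publish() // partitions by metric name and writes "put ..." lines to the Timely socket
  }
}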
Example 36
Source File: GdeltTagger.scala From Mastering-Spark-for-Data-Science with MIT License

package io.gzet.tagging.gdelt

import java.text.SimpleDateFormat
import java.util.Date

import com.typesafe.config.ConfigFactory
import io.gzet.tagging.classifier.Classifier
import io.gzet.tagging.html.HtmlHandler
import io.gzet.tagging.html.HtmlHandler.Content
import org.apache.spark.Accumulator
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.LongAccumulator
import org.elasticsearch.spark._

class GdeltTagger() extends Serializable {

  val config = ConfigFactory.load().getConfig("io.gzet.kappa")
  val isoSdf = "yyyy-MM-dd HH:mm:ss"
  val esIndex = config.getString("gdeltIndex")
  val vectorSize = config.getInt("vectorSize")
  val minProba = config.getDouble("minProba")

  def predict(gdeltStream: DStream[String], batchId: LongAccumulator) = {

    // Extract HTML content
    val gdeltContent = fetchHtmlContent(gdeltStream)

    // Predict each RDD
    gdeltContent foreachRDD { batch =>

      batch.cache()
      val count = batch.count()

      if (count > 0) {

        if (Classifier.model.isDefined) {
          val labels = Classifier.model.get.labels

          // Predict HashTags using latest Twitter model
          val textRdd = batch.map(_.body.get)
          val predictions = Classifier.predictProbabilities(textRdd)
          val taggedGdelt = batch.zip(predictions) map { case (content, probabilities) =>
            val validLabels = probabilities filter { case (label, probability) =>
              probability > minProba
            }

            val labels = validLabels.toSeq
              .sortBy(_._2)
              .reverse
              .map(_._1)

            (content, labels)
          }

          // Saving articles to Elasticsearch
          taggedGdelt map { case (content, hashTags) =>
            gdeltToJson(content, hashTags.toArray)
          } saveToEs esIndex

        } else {

          // Saving articles to Elasticsearch
          batch map { content =>
            gdeltToJson(content, Array())
          } saveToEs esIndex
        }
      }

      batch.unpersist(blocking = false)
    }
  }

  private def gdeltToJson(content: Content, hashTags: Array[String]) = {
    val sdf = new SimpleDateFormat(isoSdf)
    Map(
      "time" -> sdf.format(new Date()),
      "body" -> content.body.get,
      "url" -> content.url,
      "tags" -> hashTags,
      "title" -> content.title
    )
  }

  private def fetchHtmlContent(urlStream: DStream[String]) = {
    urlStream.map(_ -> 1).groupByKey().map(_._1) mapPartitions { urls =>
      val sdf = new SimpleDateFormat(isoSdf)
      val htmlHandler = new HtmlHandler()
      val goose = htmlHandler.getGooseScraper
      urls map { url =>
        htmlHandler.fetchUrl(goose, url, sdf)
      }
    } filter { content =>
      content.isDefined &&
        content.get.body.isDefined &&
        content.get.body.get.length > 255
    } map { content =>
      content.get
    }
  }
}
Example 37
Source File: KappaTagging.scala From Mastering-Spark-for-Data-Science with MIT License | 5 votes |
package io.gzet.tagging import com.typesafe.config.ConfigFactory import io.gzet.tagging.gdelt.GdeltTagger import io.gzet.tagging.twitter.TwitterHIS import org.apache.spark.SparkConf import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka.KafkaUtils import org.apache.spark.streaming.twitter.TwitterUtils import org.apache.spark.streaming.{Seconds, StreamingContext} import twitter4j.Status import twitter4j.auth.OAuthAuthorization import twitter4j.conf.ConfigurationBuilder object KappaTagging { final val config = ConfigFactory.load().getConfig("io.gzet.kappa") final val esNodes = config.getString("esNodes") final val batchSize = config.getInt("batchSize") def main(args: Array[String]) = { val sparkConf = new SparkConf().setAppName("GDELT Kappa tagging") val ssc = new StreamingContext(sparkConf, Seconds(batchSize)) val sc = ssc.sparkContext // Create a counter that can be shared accross batches val batchId = sc.longAccumulator("GZET") val twitterStream = createTwitterStream(ssc, Array[String]()) val twitterProcessor = new TwitterHIS() twitterProcessor.train(twitterStream, batchId) val gdeltStream = createGdeltStream(ssc) val gdeltProcessor = new GdeltTagger() gdeltProcessor.predict(gdeltStream, batchId) ssc.start() ssc.awaitTermination() } private def createTwitterStream(ssc: StreamingContext, filters: Array[String]): DStream[Status] = { TwitterUtils.createStream( ssc, getTwitterConfiguration, filters ) } private def getTwitterConfiguration = { val builder = new ConfigurationBuilder() builder.setOAuthConsumerKey(config.getString("apiKey")) builder.setOAuthConsumerSecret(config.getString("apiSecret")) builder.setOAuthAccessToken(config.getString("tokenKey")) builder.setOAuthAccessTokenSecret(config.getString("tokenSecret")) val configuration = builder.build() Some(new OAuthAuthorization(configuration)) } private def createGdeltStream(ssc: StreamingContext) = { val topics = Map( config.getString("kafkaTopic") -> config.getInt("kafkaTopicPartition") ) KafkaUtils.createStream( ssc, config.getString("zkQuorum"), config.getString("kafkaGroupId"), topics ).map(_._2) } }
Example 38
Source File: TrendingHashtags.scala From dataproc-pubsub-spark-streaming with Apache License 2.0 | 5 votes |
package demo import java.nio.charset.StandardCharsets import com.google.cloud.datastore._ import demo.DataStoreConverter.saveRDDtoDataStore import demo.HashTagsStreaming.processTrendingHashTags import org.apache.spark.SparkConf import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.pubsub.{PubsubUtils, SparkGCPCredentials} import org.apache.spark.streaming.{Seconds, StreamingContext} object TrendingHashtags { def createContext(projectID: String, windowLength: String, slidingInterval: String, checkpointDirectory: String) : StreamingContext = { // [START stream_setup] val sparkConf = new SparkConf().setAppName("TrendingHashtags") val ssc = new StreamingContext(sparkConf, Seconds(slidingInterval.toInt)) // Set the checkpoint directory val yarnTags = sparkConf.get("spark.yarn.tags") val jobId = yarnTags.split(",").filter(_.startsWith("dataproc_job")).head ssc.checkpoint(checkpointDirectory + '/' + jobId) // Create stream val messagesStream: DStream[String] = PubsubUtils .createStream( ssc, projectID, None, "tweets-subscription", // Cloud Pub/Sub subscription for incoming tweets SparkGCPCredentials.builder.build(), StorageLevel.MEMORY_AND_DISK_SER_2) .map(message => new String(message.getData(), StandardCharsets.UTF_8)) // [END stream_setup] //process the stream processTrendingHashTags(messagesStream, windowLength.toInt, slidingInterval.toInt, 10, //decoupled handler that saves each separate result for processed to datastore saveRDDtoDataStore(_, windowLength.toInt) ) ssc } def main(args: Array[String]): Unit = { if (args.length != 5) { System.err.println( """ | Usage: TrendingHashtags <projectID> <windowLength> <slidingInterval> <totalRunningTime> | | <projectID>: ID of Google Cloud project | <windowLength>: The duration of the window, in seconds | <slidingInterval>: The interval at which the window calculation is performed, in seconds | <totalRunningTime>: Total running time for the application, in minutes. If 0, runs indefinitely until termination. | <checkpointDirectory>: Directory used to store RDD checkpoint data | """.stripMargin) System.exit(1) } val Seq(projectID, windowLength, slidingInterval, totalRunningTime, checkpointDirectory) = args.toSeq // Create Spark context val ssc = StreamingContext.getOrCreate(checkpointDirectory, () => createContext(projectID, windowLength, slidingInterval, checkpointDirectory)) // Start streaming until we receive an explicit termination ssc.start() if (totalRunningTime.toInt == 0) { ssc.awaitTermination() } else { ssc.awaitTerminationOrTimeout(1000 * 60 * totalRunningTime.toInt) } } }
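The fault-tolerance hinge in this example is StreamingContext.getOrCreate combined with a checkpoint directory. A stripped-down sketch of that recovery pattern, with a socket source standing in for Cloud Pub/Sub (paths, host and port are placeholders):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object CheckpointRecoverySketch extends App {
  val checkpointDir = "/tmp/streaming-checkpoint"

  def createContext(): StreamingContext = {
    val ssc = new StreamingContext(
      new SparkConf().setAppName("checkpoint-sketch").setMaster("local[2]"), Seconds(10))
    ssc.checkpoint(checkpointDir)
    // All DStream wiring must live inside the factory so the graph can be
    // rebuilt from the checkpoint after a driver restart
    ssc.socketTextStream("localhost", 9999)
      .flatMap(_.split("\\s+"))
      .countByValue()
      .print()
    ssc
  }

  // Recovers the streaming graph from the checkpoint if one exists, otherwise builds it fresh
  val ssc = StreamingContext.getOrCreate(checkpointDir, () => createContext())
  ssc.start()
  ssc.awaitTermination()
}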
Example 39
Source File: HashTagsStreaming.scala From dataproc-pubsub-spark-streaming with Apache License 2.0 | 5 votes |
package demo import org.apache.spark.rdd.RDD import org.apache.spark.streaming.Seconds import org.apache.spark.streaming.dstream.DStream object HashTagsStreaming { case class Popularity(tag: String, amount: Int) // [START extract] private[demo] def extractTrendingTags(input: RDD[String]): RDD[Popularity] = input.flatMap(_.split("\\s+")) // Split on any white character .filter(_.startsWith("#")) // Keep only the hashtags // Remove punctuation, force to lowercase .map(_.replaceAll("[,.!?:;]", "").toLowerCase) // Remove the first # .map(_.replaceFirst("^#", "")) .filter(!_.isEmpty) // Remove any non-words .map((_, 1)) // Create word count pairs .reduceByKey(_ + _) // Count occurrences .map(r => Popularity(r._1, r._2)) // Sort hashtags by descending number of occurrences .sortBy(r => (-r.amount, r.tag), ascending = true) // [END extract] def processTrendingHashTags(input: DStream[String], windowLength: Int, slidingInterval: Int, n: Int, handler: Array[Popularity] => Unit): Unit = { val sortedHashtags: DStream[Popularity] = input .window(Seconds(windowLength), Seconds(slidingInterval)) //create a window .transform(extractTrendingTags(_)) //apply transformation sortedHashtags.foreachRDD(rdd => { handler(rdd.take(n)) //take top N hashtags and save to external source }) } }
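Because extractTrendingTags is a plain RDD transformation, it can be sanity-checked in a batch job before wiring it into a window. A sketch (kept in package demo so the private[demo] method is visible; the sample tweets are made up):

package demo

import org.apache.spark.{SparkConf, SparkContext}

object ExtractTagsCheck extends App {
  val sc = new SparkContext(new SparkConf().setAppName("tags-check").setMaster("local[2]"))
  val tweets = sc.parallelize(Seq("Loving #Spark!", "#spark and #Scala", "no tags here"))
  // Expected: Popularity(spark,2) then Popularity(scala,1), ordered by descending count
  HashTagsStreaming.extractTrendingTags(tweets).collect().foreach(println)
  sc.stop()
}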
Example 40
Source File: package.scala From infinispan-spark with Apache License 2.0 | 5 votes |
package org.infinispan.spark import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd} import org.apache.spark.streaming.dstream.DStream import org.infinispan.client.hotrod.RemoteCacheManager import org.infinispan.spark.config.ConnectorConfiguration import org.infinispan.spark.rdd.RemoteCacheManagerBuilder package object stream { implicit class InfinispanDStream[K, V](stream: DStream[(K, V)]) { private def getCacheManager(configuration: ConnectorConfiguration): RemoteCacheManager = { val rcm = RemoteCacheManagerBuilder.create(configuration) stream.context.sparkContext.addSparkListener(new SparkListener { override def onJobEnd(jobEnd: SparkListenerJobEnd) = rcm.stop() }) rcm } def writeToInfinispan(configuration: ConnectorConfiguration) = { val rcm = getCacheManager(configuration) val cache = getCache(configuration, rcm) val topologyConfig = getCacheTopology(cache.getCacheTopologyInfo) configuration.setServerList(topologyConfig) stream.foreachRDD(_.writeToInfinispan(configuration)) } } }
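A usage sketch for writeToInfinispan. Only the setServerList setter shown in the listing is used; the no-arg ConnectorConfiguration constructor and the server address are assumptions, and cache selection is left to the connector's defaults:

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.stream._

object InfinispanStreamSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("ispn-sketch").setMaster("local[2]"), Seconds(5))

  // Key each line by its hash, keep the raw line as the value
  val pairs = ssc.socketTextStream("localhost", 9999).map(line => line.hashCode -> line)

  val cfg = new ConnectorConfiguration()   // assumed no-arg constructor
  cfg.setServerList("127.0.0.1:11222")     // assumed local Hot Rod endpoint

  // InfinispanDStream from the package object above adds writeToInfinispan to DStream[(K, V)]
  pairs.writeToInfinispan(cfg)

  ssc.start()
  ssc.awaitTermination()
}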
Example 41
Source File: DStreamKafkaWriter.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import org.apache.kafka.clients.producer.{Callback, ProducerRecord} import org.apache.spark.streaming.dstream.DStream import scala.reflect.ClassTag class DStreamKafkaWriter[T: ClassTag](@transient private val dStream: DStream[T]) extends KafkaWriter[T] with Serializable { override def writeToKafka[K, V]( producerConfig: Map[String, Object], transformFunc: T => ProducerRecord[K, V], callback: Option[Callback] = None ): Unit = dStream.foreachRDD { rdd => val rddWriter = new RDDKafkaWriter[T](rdd) rddWriter.writeToKafka(producerConfig, transformFunc, callback) } }
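Sketch of calling writeToKafka on a DStream. The import that exposes the enrichment is not shown in the listing, so this sketch sits in the library's own package (as the spec in Example 50 below does); the broker address and topic are placeholders:

package com.github.benfradet.spark.kafka.writer

import org.apache.kafka.clients.producer.ProducerRecord
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object DStreamKafkaWriterSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("skr-sketch").setMaster("local[2]"), Seconds(5))

  val producerConfig = Map[String, Object](
    "bootstrap.servers" -> "localhost:9092",
    "key.serializer" -> "org.apache.kafka.common.serialization.StringSerializer",
    "value.serializer" -> "org.apache.kafka.common.serialization.StringSerializer"
  )

  // Each line of the socket stream becomes one Kafka record on "demo-topic"
  ssc.socketTextStream("localhost", 9999)
    .writeToKafka(producerConfig, s => new ProducerRecord[String, String]("demo-topic", s))

  ssc.start()
  ssc.awaitTermination()
}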
Example 42
Source File: HttpInputDStreamAsync.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import com.ning.http.client.AsyncCompletionHandler import com.ning.http.client.AsyncHttpClient import com.ning.http.client.Response class HttpInputDStreamAsync( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiverAsync(storageLevel, url) } } class HttpReceiverAsync( storageLevel: StorageLevel, url: String) extends Receiver[String](storageLevel) with Logging { var asyncHttpClient: AsyncHttpClient = _ def onStop() { asyncHttpClient.close() logInfo("Disconnected from Http Server") } def onStart() { asyncHttpClient = new AsyncHttpClient() asyncHttpClient.prepareGet(url).execute(new AsyncCompletionHandler[Response]() { override def onCompleted(response: Response): Response = { store(response.getResponseBody) return response } override def onThrowable(t: Throwable) { restart("Error! Problems while connecting", t) } }); logInfo("Http Connection initiated") } } object HttpUtilsAsync { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String): DStream[String] = { new HttpInputDStreamAsync(ssc, storageLevel, url) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url) } }
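Sketch of consuming the async HTTP receiver through HttpUtilsAsync.createStream (the URL and the per-batch processing are illustrative):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}
import org.apress.prospark.HttpUtilsAsync

object HttpAsyncSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("http-async").setMaster("local[2]"), Seconds(10))

  // storageLevel keeps its default; only the mandatory arguments are passed
  val responses = HttpUtilsAsync.createStream(ssc, url = "http://localhost:8080/feed")

  responses.map(_.length).print()   // report response payload sizes per batch

  ssc.start()
  ssc.awaitTermination()
}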
Example 43
Source File: HivemallStreamingOps.scala From hivemall-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import scala.reflect.ClassTag import org.apache.spark.ml.feature.HmLabeledPoint import org.apache.spark.rdd.RDD import org.apache.spark.sql.{Row, DataFrame, SQLContext} import org.apache.spark.streaming.dstream.DStream final class HivemallStreamingOps(ds: DStream[HmLabeledPoint]) { def predict[U: ClassTag](f: DataFrame => DataFrame)(implicit sqlContext: SQLContext) : DStream[Row] = { ds.transform[Row] { rdd: RDD[HmLabeledPoint] => f(sqlContext.createDataFrame(rdd)).rdd } } } object HivemallStreamingOps { implicit def dataFrameToHivemallStreamingOps(ds: DStream[HmLabeledPoint]) : HivemallStreamingOps = { new HivemallStreamingOps(ds) } }
Example 44
Source File: StreamingTask.scala From spark-cassandra-stress with Apache License 2.0 | 5 votes |
package com.datastax.sparkstress import java.util.concurrent.TimeUnit import com.datastax.spark.connector.cql.CassandraConnector import com.datastax.spark.connector.streaming._ import com.datastax.sparkstress.RowGenerator.PerfRowGenerator import com.datastax.sparkstress.RowTypes._ import com.datastax.sparkstress.SparkStressImplicits._ import com.datastax.sparkstress.StressTask._ import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{StreamingContext, _} import scala.reflect.ClassTag abstract class StreamingTask[rowType]( val config: Config, val ss: SparkSession) (implicit ct:ClassTag[rowType]) extends StressTask { val ssc = new StreamingContext(ss.sparkContext, Seconds(config.streamingBatchIntervalSeconds)) val opsPerBatch = (config.numReceivers * config.receiverThroughputPerBatch) val estimatedReqRuntime: Long = ((config.totalOps / opsPerBatch) * config.streamingBatchIntervalSeconds) + 10 val terminationTime: Long = { if (config.terminationTimeMinutes == 0) { estimatedReqRuntime } else { val newTerminationTime: Long = TimeUnit.MINUTES.toSeconds(config.terminationTimeMinutes) if (estimatedReqRuntime <= newTerminationTime) { println(s"Using the estimated runtime (${estimatedReqRuntime} secs}) required to stream ${config.totalOps} since it is <= the requested runtime (${newTerminationTime} secs).") estimatedReqRuntime } else { println(s"Converting requested runtime of ${config.terminationTimeMinutes} min to ${newTerminationTime} secs.") newTerminationTime } } } def setupCQL() = { val cc = CassandraConnector(ss.sparkContext.getConf) cc.withSessionDo { session => if (config.deleteKeyspace) { println(s"Destroying Keyspace") session.execute(s"DROP KEYSPACE IF EXISTS ${config.keyspace}") } val kscql = getKeyspaceCql(config.keyspace, getLocalDC(cc), config.replicationFactor) val tbcql = getTableCql(config.table) println( s"""Running the following create statements\n$kscql\n${tbcql.mkString("\n")}""") session.execute(kscql) session.execute(s"USE ${config.keyspace}") for (cql <- tbcql) session.execute(cql) } printf("Done Setting up CQL Keyspace/Table\n") } def getTableCql(tbName: String): Seq[String] override def getGenerator: RowGenerator[PerfRowClass] = generator override def dstreamOps(dstream: DStream[PerfRowClass]): Unit = dstream.saveToCassandra(config.keyspace, config.table) }
Example 45
Source File: SparkStreamAdapterExample.scala From eventuate with Apache License 2.0 | 5 votes |
package com.rbmhtechnology.example.spark //#spark-stream-adapter import com.rbmhtechnology.eventuate._ import com.rbmhtechnology.eventuate.adapter.spark.SparkStreamAdapter import org.apache.spark._ import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.DStream //# import akka.actor._ import com.rbmhtechnology.eventuate.log.EventLogWriter import com.rbmhtechnology.eventuate.log.leveldb.LeveldbEventLog import scala.collection.immutable._ import scala.io.Source object SparkStreamAdapterExample extends App { implicit val system: ActorSystem = ActorSystem(ReplicationConnection.DefaultRemoteSystemName) val logName: String = "L" val endpoint: ReplicationEndpoint = new ReplicationEndpoint(id = "1", logNames = Set(logName), logFactory = logId => LeveldbEventLog.props(logId), connections = Set()) val log: ActorRef = endpoint.logs(logName) val writer: EventLogWriter = new EventLogWriter("writer", log) endpoint.activate() //#spark-stream-adapter val sparkConfig = new SparkConf(true) .setAppName("adapter") .setMaster("local[4]") val sparkContext = new SparkContext(sparkConfig) val sparkStreamingContext = new StreamingContext(sparkContext, Seconds(1)) // Create an Eventuate Spark stream adapter val sparkStreamAdapter = new SparkStreamAdapter( sparkStreamingContext, system.settings.config) // Create a DStream from event log L by connecting to its replication endpoint val stream: DStream[DurableEvent] = sparkStreamAdapter.eventStream( id = "s1", host = "127.0.0.1", port = 2552, logName = "L", fromSequenceNr = 1L, storageLevel = StorageLevel.MEMORY_ONLY) // For processing in strict event storage order, use repartition(1) stream.repartition(1).foreachRDD(rdd => rdd.foreach(println)) // Start event stream processing sparkStreamingContext.start() //# // Generate new events from stdin val lines = Source.stdin.getLines() def prompt(): Unit = { if (lines.hasNext) lines.next() match { case "exit" => sparkStreamingContext.stop(stopSparkContext = true) system.terminate() case line => writer.write(Seq(line)) prompt() } } prompt() }
Example 46
Source File: MSNBCStreamingAdvanced.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingAdvanced extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawEvents: DStream[String] = ssc.socketTextStream("localhost", 9999) val events: DStream[(Int, String)] = rawEvents.map(line => line.split(": ")) .map(kv => (kv(0).toInt, kv(1))) val countIds = events.map(e => (e._1, 1)) val counts: DStream[(Int, Int)] = countIds.reduceByKey(_ + _) def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] = { Some(runningCount.getOrElse(0) + newValues.sum) } val runningCounts = countIds.updateStateByKey[Int](updateFunction _) val duration = Seconds(20) val slide = Seconds(10) val rawSequences: DStream[(Int, String)] = events .reduceByKeyAndWindow((v1: String, v2: String) => v1 + " " + v2, duration, slide) val sequences: DStream[Array[Array[Int]]] = rawSequences.map(_._2) .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
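Note that updateStateByKey requires a checkpoint directory to be set before the context starts. A minimal, self-contained sketch of the running-count piece with checkpointing enabled (host, port and path are placeholders; the input format mirrors the "<id>: <page>" lines assumed above):

import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object RunningCountSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("running-count").setMaster("local[2]"), Seconds(10))
  ssc.checkpoint("/tmp/msnbc-checkpoint")   // required by stateful operations

  val countIds = ssc.socketTextStream("localhost", 9999)
    .map(_.split(": "))
    .map(kv => (kv(0).toInt, 1))

  def updateFunction(newValues: Seq[Int], runningCount: Option[Int]): Option[Int] =
    Some(runningCount.getOrElse(0) + newValues.sum)

  countIds.updateStateByKey[Int](updateFunction _).print()

  ssc.start()
  ssc.awaitTermination()
}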
Example 47
Source File: MSNBCStreamingExample.scala From Mastering-Machine-Learning-with-Spark-2.x with MIT License | 5 votes |
package com.github.maxpumperla.ml_spark.streaming import org.apache.spark.mllib.fpm.PrefixSpan import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.{SparkConf, SparkContext} object MSNBCStreamingExample extends App { val conf = new SparkConf() .setAppName("MSNBC data initial streaming example") .setMaster("local[4]") val sc = new SparkContext(conf) val ssc = new StreamingContext(sc, batchDuration = Seconds(10)) val transactions: RDD[Array[Int]] = sc.textFile("src/main/resources/msnbc990928.seq") map { line => line.split(" ").map(_.toInt) } val trainSequences: RDD[Array[Array[Int]]] = transactions.map(_.map(Array(_))).cache() val prefixSpan = new PrefixSpan().setMinSupport(0.005).setMaxPatternLength(15) val psModel = prefixSpan.run(trainSequences) val freqSequences = psModel.freqSequences.map(_.sequence).collect() val rawSequences: DStream[String] = ssc.socketTextStream("localhost", 9999) val sequences: DStream[Array[Array[Int]]] = rawSequences .map(line => line.split(" ").map(_.toInt)) .map(_.map(Array(_))) print(">>> Analysing new batch of data") sequences.foreachRDD( rdd => rdd.foreach( array => { println(">>> Sequence: ") println(array.map(_.mkString("[", ", ", "]")).mkString("[", ", ", "]")) freqSequences.count(_.deep == array.deep) match { case count if count > 0 => println("is frequent!") case _ => println("is not frequent.") } } ) ) print(">>> done") ssc.start() ssc.awaitTermination() }
Example 48
Source File: PointDStreamExtensionsSpec.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.extensions import com.holdenkarau.spark.testing.StreamingActionBase import com.pygmalios.reactiveinflux.spark._ import com.pygmalios.reactiveinflux._ import org.apache.spark.streaming.dstream.DStream import org.junit.runner.RunWith import org.scalatest.BeforeAndAfterAll import org.scalatest.junit.JUnitRunner @RunWith(classOf[JUnitRunner]) class PointDStreamExtensionsSpec extends StreamingActionBase with BeforeAndAfterAll { import PointRDDExtensionsSpec._ override def beforeAll: Unit = { super.beforeAll withInflux(_.create()) } override def afterAll: Unit = { withInflux(_.drop()) super.afterAll } test("write single point to Influx") { val points = List(point1) // Execute runAction(Seq(points), (dstream: DStream[Point]) => dstream.saveToInflux()) // Assert val result = withInflux( _.query(Query(s"SELECT * FROM $measurement1")).result.singleSeries) assert(result.rows.size == 1) val row = result.rows.head assert(row.time == point1.time) assert(row.values.size == 5) } }
Example 49
Source File: StreamingExample.scala From reactiveinflux-spark with Apache License 2.0 | 5 votes |
package com.pygmalios.reactiveinflux.spark.examples import com.pygmalios.reactiveinflux._ import com.pygmalios.reactiveinflux.spark._ import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.{Seconds, StreamingContext} import org.joda.time.DateTime import scala.concurrent.duration._ object StreamingExample extends App { val conf = new SparkConf() .setMaster("local[*]") .setAppName("Example") val ssc = new StreamingContext(conf, Seconds(1)) val point1 = Point( time = DateTime.now(), measurement = "measurement1", tags = Map( "tagKey1" -> "tagValue1", "tagKey2" -> "tagValue2"), fields = Map( "fieldKey1" -> "fieldValue1", "fieldKey2" -> 10.7) ) // Provide settings for reactiveinflux implicit val params = ReactiveInfluxDbName("example") implicit val awaitAtMost = 1.second // Create DStream of Influx points val queue = new scala.collection.mutable.Queue[RDD[Point]] val queueStream: DStream[Point] = ssc.queueStream(queue) // Add single RDD with a single Influx point to the DStream queue.enqueue(ssc.sparkContext.parallelize(Seq(point1))) // Save DStream to Influx queueStream.saveToInflux() // Start Spark streaming ssc.start() ssc.awaitTermination() }
Example 50
Source File: DStreamKafkaWriterSpec.scala From spark-kafka-writer with Apache License 2.0 | 5 votes |
package com.github.benfradet.spark.kafka.writer import org.apache.kafka.clients.producer._ import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream import scala.collection.mutable import scala.concurrent.duration._ class DStreamKafkaWriterSpec extends SKRSpec { "a DStreamKafkaWriter" when { "given a dstream" should { "write its content to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s) ) val results = collect(ssc, localTopic) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { results shouldBe msgs } } "trigger a given callback for every write to Kafka" in { val localTopic = topic val msgs = (1 to 10).map(_.toString) val stream = createDStream(msgs) stream.writeToKafka( producerConfig, s => new ProducerRecord[String, String](localTopic, s), Some(new Callback with Serializable { override def onCompletion(metadata: RecordMetadata, exception: Exception): Unit = { SKRSpec.callbackTriggerCount.incrementAndGet() } }) ) ssc.start() eventually(timeout(30.seconds), interval(1.second)) { SKRSpec.callbackTriggerCount.get() shouldBe msgs.size } } } } private def createDStream(seq: Seq[String]): DStream[String] = { val q = mutable.Queue.empty[RDD[String]] q.enqueue(ssc.sparkContext.makeRDD(seq)) ssc.queueStream(q) } }
Example 51
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark import java.util.Timer import java.util.TimerTask import scala.reflect.ClassTag import org.apache.http.client.methods.HttpGet import org.apache.http.impl.client.CloseableHttpClient import org.apache.http.impl.client.HttpClients import org.apache.http.util.EntityUtils import org.apache.spark.Logging import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.api.java.JavaDStream import org.apache.spark.streaming.api.java.JavaDStream.fromDStream import org.apache.spark.streaming.api.java.JavaStreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver class HttpInputDStream( @transient ssc_ : StreamingContext, storageLevel: StorageLevel, url: String, interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging { def getReceiver(): Receiver[String] = { new HttpReceiver(storageLevel, url, interval) } } class HttpReceiver( storageLevel: StorageLevel, url: String, interval: Long) extends Receiver[String](storageLevel) with Logging { var httpClient: CloseableHttpClient = _ var trigger: Timer = _ def onStop() { httpClient.close() logInfo("Disconnected from Http Server") } def onStart() { httpClient = HttpClients.createDefault() trigger = new Timer() trigger.scheduleAtFixedRate(new TimerTask { def run() = doGet() }, 0, interval * 1000) logInfo("Http Receiver initiated") } def doGet() { logInfo("Fetching data from Http source") val response = httpClient.execute(new HttpGet(url)) try { val content = EntityUtils.toString(response.getEntity()) store(content) } catch { case e: Exception => restart("Error! Problems while connecting", e) } finally { response.close() } } } object HttpUtils { def createStream( ssc: StreamingContext, storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2, url: String, interval: Long): DStream[String] = { new HttpInputDStream(ssc, storageLevel, url, interval) } def createStream( jssc: JavaStreamingContext, storageLevel: StorageLevel, url: String, interval: Long): JavaDStream[String] = { implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]] createStream(jssc.ssc, storageLevel, url, interval) } }
Example 52
Source File: SolRSupport.scala From Taxi360 with Apache License 2.0 | 5 votes |
package com.hadooparchitecturebook.taxi360.streaming.ingestion.solr import java.net.{ConnectException, SocketException} import java.util import org.apache.solr.client.solrj.impl.CloudSolrServer import org.apache.solr.client.solrj.request.UpdateRequest import org.apache.solr.common.{SolrException, SolrInputDocument} import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.DStream object SolRSupport { def indexDStreamOfDocs(zkHost:String, collection:String, batchSize:Int, docDStream:DStream[SolrInputDocument]): Unit ={ docDStream.foreachRDD(docRdd => { indexDoc(zkHost, collection, batchSize, docRdd) }) } def indexDoc(zkHost:String, collection:String, batchSize:Int, docRdd:RDD[SolrInputDocument]): Unit = { docRdd.foreachPartition(it => { val solrServer = CloudSolRServerBuilder.build(zkHost) val batch = new util.ArrayList[SolrInputDocument]() while (it.hasNext) { val inputDoc = it.next() batch.add(inputDoc) if (batch.size() >= batchSize) sendBatchToSolr(solrServer, collection, batch) } if (!batch.isEmpty()) sendBatchToSolr(solrServer, collection, batch) }) } def sendBatchToSolr( solrServer: CloudSolrServer, collection:String, batch:util.Collection[SolrInputDocument]) { val req = new UpdateRequest() req.setParam("collection", collection) req.add(batch) try { solrServer.request(req) } catch { case e:Exception => { if (shouldRetry(e)) { try { Thread.sleep(2000) } catch { case e1: InterruptedException => { Thread.interrupted() } } try { solrServer.request(req) } catch { case e1: Exception => { if (e1.isInstanceOf[RuntimeException]) { throw e1.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e1) } } } } else { if (e.isInstanceOf[RuntimeException]) { throw e.asInstanceOf[RuntimeException] } else { throw new RuntimeException(e) } } } } finally { batch.clear() } } def shouldRetry( exc:Exception): Boolean = { val rootCause = SolrException.getRootCause(exc) rootCause.isInstanceOf[ConnectException] || rootCause.isInstanceOf[SocketException] } }
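A wiring sketch for indexDStreamOfDocs (the ZooKeeper connect string, collection name and document fields are placeholders):

import com.hadooparchitecturebook.taxi360.streaming.ingestion.solr.SolRSupport
import org.apache.solr.common.SolrInputDocument
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{Seconds, StreamingContext}

object SolrIngestSketch extends App {
  val ssc = new StreamingContext(
    new SparkConf().setAppName("solr-ingest").setMaster("local[2]"), Seconds(5))

  // Turn each incoming line into a SolrInputDocument
  val docs = ssc.socketTextStream("localhost", 9999).map { line =>
    val doc = new SolrInputDocument()
    doc.addField("id", line.hashCode.toString)
    doc.addField("text", line)
    doc
  }

  SolRSupport.indexDStreamOfDocs("zk1:2181/solr", "taxi-collection", 100, docs)

  ssc.start()
  ssc.awaitTermination()
}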
Example 53
Source File: StreamingUnitTest.scala From SparkUnitTestingExamples with Apache License 2.0 | 5 votes |
package com.cloudera.sa.spark.unittest.streaming import org.apache.spark.rdd.RDD import org.apache.spark.streaming._ import org.apache.spark.streaming.dstream.DStream import org.apache.spark.{SparkConf, SparkContext} import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite} import scala.collection.mutable.Queue class StreamingUnitTest extends FunSuite with BeforeAndAfterEach with BeforeAndAfterAll{ @transient var sc: SparkContext = null @transient var ssc: StreamingContext = null override def beforeAll(): Unit = { val envMap = Map[String,String](("Xmx", "512m")) val sparkConfig = new SparkConf() sparkConfig.set("spark.broadcast.compress", "false") sparkConfig.set("spark.shuffle.compress", "false") sparkConfig.set("spark.shuffle.spill.compress", "false") sparkConfig.set("spark.io.compression.codec", "lzf") sc = new SparkContext("local[2]", "unit test", sparkConfig) ssc = new StreamingContext(sc, Milliseconds(200)) } override def afterAll(): Unit = { sc.stop() } test("Streaming word count") { val firstBatchRDD = sc.parallelize(Seq("a", "b", "c")) val secondBatchRDD = sc.parallelize(Seq("a", "e")) val thirdBatchRDD = sc.parallelize(Seq("b", "c", "e", "f")) val forthBatchRDD = sc.parallelize(Seq("a", "e")) val queue = new Queue[RDD[String]] queue.+=(firstBatchRDD) queue.+=(secondBatchRDD) queue.+=(thirdBatchRDD) queue.+=(forthBatchRDD) println(queue) val startTime = System.currentTimeMillis() val dstream = new TestableQueueInputDStream(ssc, queue, true, sc.makeRDD(Seq[String](), 1)) //ssc.queueStream(queue) dstream.checkpoint(Seconds(100)) val batchTotals:DStream[(String, Int)] = dstream.map(r => (r, 1)).reduceByKey(_ + _) val streamTotals = batchTotals.updateStateByKey( (seq:Seq[Int], opt:Option[Int]) => { if (!seq.isEmpty) { val totalCountForNew = seq.reduce(_ + _) if (opt.isEmpty) { Option(totalCountForNew) } else { Option(opt.get + totalCountForNew) } } else { opt } }) streamTotals.foreachRDD(rdd => { }) ssc.checkpoint("./tmp") ssc.start() ssc.awaitTerminationOrTimeout(2000) val endTime = System.currentTimeMillis() val rddList = streamTotals.slice(new Time(startTime), new Time(endTime)) rddList(0).collect().foreach(println) assert(rddList(0).collect().filter(r => r._1.equals("a"))(0)._2 == 1) rddList(1).collect().foreach(println) assert(rddList(1).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(2).collect().foreach(println) assert(rddList(2).collect().filter(r => r._1.equals("a"))(0)._2 == 2) rddList(3).collect().foreach(println) assert(rddList(3).collect().filter(r => r._1.equals("a"))(0)._2 == 3) } }
Example 54
Source File: DStreamFunctions.scala From spark-riak-connector with Apache License 2.0 | 5 votes |
package com.basho.riak.spark.streaming import com.basho.riak.spark.rdd.BucketDef import com.basho.riak.spark.rdd.connector.RiakConnector import com.basho.riak.spark.writer.{RiakWriter, WritableToRiak, WriteConf, WriteDataMapperFactory} import com.basho.riak.spark.writer.ts.RowDef import org.apache.spark.SparkContext import org.apache.spark.sql.Row import org.apache.spark.streaming.dstream.DStream class DStreamFunctions[T](dstream: DStream[T]) extends WritableToRiak[T] { override def sparkContext: SparkContext = dstream.context.sparkContext override def saveToRiak(bucketName: String, bucketType: String = BucketDef.DefaultBucketType, writeConf: WriteConf = WriteConf(sparkContext.getConf) )(implicit connector: RiakConnector, vwf: WriteDataMapperFactory[T, (String, Any)] ): Unit = { val writer = RiakWriter[T](connector, bucketType, bucketName, writeConf) dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, writer.write _)) } override def saveToRiakTS(bucketName: String, bucketType: String = BucketDef.DefaultBucketType, writeConf: WriteConf = WriteConf(sparkContext.getConf) )(implicit evidence: <:<[T, Row], connector: RiakConnector, vwf: WriteDataMapperFactory[T, RowDef] ): Unit = { val tsWriter = RiakWriter.tsWriter[T](connector, bucketType, bucketName, writeConf) dstream.foreachRDD(rdd => rdd.sparkContext.runJob(rdd, tsWriter.write _)) } }
Example 55
Source File: SocketTextStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.dstream.{DStream, InputDStream, ReceiverInputDStream, SocketReceiver} import org.apache.spark.streaming.{Seconds, StreamingContext} class SocketTextStream extends ConfigurableStreamingStop { override val authorEmail: String = "[email protected]" override val description: String = "Receive text data from socket" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override var batchDuration: Int = _ var hostname:String =_ var port:String=_ //var schema:String=_ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String] //schema=MapUtil.get(map,key="schema").asInstanceOf[String] val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true) //val schema = new PropertyDescriptor().name("schema").displayName("schema").description("data schema").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor //descriptor = schema :: descriptor descriptor = batchDuration :: descriptor descriptor } //TODO: change icon override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/SocketTextStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { val spark = pec.get[SparkSession](); val socketDF = spark .readStream .format("socket") .option("host",hostname) .option("port",port) .load() out.write(socketDF) } override def getDStream(ssc: StreamingContext): DStream[String] = { val dstream = ssc.socketTextStream(hostname,Integer.parseInt(port)) dstream.asInstanceOf[DStream[String]] } }
Example 56
Source File: KafkaStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.kafka.common.serialization.StringDeserializer import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.kafka010.KafkaUtils import org.apache.spark.streaming.kafka010.LocationStrategies.PreferConsistent import org.apache.spark.streaming.kafka010.ConsumerStrategies.Subscribe class KafkaStream extends ConfigurableStreamingStop{ override var batchDuration: Int = _ override val authorEmail: String = "[email protected]" override val description: String = "Read data from kafka" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var brokers:String = _ var groupId:String = _ var topics:Array[String] = _ override def setProperties(map: Map[String, Any]): Unit = { brokers=MapUtil.get(map,key="brokers").asInstanceOf[String] groupId=MapUtil.get(map,key="groupId").asInstanceOf[String] topics=MapUtil.get(map,key="topics").asInstanceOf[String].split(",").map(x => x.trim) val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val brokers = new PropertyDescriptor().name("brokers").displayName("brokers").description("kafka brokers, seperated by ','").defaultValue("").required(true) val groupId = new PropertyDescriptor().name("groupId").displayName("groupId").description("kafka consumer group").defaultValue("group").required(true) val topics = new PropertyDescriptor().name("topics").displayName("topics").description("kafka topics").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = brokers :: descriptor descriptor = groupId :: descriptor descriptor = topics :: descriptor descriptor = batchDuration :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/KafkaStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def getDStream(ssc: StreamingContext): DStream[String] = { val kafkaParams = Map[String, Object]( "bootstrap.servers" -> brokers, "key.deserializer" -> classOf[StringDeserializer], "value.deserializer" -> classOf[StringDeserializer], "group.id" -> groupId, "auto.offset.reset" -> "latest", "enable.auto.commit" -> (false:java.lang.Boolean) ) val stream = KafkaUtils.createDirectStream[String,String]( ssc, PreferConsistent, Subscribe[String, String](topics, kafkaParams) ) stream.map(record => record.key() + "," + record.value()) //stream.asInstanceOf[DStream[ConsumerRecord]] } override def initialize(ctx: ProcessContext): Unit = {} override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {} }
Example 57
Source File: TextFileStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream class TextFileStream extends ConfigurableStreamingStop{ override var batchDuration: Int = _ override val authorEmail: String = "[email protected]" override val description: String = "Get text file streaming data" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var directory:String =_ override def setProperties(map: Map[String, Any]): Unit = { directory=MapUtil.get(map,key="directory").asInstanceOf[String] val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val directory = new PropertyDescriptor().name("directory").displayName("directory").description("HDFS directory to monitor for new file. Files must be written to the monitored directory by \"moving\" them from another location within the same file system ").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = directory :: descriptor descriptor = batchDuration :: descriptor descriptor } //TODO: change icon override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/TextFileStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def getDStream(ssc: StreamingContext): DStream[String] = { val dstream = ssc.textFileStream(directory) dstream } override def initialize(ctx: ProcessContext): Unit = {} override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {} }
Example 58
Source File: SocketTextStreamByWindow.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.conf._ import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import org.apache.spark.streaming.{Seconds, StreamingContext} import org.apache.spark.streaming.dstream.DStream class SocketTextStreamByWindow extends ConfigurableStreamingStop { override val authorEmail: String = "[email protected]" override val description: String = "Receive text data from socket by window" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) override var batchDuration: Int = _ var hostname:String =_ var port:String=_ var windowDuration:Int = _ var slideDuration:Int = _ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String] windowDuration=MapUtil.get(map,key="windowDuration").asInstanceOf[String].toInt slideDuration=MapUtil.get(map,key="slideDuration").asInstanceOf[String].toInt val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("Hostname to connect to for receiving data ").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port to connect to for receiving data").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) val windowDuration = new PropertyDescriptor().name("windowDuration").displayName("windowDuration").description("the window duration, the unit is seconds").defaultValue("").required(true) val slideDuration = new PropertyDescriptor().name("slideDuration").displayName("slideDuration").description("the slide duration, the unit is seconds").defaultValue("").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor descriptor = batchDuration :: descriptor descriptor = windowDuration :: descriptor descriptor = slideDuration :: descriptor descriptor } //TODO: change icon override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/SocketTextStreamByWindow.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def initialize(ctx: ProcessContext): Unit = { } override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = { } override def getDStream(ssc: StreamingContext): DStream[String] = { val dstream = ssc.socketTextStream(hostname,Integer.parseInt(port)) dstream.window(Seconds(windowDuration),Seconds(slideDuration)) //dstream.reduceByWindow(_ + _,Seconds(windowDuration),Seconds(slideDuration)) } }
Example 59
Source File: FlumeStream.scala From piflow with BSD 2-Clause "Simplified" License | 5 votes |
package cn.piflow.bundle.streaming import cn.piflow.{JobContext, JobInputStream, JobOutputStream, ProcessContext} import cn.piflow.conf.{ConfigurableStreamingStop, Port, StopGroup} import cn.piflow.conf.bean.PropertyDescriptor import cn.piflow.conf.util.{ImageUtil, MapUtil} import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.flume._ class FlumeStream extends ConfigurableStreamingStop{ override var batchDuration: Int = _ override val authorEmail: String = "[email protected]" override val description: String = "Get data from flume" override val inportList: List[String] = List(Port.DefaultPort) override val outportList: List[String] = List(Port.DefaultPort) var hostname:String =_ var port:Int=_ override def setProperties(map: Map[String, Any]): Unit = { hostname=MapUtil.get(map,key="hostname").asInstanceOf[String] port=MapUtil.get(map,key="port").asInstanceOf[String].toInt val timing = MapUtil.get(map,key="batchDuration") batchDuration=if(timing == None) new Integer(1) else timing.asInstanceOf[String].toInt } override def getPropertyDescriptor(): List[PropertyDescriptor] = { var descriptor : List[PropertyDescriptor] = List() val hostname = new PropertyDescriptor().name("hostname").displayName("hostname").description("hostname of the slave machine to which the flume data will be sent, the hostName must be one of the cluster worker node").defaultValue("").required(true) val port = new PropertyDescriptor().name("port").displayName("port").description("Port of the slave machine to which the flume data will be sent, the port should be greater than 10000").defaultValue("").required(true) val batchDuration = new PropertyDescriptor().name("batchDuration").displayName("batchDuration").description("the streaming batch duration").defaultValue("1").required(true) descriptor = hostname :: descriptor descriptor = port :: descriptor descriptor = batchDuration :: descriptor descriptor } override def getIcon(): Array[Byte] = { ImageUtil.getImage("icon/streaming/FlumeStream.png") } override def getGroup(): List[String] = { List(StopGroup.StreamingGroup) } override def getDStream(ssc: StreamingContext): DStream[String] = { val flumeStream = FlumeUtils.createStream(ssc, hostname, port) flumeStream.map(e => new String(e.event.getBody.array(), "UTF-8")) } override def initialize(ctx: ProcessContext): Unit = {} override def perform(in: JobInputStream, out: JobOutputStream, pec: JobContext): Unit = {} }
Example 60
Source File: TestOutputStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming import java.io.{IOException, ObjectInputStream} import java.util.concurrent.ConcurrentLinkedQueue import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.dstream.{DStream, ForEachDStream} import org.apache.spark.util.Utils class TestOutputStream[T: ClassTag](parent: DStream[T], val output: ConcurrentLinkedQueue[Seq[T]] = new ConcurrentLinkedQueue[Seq[T]]()) extends ForEachDStream[T](parent, (rdd: RDD[T], t: Time) => { val collected = rdd.collect() output.add(collected) }, false) { // This is to clear the output buffer every time it is read from a checkpoint @throws(classOf[IOException]) private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { ois.defaultReadObject() output.clear() } }
Example 61
Source File: CubeWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer import java.sql.{Date, Timestamp} import akka.event.slf4j.SLF4JLogging import com.stratio.sparta.driver.factory.SparkContextFactory import com.stratio.sparta.driver.step.Cube import com.stratio.sparta.sdk.pipeline.aggregation.cube.{DimensionValue, DimensionValuesTime, MeasuresValues} import com.stratio.sparta.sdk.pipeline.output.Output import com.stratio.sparta.sdk.pipeline.schema.TypeOp import org.apache.spark.sql._ import org.apache.spark.streaming.dstream.DStream object CubeWriterHelper extends SLF4JLogging { def writeCube(cube: Cube, outputs: Seq[Output], stream: DStream[(DimensionValuesTime, MeasuresValues)]): Unit = { stream.map { case (dimensionValuesTime, measuresValues) => toRow(cube, dimensionValuesTime, measuresValues) }.foreachRDD(rdd => { if (!rdd.isEmpty()) { val sparkSession = SparkContextFactory.sparkSessionInstance val cubeDf = sparkSession.createDataFrame(rdd, cube.schema) val extraOptions = Map(Output.TableNameKey -> cube.name) val cubeAutoCalculatedFieldsDf = WriterHelper.write(cubeDf, cube.writerOptions, extraOptions, outputs) TriggerWriterHelper.writeTriggers(cubeAutoCalculatedFieldsDf, cube.triggers, cube.name, outputs) } else log.debug("Empty event received") }) } private[driver] def toRow(cube: Cube, dimensionValuesT: DimensionValuesTime, measures: MeasuresValues): Row = { val measuresSorted = measuresValuesSorted(measures.values) val rowValues = dimensionValuesT.timeConfig match { case None => val dimensionValues = dimensionsValuesSorted(dimensionValuesT.dimensionValues) dimensionValues ++ measuresSorted case Some(timeConfig) => val timeValue = Seq(timeFromDateType(timeConfig.eventTime, cube.dateType)) val dimFilteredByTime = filterDimensionsByTime(dimensionValuesT.dimensionValues, timeConfig.timeDimension) val dimensionValues = dimensionsValuesSorted(dimFilteredByTime) ++ timeValue val measuresValuesWithTime = measuresSorted dimensionValues ++ measuresValuesWithTime } Row.fromSeq(rowValues) } private[driver] def dimensionsValuesSorted(dimensionValues: Seq[DimensionValue]): Seq[Any] = dimensionValues.sorted.map(dimVal => dimVal.value) private[driver] def measuresValuesSorted(measures: Map[String, Option[Any]]): Seq[Any] = measures.toSeq.sortWith(_._1 < _._1).map(measure => measure._2.getOrElse(null)) private[driver] def filterDimensionsByTime(dimensionValues: Seq[DimensionValue], timeDimension: String): Seq[DimensionValue] = dimensionValues.filter(dimensionValue => dimensionValue.dimension.name != timeDimension) private[driver] def timeFromDateType(time: Long, dateType: TypeOp.Value): Any = { dateType match { case TypeOp.Date | TypeOp.DateTime => new Date(time) case TypeOp.Long => time case TypeOp.Timestamp => new Timestamp(time) case _ => time.toString } } }
Example 62
Source File: TwitterJsonInput.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.input.twitter import java.io.{Serializable => JSerializable} import com.google.gson.Gson import com.stratio.sparta.sdk.pipeline.input.Input import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._ import org.apache.spark.sql.Row import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream import org.apache.spark.streaming.twitter.TwitterUtils import twitter4j.TwitterFactory import twitter4j.conf.ConfigurationBuilder import scala.util.{Failure, Success, Try} class TwitterJsonInput(properties: Map[String, JSerializable]) extends Input(properties) { System.setProperty("twitter4j.oauth.consumerKey", properties.getString("consumerKey")) System.setProperty("twitter4j.oauth.consumerSecret", properties.getString("consumerSecret")) System.setProperty("twitter4j.oauth.accessToken", properties.getString("accessToken")) System.setProperty("twitter4j.oauth.accessTokenSecret", properties.getString("accessTokenSecret")) val cb = new ConfigurationBuilder() val tf = new TwitterFactory(cb.build()) val twitterApi = tf.getInstance() val trends = twitterApi.getPlaceTrends(1).getTrends.map(trend => trend.getName) val terms: Option[Seq[String]] = Try(properties.getString("termsOfSearch")) match { case Success("") => None case Success(t: String) => Some(t.split(",").toSeq) case Failure(_) => None } val search = terms.getOrElse(trends.toSeq) def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = { TwitterUtils.createStream(ssc, None, search, storageLevel(sparkStorageLevel)) .map(stream => { val gson = new Gson() Row(gson.toJson(stream)) } ) } }
Example 63
Source File: SocketInput.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.plugin.input.socket import java.io.{Serializable => JSerializable} import com.stratio.sparta.sdk.pipeline.input.Input import com.stratio.sparta.sdk.properties.ValidatingPropertyMap._ import org.apache.spark.sql.Row import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream class SocketInput(properties: Map[String, JSerializable]) extends Input(properties) { private val hostname : String = properties.getString("hostname") private val port : Int = properties.getInt("port") def initStream(ssc: StreamingContext, sparkStorageLevel: String): DStream[Row] = { ssc.socketTextStream( hostname, port, storageLevel(sparkStorageLevel)) .map(data => Row(data)) } }
Example 64
Source File: InputStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage import com.stratio.sparta.sdk.pipeline.input.Input import com.stratio.sparta.serving.core.constants.AppConstant import com.stratio.sparta.serving.core.models.policy.PhaseEnum import com.stratio.sparta.serving.core.utils.ReflectionUtils import org.apache.spark.sql.Row import org.apache.spark.streaming.StreamingContext import org.apache.spark.streaming.dstream.DStream trait InputStage extends BaseStage { this: ErrorPersistor => def inputStreamStage(ssc: StreamingContext, input: Input): DStream[Row] = { val errorMessage = s"Something gone wrong creating the input stream for: ${policy.input.get.name}." val okMessage = s"Stream for Input: ${policy.input.get.name} created correctly." generalTransformation(PhaseEnum.InputStream, okMessage, errorMessage) { require(policy.storageLevel.isDefined, "You need to define the storage level") input.initStream(ssc, policy.storageLevel.get) } } def createInput(ssc: StreamingContext, refUtils: ReflectionUtils): Input = { val errorMessage = s"Something gone wrong creating the input: ${policy.input.get.name}. Please re-check the policy." val okMessage = s"Input: ${policy.input.get.name} created correctly." generalTransformation(PhaseEnum.Input, okMessage, errorMessage) { require(policy.input.isDefined, "You need at least one input in your policy") val classType = policy.input.get.configuration.getOrElse(AppConstant.CustomTypeKey, policy.input.get.`type`).toString refUtils.tryToInstantiate[Input](classType + Input.ClassSuffix, (c) => refUtils.instantiateParameterizable[Input](c, policy.input.get.configuration)) } } }
Example 65
Source File: RawDataStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage import com.stratio.sparta.driver.step.RawData import com.stratio.sparta.driver.writer.{RawDataWriterHelper, WriterOptions} import com.stratio.sparta.sdk.pipeline.output.{Output, SaveModeEnum} import com.stratio.sparta.serving.core.models.policy.{PhaseEnum, RawDataModel} import org.apache.spark.sql.Row import org.apache.spark.streaming.dstream.DStream trait RawDataStage extends BaseStage { this: ErrorPersistor => def saveRawData(rawModel: Option[RawDataModel], input: DStream[Row], outputs: Seq[Output]): Unit = if (rawModel.isDefined) { val rawData = rawDataStage() RawDataWriterHelper.writeRawData(rawData, outputs, input) } private[driver] def rawDataStage(): RawData = { val errorMessage = s"Something gone wrong saving the raw data. Please re-check the policy." val okMessage = s"RawData: created correctly." generalTransformation(PhaseEnum.RawData, okMessage, errorMessage) { require(policy.rawData.isDefined, "You need a raw data stage defined in your policy") require(policy.rawData.get.writer.tableName.isDefined, "You need a table name defined in your raw data stage") createRawData(policy.rawData.get) } } private[driver] def createRawData(rawDataModel: RawDataModel): RawData = { val okMessage = s"RawData created correctly." val errorMessage = s"Something gone wrong creating the RawData. Please re-check the policy." generalTransformation(PhaseEnum.RawData, okMessage, errorMessage) { RawData( rawDataModel.dataField, rawDataModel.timeField, WriterOptions( rawDataModel.writer.outputs, SaveModeEnum.Append, rawDataModel.writer.tableName, getAutoCalculatedFields(rawDataModel.writer.autoCalculatedFields), rawDataModel.writer.partitionBy, rawDataModel.writer.primaryKey ), rawDataModel.configuration) } } }
Example 66
Source File: ParserStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage import java.io.Serializable import akka.event.slf4j.SLF4JLogging import com.stratio.sparta.driver.writer.{TransformationsWriterHelper, WriterOptions} import com.stratio.sparta.sdk.pipeline.output.Output import com.stratio.sparta.sdk.pipeline.transformation.Parser import com.stratio.sparta.serving.core.constants.AppConstant import com.stratio.sparta.serving.core.models.policy.{PhaseEnum, TransformationModel} import com.stratio.sparta.serving.core.utils.ReflectionUtils import org.apache.spark.sql.Row import org.apache.spark.sql.types.StructType import org.apache.spark.streaming.dstream.DStream import scala.util.{Failure, Success, Try} trait ParserStage extends BaseStage { this: ErrorPersistor => def parserStage(refUtils: ReflectionUtils, schemas: Map[String, StructType]): (Seq[Parser], Option[WriterOptions]) = (policy.transformations.get.transformationsPipe.map(parser => createParser(parser, refUtils, schemas)), policy.transformations.get.writer.map(writer => WriterOptions( writer.outputs, writer.saveMode, writer.tableName, getAutoCalculatedFields(writer.autoCalculatedFields), writer.partitionBy, writer.primaryKey ))) private[driver] def createParser(model: TransformationModel, refUtils: ReflectionUtils, schemas: Map[String, StructType]): Parser = { val classType = model.configuration.getOrElse(AppConstant.CustomTypeKey, model.`type`).toString val errorMessage = s"Something gone wrong creating the parser: $classType. Please re-check the policy." val okMessage = s"Parser: $classType created correctly." generalTransformation(PhaseEnum.Parser, okMessage, errorMessage) { val outputFieldsNames = model.outputFieldsTransformed.map(_.name) val schema = schemas.getOrElse(model.order.toString, throw new Exception("Can not find transformation schema")) refUtils.tryToInstantiate[Parser](classType + Parser.ClassSuffix, (c) => c.getDeclaredConstructor( classOf[Integer], classOf[Option[String]], classOf[Seq[String]], classOf[StructType], classOf[Map[String, Serializable]]) .newInstance(model.order, model.inputField, outputFieldsNames, schema, model.configuration) .asInstanceOf[Parser]) } } } object ParserStage extends SLF4JLogging { def executeParsers(row: Row, parsers: Seq[Parser]): Seq[Row] = if (parsers.size == 1) parseEvent(row, parsers.head) else parseEvent(row, parsers.head).flatMap(eventParsed => executeParsers(eventParsed, parsers.drop(1))) def parseEvent(row: Row, parser: Parser): Seq[Row] = Try { parser.parse(row) } match { case Success(eventParsed) => eventParsed case Failure(exception) => val error = s"Failure[Parser]: ${row.mkString(",")} | Message: ${exception.getLocalizedMessage}" + s" | Parser: ${parser.getClass.getSimpleName}" log.error(error, exception) Seq.empty[Row] } def applyParsers(input: DStream[Row], parsers: Seq[Parser], schema: StructType, outputs: Seq[Output], writerOptions: Option[WriterOptions]): DStream[Row] = { val transformedData = if (parsers.isEmpty) input else input.flatMap(row => executeParsers(row, parsers)) writerOptions.foreach(options => TransformationsWriterHelper.writeTransformations(transformedData, schema, outputs, options)) transformedData } }
Example 67
Source File: TriggerStage.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.stage

import com.stratio.sparta.driver.step.Trigger
import com.stratio.sparta.driver.writer.{TriggerWriterHelper, WriterOptions}
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import com.stratio.sparta.serving.core.models.policy.PhaseEnum
import com.stratio.sparta.serving.core.models.policy.trigger.TriggerModel
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.Milliseconds
import org.apache.spark.streaming.dstream.DStream

trait TriggerStage extends BaseStage {
  this: ErrorPersistor =>

  def triggersStreamStage(initSchema: StructType,
                          inputData: DStream[Row],
                          outputs: Seq[Output],
                          window: Long): Unit = {
    val triggersStage = triggerStage(policy.streamTriggers)
    val errorMessage = s"Something went wrong executing the triggers stream for: ${policy.input.get.name}."
    val okMessage = s"Triggers Stream executed correctly."

    generalTransformation(PhaseEnum.TriggerStream, okMessage, errorMessage) {
      triggersStage
        .groupBy(trigger => (trigger.overLast, trigger.computeEvery))
        .foreach { case ((overLast, computeEvery), triggers) =>
          val groupedData = (overLast, computeEvery) match {
            case (None, None) => inputData
            case (Some(overL), Some(computeE))
              if (AggregationTime.parseValueToMilliSeconds(overL) == window) &&
                (AggregationTime.parseValueToMilliSeconds(computeE) == window) => inputData
            case _ => inputData.window(
              Milliseconds(overLast.fold(window) { over => AggregationTime.parseValueToMilliSeconds(over) }),
              Milliseconds(computeEvery.fold(window) { computeEvery =>
                AggregationTime.parseValueToMilliSeconds(computeEvery)
              }))
          }
          TriggerWriterHelper.writeStream(triggers, streamTemporalTable(policy.streamTemporalTable),
            outputs, groupedData, initSchema)
        }
    }
  }

  def triggerStage(triggers: Seq[TriggerModel]): Seq[Trigger] =
    triggers.map(trigger => createTrigger(trigger))

  private[driver] def createTrigger(trigger: TriggerModel): Trigger = {
    val okMessage = s"Trigger: ${trigger.name} created correctly."
    val errorMessage = s"Something went wrong creating the trigger: ${trigger.name}. Please re-check the policy."
    generalTransformation(PhaseEnum.Trigger, okMessage, errorMessage) {
      Trigger(
        trigger.name,
        trigger.sql,
        trigger.overLast,
        trigger.computeEvery,
        WriterOptions(
          trigger.writer.outputs,
          trigger.writer.saveMode,
          trigger.writer.tableName,
          getAutoCalculatedFields(trigger.writer.autoCalculatedFields),
          trigger.writer.primaryKey,
          trigger.writer.partitionBy
        ),
        trigger.configuration)
    }
  }

  private[driver] def streamTemporalTable(policyTableName: Option[String]): String =
    policyTableName.flatMap(tableName => if (tableName.nonEmpty) Some(tableName) else None)
      .getOrElse("stream")
}
Example 68
Source File: CubeOperations.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.step

import akka.event.slf4j.SLF4JLogging
import com.stratio.sparta.sdk.pipeline.aggregation.cube.{DimensionValue, DimensionValuesTime, InputFields, TimeConfig}
import com.stratio.sparta.sdk.pipeline.schema.TypeOp
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.streaming.dstream.DStream
import org.joda.time.DateTime

import scala.util.{Failure, Success, Try}

// Note: this listing is an excerpt; the enclosing class declaration is omitted,
// and `cube`, `UpdatedValues`, and `log` are members of that class.
def extractDimensionsAggregations(inputStream: DStream[Row]): DStream[(DimensionValuesTime, InputFields)] = {
  inputStream.mapPartitions(rows => rows.flatMap(row =>
    Try {
      val dimensionValues = for {
        dimension <- cube.dimensions
        value = row.get(cube.initSchema.fieldIndex(dimension.field))
        (precision, dimValue) = dimension.dimensionType.precisionValue(dimension.precisionKey, value)
      } yield DimensionValue(dimension, TypeOp.transformValueByTypeOp(precision.typeOp, dimValue))

      cube.expiringDataConfig match {
        case None =>
          (DimensionValuesTime(cube.name, dimensionValues), InputFields(row, UpdatedValues))
        case Some(expiringDataConfig) =>
          val eventTime = extractEventTime(dimensionValues)
          val timeDimension = expiringDataConfig.timeDimension
          (DimensionValuesTime(cube.name, dimensionValues, Option(TimeConfig(eventTime, timeDimension))),
            InputFields(row, UpdatedValues))
      }
    } match {
      case Success(dimensionValuesTime) => Some(dimensionValuesTime)
      case Failure(exception) =>
        val error = s"Failure[Aggregations]: ${row.toString} | ${exception.getLocalizedMessage}"
        log.error(error, exception)
        None
    }), true)
}

private[driver] def extractEventTime(dimensionValues: Seq[DimensionValue]) = {
  val timeDimension = cube.expiringDataConfig.get.timeDimension
  val dimensionsDates =
    dimensionValues.filter(dimensionValue => dimensionValue.dimension.name == timeDimension)

  if (dimensionsDates.isEmpty) getDate
  else AggregationTime.getMillisFromSerializable(dimensionsDates.head.value)
}

private[driver] def getDate: Long = {
  val checkpointGranularity = cube.expiringDataConfig.get.granularity

  AggregationTime.truncateDate(DateTime.now(), checkpointGranularity)
}
Example 69
Source File: TriggerWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import akka.event.slf4j.SLF4JLogging
import org.apache.spark.sql.{DataFrame, Row}
import com.stratio.sparta.driver.exception.DriverException
import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.schema.SchemaHelper
import com.stratio.sparta.driver.step.Trigger
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

import scala.util.{Failure, Success, Try}

object TriggerWriterHelper extends SLF4JLogging {

  def writeStream(triggers: Seq[Trigger],
                  inputTableName: String,
                  outputs: Seq[Output],
                  streamData: DStream[Row],
                  schema: StructType): Unit = {
    streamData.foreachRDD(rdd => {
      val parsedDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, schema)

      writeTriggers(parsedDataFrame, triggers, inputTableName, outputs)
    })
  }

  //scalastyle:off
  def writeTriggers(dataFrame: DataFrame,
                    triggers: Seq[Trigger],
                    inputTableName: String,
                    outputs: Seq[Output]): Unit = {
    val sparkSession = dataFrame.sparkSession

    if (triggers.nonEmpty && isCorrectTableName(inputTableName)) {
      if (!sparkSession.catalog.tableExists(inputTableName)) {
        dataFrame.createOrReplaceTempView(inputTableName)
        log.debug(s"Registering temporal table in Spark with name: $inputTableName")
      }
      val tempTables = triggers.flatMap(trigger => {
        log.debug(s"Executing query in Spark: ${trigger.sql}")

        val queryDf = Try(sparkSession.sql(trigger.sql)) match {
          case Success(sqlResult) => sqlResult
          case Failure(exception: org.apache.spark.sql.AnalysisException) =>
            log.warn(s"Warning running analysis in Catalyst in the query ${trigger.sql} in trigger ${trigger.name}",
              exception.message)
            throw DriverException(exception.getMessage, exception)
          case Failure(exception) =>
            log.warn(s"Warning running sql in the query ${trigger.sql} in trigger ${trigger.name}",
              exception.getMessage)
            throw DriverException(exception.getMessage, exception)
        }
        val extraOptions = Map(Output.TableNameKey -> trigger.name)

        if (!queryDf.rdd.isEmpty()) {
          val autoCalculatedFieldsDf = WriterHelper.write(queryDf, trigger.writerOptions, extraOptions, outputs)

          if (isCorrectTableName(trigger.name) && !sparkSession.catalog.tableExists(trigger.name)) {
            autoCalculatedFieldsDf.createOrReplaceTempView(trigger.name)
            log.debug(s"Registering temporal table in Spark with name: ${trigger.name}")
          } else
            log.warn(s"The trigger ${trigger.name} has an incorrect name; it cannot be registered as a temporal table")
          Option(trigger.name)
        } else None
      })

      tempTables.foreach(tableName =>
        if (isCorrectTableName(tableName) && sparkSession.catalog.tableExists(tableName)) {
          sparkSession.catalog.dropTempView(tableName)
          log.debug(s"Dropping temporal table in Spark with name: $tableName")
        } else log.debug(s"Impossible to drop table in Spark with name: $tableName"))

      if (isCorrectTableName(inputTableName) && sparkSession.catalog.tableExists(inputTableName)) {
        sparkSession.catalog.dropTempView(inputTableName)
        log.debug(s"Dropping temporal table in Spark with name: $inputTableName")
      } else log.debug(s"Impossible to drop table in Spark: $inputTableName")
    } else {
      if (triggers.nonEmpty && !isCorrectTableName(inputTableName))
        log.warn(s"Incorrect table name $inputTableName: the triggers may have errors and may not have been executed")
    }
  }
  //scalastyle:on

  private[driver] def isCorrectTableName(tableName: String): Boolean =
    tableName.nonEmpty && tableName != "" &&
      tableName.toLowerCase != "select" &&
      tableName.toLowerCase != "project" &&
      !tableName.contains("-") && !tableName.contains("*") && !tableName.contains("/")
}
Example 70
Source File: RawDataWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.driver.step.RawData
import com.stratio.sparta.sdk.pipeline.output.Output
import com.stratio.sparta.sdk.utils.AggregationTime
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.{StringType, StructField, StructType, TimestampType}
import org.apache.spark.streaming.dstream.DStream

object RawDataWriterHelper {

  def writeRawData(rawData: RawData, outputs: Seq[Output], input: DStream[Row]): Unit = {
    val RawSchema = StructType(Seq(
      StructField(rawData.timeField, TimestampType, nullable = false),
      StructField(rawData.dataField, StringType, nullable = true)))
    val eventTime = AggregationTime.millisToTimeStamp(System.currentTimeMillis())

    input.map(row => Row.merge(Row(eventTime), row))
      .foreachRDD(rdd => {
        if (!rdd.isEmpty()) {
          val rawDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, RawSchema)

          WriterHelper.write(rawDataFrame, rawData.writerOptions, Map.empty[String, String], outputs)
        }
      })
  }
}
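A hedged usage sketch for the helper above; rawDataStep, outputs, and rowStream are placeholders assumed to be built elsewhere in the driver, not part of this file. The helper prepends an ingestion timestamp to every row and writes each non-empty batch with the two-field schema defined above.

  // One call wires the raw-data step into the pipeline; writing happens per batch via foreachRDD.
  RawDataWriterHelper.writeRawData(rawDataStep, outputs, rowStream)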
Example 71
Source File: StreamingTestMethod.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.mllib.stat.test

import java.io.Serializable

import scala.language.implicitConversions
import scala.math.pow

import com.twitter.chill.MeatLocker
import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
import org.apache.commons.math3.stat.inference.TTest

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.util.StatCounter

// Note: this listing is an excerpt; the StreamingTestMethod trait and the
// WelchTTest/StudentTTest implementations referenced below are omitted.
private[stat] object StreamingTestMethod {
  // Note: after new `StreamingTestMethod`s are implemented, please update this map.
  private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map(
    "welch" -> WelchTTest,
    "student" -> StudentTTest)

  def getTestMethodFromName(method: String): StreamingTestMethod =
    TEST_NAME_TO_OBJECT.get(method) match {
      case Some(test) => test
      case None =>
        throw new IllegalArgumentException(
          "Unrecognized method name. Supported streaming test methods: "
            + TEST_NAME_TO_OBJECT.keys.mkString(", "))
    }
}
Example 72
Source File: TransformationsWriterHelper.scala From sparta with Apache License 2.0 | 5 votes |
package com.stratio.sparta.driver.writer

import com.stratio.sparta.driver.factory.SparkContextFactory
import com.stratio.sparta.sdk.pipeline.output.Output
import org.apache.spark.sql.Row
import org.apache.spark.sql.types.StructType
import org.apache.spark.streaming.dstream.DStream

object TransformationsWriterHelper {

  def writeTransformations(input: DStream[Row],
                           inputSchema: StructType,
                           outputs: Seq[Output],
                           writerOptions: WriterOptions): Unit = {
    input.foreachRDD(rdd =>
      if (!rdd.isEmpty()) {
        val transformationsDataFrame = SparkContextFactory.sparkSessionInstance.createDataFrame(rdd, inputSchema)

        WriterHelper.write(transformationsDataFrame, writerOptions, Map.empty[String, String], outputs)
      }
    )
  }
}
Example 73
Source File: L3-DStreamAggregation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditAggregationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditAggregationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val recCount = comments.count()

    val recCountValue = comments.countByValue()

    val totalWords = comments.map(rec => ((parse(rec) \ "body").values.toString))
      .flatMap(body => body.split(" "))
      .map(word => 1)
      .reduce(_ + _)

    ssc.start()
    ssc.awaitTermination()
  }
}
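Note that recCount, recCountValue, and totalWords are lazy DStreams, and Spark Streaming requires at least one output operation before the job has anything to execute; the example registers none. A minimal addition (a sketch, not part of the original listing) placed before ssc.start() would surface the results:

    recCount.print()       // number of records in each batch
    recCountValue.print()  // (record, occurrences) pairs per batch
    totalWords.print()     // total word count of the "body" fields per batch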
Example 74
Source File: L3-DStreamWindowAndAction.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditWindowAndActionApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditWindowAndActionApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)

    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val distinctSubreddits = comments.map(rec => ((parse(rec)) \ "subreddit").values.toString)
    val windowedRecs = distinctSubreddits.window(Seconds(5), Seconds(5))
    val windowedCounts = windowedRecs.countByValue()

    windowedCounts.print(10)
    windowedCounts.saveAsObjectFiles("subreddit", "obj")
    windowedCounts.saveAsTextFiles("subreddit", "txt")

    globalCount.saveAsHadoopFiles("subreddit", "hadoop",
      classOf[IntWritable], classOf[Text], classOf[TextOutputFormat[IntWritable, Text]])
    globalCount.saveAsNewAPIHadoopFiles("subreddit", "newhadoop",
      classOf[IntWritable], classOf[Text], classOf[NewTextOutputFormat[IntWritable, Text]])

    comments.foreachRDD(rdd => {
      LOG.info("RDD: %s, Count: %d".format(rdd.id, rdd.count()))
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
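The window(Seconds(5), Seconds(5)) followed by countByValue() pair can also be expressed with a single windowed operator. The lines below are an alternative sketch, not part of the original example; they rely on the checkpoint directory the example already configures.

    // Equivalent windowed count in one call: 5-second window, sliding every 5 seconds.
    val windowedCountsAlt = distinctSubreddits.countByValueAndWindow(Seconds(5), Seconds(5))
    windowedCountsAlt.print(10)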
Example 75
Source File: L3-DStreamVariation.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditVariationApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditVariationApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val merged = comments.union(comments)

    val repartitionedComments = comments.repartition(4)

    val rddMin = comments.glom().map(arr =>
      arr.minBy(rec => ((parse(rec) \ "created_utc").values.toString.toInt)))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 76
Source File: L3-DStreamKeyValue.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date
import org.apache.spark.HashPartitioner

object RedditKeyValueApp {
  def main(args: Array[String]) {
    if (args.length != 3) {
      System.err.println(
        "Usage: RedditKeyValueApp <appname> <input_path> <input_path_popular>")
      System.exit(1)
    }
    val Seq(appName, inputPath, inputPathPopular) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)
    val popular = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPathPopular, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val topAuthors = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .groupByKey()
      .map(r => (r._2.sum, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthors2 = comments.map(rec => ((parse(rec) \ "author").values.toString, 1))
      .reduceByKey(_ + _)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val topAuthorsByAvgContent = comments.map(rec => ((parse(rec) \ "author").values.toString,
      (parse(rec) \ "body").values.toString.split(" ").length))
      .combineByKey(
        (v) => (v, 1),
        (accValue: (Int, Int), v) => (accValue._1 + v, accValue._2 + 1),
        (accCombine1: (Int, Int), accCombine2: (Int, Int)) => (accCombine1._1 + accCombine2._1,
          accCombine1._2 + accCombine2._2),
        new HashPartitioner(ssc.sparkContext.defaultParallelism))
      .map({ case (k, v) => (k, v._1 / v._2.toFloat) })
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    val keyedBySubreddit = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubreddit2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustry = keyedBySubreddit.join(keyedBySubreddit2)

    val keyedBySubredditCo = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, rec))
    val keyedBySubredditCo2 = popular.map(rec => ({
      val t = rec.split(",")
      (t(1).split("/")(4), t(0))
    }))
    val commentsWithIndustryCo = keyedBySubreddit.cogroup(keyedBySubreddit2)

    val checkpointPath = "/tmp"
    ssc.checkpoint(checkpointPath)

    val updateFunc = (values: Seq[Int], state: Option[Int]) => {
      val currentCount = values.sum
      val previousCount = state.getOrElse(0)
      Some(currentCount + previousCount)
    }

    val keyedBySubredditState = comments.map(rec => (((parse(rec)) \ "subreddit").values.toString, 1))
    val globalCount = keyedBySubredditState.updateStateByKey(updateFunc)
      .map(r => (r._2, r._1))
      .transform(rdd => rdd.sortByKey(ascending = false))

    ssc.start()
    ssc.awaitTermination()
  }
}
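join and cogroup pair up only the RDDs produced in the same micro-batch, so a comment and its matching entry in the popular feed must land in the same one-second batch to join. One way to widen that match window, sketched below as an assumption rather than part of the original example, is to window one side before joining:

    // Join comments seen in the last 60 seconds against the current batch of the popular feed.
    val windowedJoin = keyedBySubreddit.window(Seconds(60)).join(keyedBySubreddit2)
    windowedJoin.print(5)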
Example 77
Source File: L3-DStreamMapping.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkContext
import org.apache.spark.SparkConf
import org.apache.spark.streaming.{ Milliseconds, Seconds, StreamingContext }
import org.apache.hadoop.io.{ Text, LongWritable, IntWritable }
import org.apache.hadoop.fs.Path
import org.apache.hadoop.mapreduce.lib.input.TextInputFormat
import org.apache.spark.streaming.dstream.DStream
import org.apache.hadoop.mapred.TextOutputFormat
import org.apache.hadoop.mapreduce.lib.output.{ TextOutputFormat => NewTextOutputFormat }
import org.apache.spark.streaming.dstream.PairDStreamFunctions
import org.apache.log4j.LogManager
import org.json4s._
import org.json4s.native.JsonMethods._
import java.text.SimpleDateFormat
import java.util.Date

object RedditMappingApp {
  def main(args: Array[String]) {
    if (args.length != 2) {
      System.err.println(
        "Usage: RedditMappingApp <appname> <input_path>")
      System.exit(1)
    }
    val Seq(appName, inputPath) = args.toSeq
    val LOG = LogManager.getLogger(this.getClass)

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)

    val ssc = new StreamingContext(conf, Seconds(1))
    LOG.info("Started at %d".format(ssc.sparkContext.startTime))

    val comments = ssc.fileStream[LongWritable, Text, TextInputFormat](inputPath, (f: Path) => true,
      newFilesOnly = false).map(pair => pair._2.toString)

    val sdf = new SimpleDateFormat("yyyy-MM-dd")
    val tsKey = "created_utc"
    val secs = 1000L

    val keyedByDay = comments.map(rec => {
      val ts = (parse(rec) \ tsKey).values
      (sdf.format(new Date(ts.toString.toLong * secs)), rec)
    })

    val keyedByDayPart = comments.mapPartitions(iter => {
      var ret = List[(String, String)]()
      while (iter.hasNext) {
        val rec = iter.next
        val ts = (parse(rec) \ tsKey).values
        ret.::=((sdf.format(new Date(ts.toString.toLong * secs)), rec))
      }
      ret.iterator
    })

    val wordTokens = comments.map(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val wordTokensFlat = comments.flatMap(rec => {
      ((parse(rec) \ "body")).values.toString.split(" ")
    })

    val filterSubreddit = comments.filter(rec =>
      (parse(rec) \ "subreddit").values.toString.equals("AskReddit"))

    val sortedByAuthor = comments.transform(rdd =>
      (rdd.sortBy(rec => (parse(rec) \ "author").values.toString)))

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 78
Source File: HttpInputDStream.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import java.util.Timer
import java.util.TimerTask

import scala.reflect.ClassTag

import org.apache.http.client.methods.HttpGet
import org.apache.http.impl.client.CloseableHttpClient
import org.apache.http.impl.client.HttpClients
import org.apache.http.util.EntityUtils
import org.apache.spark.Logging
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.api.java.JavaDStream
import org.apache.spark.streaming.api.java.JavaDStream.fromDStream
import org.apache.spark.streaming.api.java.JavaStreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver

class HttpInputDStream(
    @transient ssc_ : StreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends ReceiverInputDStream[String](ssc_) with Logging {

  def getReceiver(): Receiver[String] = {
    new HttpReceiver(storageLevel, url, interval)
  }
}

class HttpReceiver(
    storageLevel: StorageLevel,
    url: String,
    interval: Long) extends Receiver[String](storageLevel) with Logging {

  var httpClient: CloseableHttpClient = _
  var trigger: Timer = _

  def onStop() {
    httpClient.close()
    logInfo("Disconnected from Http Server")
  }

  def onStart() {
    httpClient = HttpClients.createDefault()
    trigger = new Timer()
    trigger.scheduleAtFixedRate(new TimerTask {
      def run() = doGet()
    }, 0, interval * 1000)
    logInfo("Http Receiver initiated")
  }

  def doGet() {
    logInfo("Fetching data from Http source")
    val response = httpClient.execute(new HttpGet(url))
    try {
      val content = EntityUtils.toString(response.getEntity())
      store(content)
    } catch {
      case e: Exception => restart("Error! Problems while connecting", e)
    } finally {
      response.close()
    }
  }
}

object HttpUtils {
  def createStream(
    ssc: StreamingContext,
    storageLevel: StorageLevel = StorageLevel.MEMORY_AND_DISK_SER_2,
    url: String,
    interval: Long): DStream[String] = {
    new HttpInputDStream(ssc, storageLevel, url, interval)
  }

  def createStream(
    jssc: JavaStreamingContext,
    storageLevel: StorageLevel,
    url: String,
    interval: Long): JavaDStream[String] = {
    implicitly[ClassTag[AnyRef]].asInstanceOf[ClassTag[String]]
    createStream(jssc.ssc, storageLevel, url, interval)
  }
}
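A hedged usage sketch for the helper object above; the URL and polling interval are placeholders, and ssc is assumed to be an existing StreamingContext rather than anything defined in the original file.

    // Poll the placeholder URL every 10 seconds and print the first lines of each batch.
    val feed: DStream[String] =
      HttpUtils.createStream(ssc, StorageLevel.MEMORY_AND_DISK_SER_2, "http://localhost:8080/feed", 10L)
    feed.print()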
Example 79
Source File: L7-2-3Tachyon.scala From prosparkstreaming with Apache License 2.0 | 5 votes |
package org.apress.prospark

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD.rddToOrderedRDDFunctions
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.Seconds
import org.apache.spark.streaming.StreamingContext
import org.apache.spark.streaming.dstream.DStream
import org.apache.spark.streaming.dstream.DStream.toPairDStreamFunctions

object ReferrerApp {
  def main(args: Array[String]) {
    if (args.length != 7) {
      System.err.println(
        "Usage: ReferrerApp <appname> <hostname> <port> <tachyonUrl> <checkpointDir> <outputPathTop> <outputPathSpark>")
      System.exit(1)
    }
    val Seq(appName, hostname, port, tachyonUrl, checkpointDir, outputPathTop, outputPathSpark) = args.toSeq

    val conf = new SparkConf()
      .setAppName(appName)
      .setJars(SparkContext.jarOfClass(this.getClass).toSeq)
      .set("spark.externalBlockStore.url", tachyonUrl)

    val ssc = new StreamingContext(conf, Seconds(10))
    ssc.checkpoint(checkpointDir)

    val clickstream = ssc.socketTextStream(hostname, port.toInt)
      .map(rec => rec.split("\\t"))
      .persist(StorageLevel.OFF_HEAP)

    val topRefStream = clickstream
      .map(rec => {
        var prev_title = rec(3)
        if (!prev_title.startsWith("other")) {
          prev_title = "wikipedia"
        }
        (prev_title, 1)
      })

    val topSparkStream = clickstream
      .filter(rec => rec(4).equals("Apache_Spark"))
      .map(rec => (rec(3), 1))

    saveTopKeys(topRefStream, outputPathTop)
    saveTopKeys(topSparkStream, outputPathSpark)

    ssc.start()
    ssc.awaitTermination()
  }

  def saveTopKeys(clickstream: DStream[(String, Int)], outputPath: String) {
    clickstream.updateStateByKey((values, state: Option[Int]) => Some(values.sum + state.getOrElse(0)))
      .repartition(1)
      .map(rec => (rec._2, rec._1))
      .transform(rec => rec.sortByKey(ascending = false))
      .saveAsTextFiles(outputPath)
  }
}