org.apache.spark.streaming.Time Scala Examples
The following examples show how to use org.apache.spark.streaming.Time.
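Before the examples, here is a minimal sketch of the Time API itself as it is used throughout the snippets below: a Time wraps a batch timestamp in milliseconds and supports arithmetic and comparison with Duration. The timestamp value is illustrative only.

import org.apache.spark.streaming.{Duration, Seconds, Time}

object TimeBasics {
  def main(args: Array[String]): Unit = {
    val t = Time(1700000000000L)          // an arbitrary batch timestamp in ms
    val batchInterval = Seconds(10)       // a Duration of 10 000 ms

    val next = t + batchInterval          // Time + Duration => Time
    val elapsed: Duration = next - t      // Time - Time => Duration
    val aligned = t.floor(batchInterval)  // round down to a batch boundary
    val isLater = next > t                // Times are ordered

    println(s"$t -> $next, elapsed=${elapsed.milliseconds} ms, aligned=$aligned, later=$isLater")
  }
}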
Example 1
Source File: SqlNetworkWordCount.scala From drizzle-spark with Apache License 2.0 | 6 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object SparkSessionSingleton {

  @transient private var instance: SparkSession = _

  def getInstance(sparkConf: SparkConf): SparkSession = {
    if (instance == null) {
      instance = SparkSession
        .builder
        .config(sparkConf)
        .getOrCreate()
    }
    instance
  }
}
// scalastyle:on println
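The excerpt above keeps only the lazily initialized SparkSession singleton; the streaming driver that pairs it with foreachRDD and the batch Time is not shown. The following is a sketch of that pairing under stated assumptions: the host/port, batch interval, and the lines/words names are placeholders, not part of the excerpt, and the imports are the ones listed above.

object SqlNetworkWordCountSketch {
  def main(args: Array[String]): Unit = {
    val sparkConf = new SparkConf().setAppName("SqlNetworkWordCount")
    val ssc = new StreamingContext(sparkConf, Seconds(2))

    val lines = ssc.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_SER)
    val words = lines.flatMap(_.split(" "))

    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      // Reuse a single SparkSession across batches instead of building one per batch.
      val spark = SparkSessionSingleton.getInstance(rdd.sparkContext.getConf)
      import spark.implicits._

      val wordsDataFrame = rdd.toDF("word")
      wordsDataFrame.createOrReplaceTempView("words")
      val wordCounts = spark.sql("select word, count(*) as total from words group by word")
      println(s"========= $time =========")
      wordCounts.show()
    }

    ssc.start()
    ssc.awaitTermination()
  }
}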
Example 2
Source File: KinesisInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo

private[kinesis] class KinesisInputDStream[T: ClassTag](
    _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler,
      awsCredentialsOption)
  }
}
Example 3
Source File: UnionDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class UnionDStream[T: ClassTag](parents: Array[DStream[T]])
  extends DStream[T](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to union is empty")
  require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.length == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[T]] = {
    val rdds = new ArrayBuffer[RDD[T]]()
    parents.map(_.getOrCompute(validTime)).foreach {
      case Some(rdd) => rdds += rdd
      case None => throw new SparkException("Could not generate RDD from a parent for unifying at" +
        s" time $validTime")
    }
    if (rdds.nonEmpty) {
      Some(ssc.sc.union(rdds))
    } else {
      None
    }
  }
}
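UnionDStream is normally built through the public union API rather than instantiated directly. A short sketch, assuming stream1 and stream2 are existing DStreams of the same element type on a StreamingContext ssc:

val merged = ssc.union(Seq(stream1, stream2))   // or, equivalently, stream1.union(stream2)
merged.foreachRDD((rdd, time) => println(s"$time: ${rdd.count()} merged records"))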
Example 4
Source File: ForEachDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.streaming.scheduler.Job

private[streaming]
class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
    foreachFunc: (RDD[T], Time) => Unit,
    displayInnerRDDOps: Boolean
  ) extends DStream[Unit](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[Unit]] = None

  override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time) match {
      case Some(rdd) =>
        val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
          foreachFunc(rdd, time)
        }
        Some(new Job(time, jobFunc))
      case None => None
    }
  }
}
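ForEachDStream is what DStream.foreachRDD builds behind the scenes, and the two-argument variant hands each batch its Time. A short sketch, with stream standing in for any existing DStream[String]:

stream.foreachRDD { (rdd: RDD[String], time: Time) =>
  println(s"Batch $time (${time.milliseconds} ms) contains ${rdd.count()} records")
}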
Example 5
Source File: QueueInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{StreamingContext, Time}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    queue.synchronized {
      if (oneAtATime && queue.nonEmpty) {
        buffer += queue.dequeue()
      } else {
        buffer ++= queue
        queue.clear()
      }
    }
    if (buffer.nonEmpty) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
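QueueInputDStream backs the public queueStream API, which is handy for tests. A sketch, assuming an existing StreamingContext ssc; the RDD contents are placeholders:

import scala.collection.mutable.Queue

val rddQueue = Queue(ssc.sparkContext.makeRDD(1 to 100), ssc.sparkContext.makeRDD(101 to 200))
val queued = ssc.queueStream(rddQueue, oneAtATime = true)
queued.foreachRDD((rdd, time) => println(s"$time: sum = ${rdd.sum()}"))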
Example 6
Source File: FlatMappedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class FlatMappedDStream[T: ClassTag, U: ClassTag](
    parent: DStream[T],
    flatMapFunc: T => TraversableOnce[U]
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    parent.getOrCompute(validTime).map(_.flatMap(flatMapFunc))
  }
}
Example 7
Source File: ShuffledDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag](
    parent: DStream[(K, V)],
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiner: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true
  ) extends DStream[(K, C)] (parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, C)]] = {
    parent.getOrCompute(validTime) match {
      case Some(rdd) => Some(rdd.combineByKey[C](
        createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine))
      case None => None
    }
  }
}
Example 8
Source File: FilteredDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class FilteredDStream[T: ClassTag](
    parent: DStream[T],
    filterFunc: T => Boolean
  ) extends DStream[T](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[T]] = {
    parent.getOrCompute(validTime).map(_.filter(filterFunc))
  }
}
Example 9
Source File: FlatMapValuedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class FlatMapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag](
    parent: DStream[(K, V)],
    flatMapValueFunc: V => TraversableOnce[U]
  ) extends DStream[(K, U)](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, U)]] = {
    parent.getOrCompute(validTime).map(_.flatMapValues[U](flatMapValueFunc))
  }
}
Example 10
Source File: DStreamCheckpointData.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.internal.Logging
import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils

private[streaming]
class DStreamCheckpointData[T: ClassTag](dstream: DStream[T])
  extends Serializable with Logging {
  protected val data = new HashMap[Time, AnyRef]()

  // Mapping of the batch time to the checkpointed RDD file of that time
  @transient private var timeToCheckpointFile = new HashMap[Time, String]
  // Mapping of the batch time to the time of the oldest checkpointed RDD
  // in that batch's checkpoint data
  @transient private var timeToOldestCheckpointFileTime = new HashMap[Time, Time]

  @transient private var fileSystem: FileSystem = null
  protected[streaming] def currentCheckpointFiles = data.asInstanceOf[HashMap[Time, String]]

  def restore() {
    // Create RDDs from the checkpoint data
    currentCheckpointFiles.foreach {
      case(time, file) =>
        logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'")
        dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file)))
    }
  }

  override def toString: String = {
    "[\n" + currentCheckpointFiles.size + " checkpoint files \n" +
      currentCheckpointFiles.mkString("\n") + "\n]"
  }

  @throws(classOf[IOException])
  private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".writeObject used")
    if (dstream.context.graph != null) {
      dstream.context.graph.synchronized {
        if (dstream.context.graph.checkpointInProgress) {
          oos.defaultWriteObject()
        } else {
          val msg = "Object of " + this.getClass.getName + " is being serialized " +
            " possibly as a part of closure of an RDD operation. This is because " +
            " the DStream object is being referred to from within the closure. " +
            " Please rewrite the RDD operation inside this DStream to avoid this. " +
            " This has been enforced to avoid bloating of Spark tasks " +
            " with unnecessary objects."
          throw new java.io.NotSerializableException(msg)
        }
      }
    } else {
      throw new java.io.NotSerializableException(
        "Graph is unexpectedly null when DStream is being serialized.")
    }
  }

  @throws(classOf[IOException])
  private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException {
    logDebug(this.getClass().getSimpleName + ".readObject used")
    ois.defaultReadObject()
    timeToOldestCheckpointFileTime = new HashMap[Time, Time]
    timeToCheckpointFile = new HashMap[Time, String]
  }
}
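DStreamCheckpointData is only populated when checkpointing is enabled. A minimal sketch of the user-facing calls that trigger it, assuming an existing StreamingContext ssc and DStream stream; the directory path is a placeholder:

ssc.checkpoint("hdfs:///tmp/streaming-checkpoints")  // checkpoint directory for metadata and RDDs
stream.checkpoint(Seconds(30))                       // checkpoint this DStream's RDDs every 30 s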
Example 11
Source File: MapValuedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class MapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag](
    parent: DStream[(K, V)],
    mapValueFunc: V => U
  ) extends DStream[(K, U)](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, U)]] = {
    parent.getOrCompute(validTime).map(_.mapValues[U](mapValueFunc))
  }
}
Example 12
Source File: TransformedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class TransformedDStream[U: ClassTag] (
    parents: Seq[DStream[_]],
    transformFunc: (Seq[RDD[_]], Time) => RDD[U]
  ) extends DStream[U](parents.head.ssc) {

  require(parents.nonEmpty, "List of DStreams to transform is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    val parentRDDs = parents.map { parent =>
      parent.getOrCompute(validTime).getOrElse(
        // Guard against a parent DStream that returns None instead of Some(rdd), to avoid NPE
        throw new SparkException(s"Couldn't generate RDD from parent at time $validTime"))
    }
    val transformedRDD = transformFunc(parentRDDs, validTime)
    if (transformedRDD == null) {
      throw new SparkException("Transform function must not return null. " +
        "Return SparkContext.emptyRDD() instead to represent no element " +
        "as the result of transformation.")
    }
    Some(transformedRDD)
  }

  override protected[streaming] def createRDDWithLocalProperties[U](
      time: Time,
      displayInnerRDDOps: Boolean)(body: => U): U = {
    super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body)
  }
}
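TransformedDStream is created by the public transform API; the (RDD, Time) variant lets the transformation see the batch time. A sketch, with stream standing in for a DStream[(String, Int)]:

val stamped = stream.transform { (rdd: RDD[(String, Int)], time: Time) =>
  rdd.map { case (k, v) => (k, v, time.milliseconds) }  // tag each record with its batch time
}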
Example 13
Source File: MappedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class MappedDStream[T: ClassTag, U: ClassTag] (
    parent: DStream[T],
    mapFunc: T => U
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    parent.getOrCompute(validTime).map(_.map[U](mapFunc))
  }
}
Example 14
Source File: MapPartitionedDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class MapPartitionedDStream[T: ClassTag, U: ClassTag](
    parent: DStream[T],
    mapPartFunc: Iterator[T] => Iterator[U],
    preservePartitioning: Boolean
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    parent.getOrCompute(validTime).map(_.mapPartitions[U](mapPartFunc, preservePartitioning))
  }
}
Example 15
Source File: BatchUIData.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui

import scala.collection.mutable

import org.apache.spark.streaming.Time
import org.apache.spark.streaming.scheduler.{BatchInfo, OutputOperationInfo, StreamInputInfo}
import org.apache.spark.streaming.ui.StreamingJobProgressListener._

private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId)

private[ui] case class BatchUIData(
    val batchTime: Time,
    val streamIdToInputInfo: Map[Int, StreamInputInfo],
    val submissionTime: Long,
    val processingStartTime: Option[Long],
    val processingEndTime: Option[Long],
    val outputOperations: mutable.HashMap[OutputOpId, OutputOperationUIData] = mutable.HashMap(),
    var outputOpIdSparkJobIdPairs: Iterable[OutputOpIdAndSparkJobId] = Seq.empty) {

  def isFailed: Boolean = numFailedOutputOp != 0
}

private[ui] object BatchUIData {

  def apply(batchInfo: BatchInfo): BatchUIData = {
    val outputOperations = mutable.HashMap[OutputOpId, OutputOperationUIData]()
    outputOperations ++= batchInfo.outputOperationInfos.mapValues(OutputOperationUIData.apply)
    new BatchUIData(
      batchInfo.batchTime,
      batchInfo.streamIdToInputInfo,
      batchInfo.submissionTime,
      batchInfo.processingStartTime,
      batchInfo.processingEndTime,
      outputOperations
    )
  }
}

private[ui] case class OutputOperationUIData(
    id: OutputOpId,
    name: String,
    description: String,
    startTime: Option[Long],
    endTime: Option[Long],
    failureReason: Option[String]) {

  def duration: Option[Long] = for (s <- startTime; e <- endTime) yield e - s
}

private[ui] object OutputOperationUIData {

  def apply(outputOperationInfo: OutputOperationInfo): OutputOperationUIData = {
    OutputOperationUIData(
      outputOperationInfo.id,
      outputOperationInfo.name,
      outputOperationInfo.description,
      outputOperationInfo.startTime,
      outputOperationInfo.endTime,
      outputOperationInfo.failureReason
    )
  }
}
Example 16
Source File: JobSet.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
Example 17
Source File: Job.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.util.{Failure, Try}

import org.apache.spark.streaming.Time
import org.apache.spark.util.{CallSite, Utils}

// NOTE: this excerpt is truncated; the enclosing Job class declaration and its private fields
// (such as _id, _outputOpId, isSet, _callSite, _startTime, _endTime, _result, time, id) are not shown.

  def outputOpId: Int = {
    if (!isSet) {
      throw new IllegalStateException("Cannot access number before calling setId")
    }
    _outputOpId
  }

  def setOutputOpId(outputOpId: Int) {
    if (isSet) {
      throw new IllegalStateException("Cannot call setOutputOpId more than once")
    }
    isSet = true
    _id = s"streaming job $time.$outputOpId"
    _outputOpId = outputOpId
  }

  def setCallSite(callSite: CallSite): Unit = {
    _callSite = callSite
  }

  def callSite: CallSite = _callSite

  def setStartTime(startTime: Long): Unit = {
    _startTime = Some(startTime)
  }

  def setEndTime(endTime: Long): Unit = {
    _endTime = Some(endTime)
  }

  def toOutputOperationInfo: OutputOperationInfo = {
    val failureReason = if (_result != null && _result.isFailure) {
      Some(Utils.exceptionString(_result.asInstanceOf[Failure[_]].exception))
    } else {
      None
    }
    OutputOperationInfo(
      time, outputOpId, callSite.shortForm, callSite.longForm, _startTime, _endTime, failureReason)
  }

  override def toString: String = id
}
Example 18
Source File: InputInfoTrackerSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import org.scalatest.BeforeAndAfter

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter {

  private var ssc: StreamingContext = _

  before {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker")
    if (ssc == null) {
      ssc = new StreamingContext(conf, Duration(1000))
    }
  }

  after {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
  }

  test("test report and get InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val streamId2 = 1
    val time = Time(0L)
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId2, 300L)
    inputInfoTracker.reportInfo(time, inputInfo1)
    inputInfoTracker.reportInfo(time, inputInfo2)

    val batchTimeToInputInfos = inputInfoTracker.getInfo(time)
    assert(batchTimeToInputInfos.size == 2)
    assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2))
    assert(batchTimeToInputInfos(streamId1) === inputInfo1)
    assert(batchTimeToInputInfos(streamId2) === inputInfo2)
    assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1)
  }

  test("test cleanup InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId1, 300L)
    inputInfoTracker.reportInfo(Time(0), inputInfo1)
    inputInfoTracker.reportInfo(Time(1), inputInfo2)

    inputInfoTracker.cleanup(Time(0))
    assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)

    inputInfoTracker.cleanup(Time(1))
    assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)
  }
}
Example 19
Source File: CheckpointedDirectKafkaInputDStream.scala From streamliner-examples with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kafka

// NOTE: this excerpt is truncated; the CheckpointedDirectKafkaInputDStream class declaration,
// its imports, and the beginning of its compute(validTime: Time) method are not shown.

    prevOffsets = currentOffsets
    currentOffsets = untilOffsets.map(kv => kv._1 -> kv._2.offset)

    prevOffsets == currentOffsets match {
      case false => Some(rdd)
      case true => None
    }
  }

  def getCurrentOffsets(): Map[TopicAndPartition, Long] = currentOffsets

  def setCurrentOffsets(offsets: Map[TopicAndPartition, Long]): Unit = {
    currentOffsets = offsets
  }
}
Example 20
Source File: SqlNetworkWordCount.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
Example 21
Source File: KinesisInputDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 22
Source File: UnionDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 23
Source File: ForEachDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.streaming.scheduler.Job private[streaming] class ForEachDStream[T: ClassTag] ( parent: DStream[T], foreachFunc: (RDD[T], Time) => Unit, displayInnerRDDOps: Boolean ) extends DStream[Unit](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[Unit]] = None override def generateJob(time: Time): Option[Job] = { parent.getOrCompute(time) match { case Some(rdd) => val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) { foreachFunc(rdd, time) } Some(new Job(time, jobFunc)) case None => None } } }
Example 24
Source File: QueueInputDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{StreamingContext, Time} private[streaming] class QueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
Example 25
Source File: FlatMappedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FlatMappedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], flatMapFunc: T => TraversableOnce[U] ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.flatMap(flatMapFunc)) } }
Example 26
Source File: ShuffledDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 27
Source File: FilteredDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FilteredDStream[T: ClassTag]( parent: DStream[T], filterFunc: T => Boolean ) extends DStream[T](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { parent.getOrCompute(validTime).map(_.filter(filterFunc)) } }
Example 28
Source File: FlatMapValuedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FlatMapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], flatMapValueFunc: V => TraversableOnce[U] ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.flatMapValues[U](flatMapValueFunc)) } }
Example 29
Source File: MapValuedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], mapValueFunc: V => U ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.mapValues[U](mapValueFunc)) } }
Example 30
Source File: TransformedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
Example 31
Source File: MappedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MappedDStream[T: ClassTag, U: ClassTag] ( parent: DStream[T], mapFunc: T => U ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.map[U](mapFunc)) } }
Example 32
Source File: MapPartitionedDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MapPartitionedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], mapPartFunc: Iterator[T] => Iterator[U], preservePartitioning: Boolean ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.mapPartitions[U](mapPartFunc, preservePartitioning)) } }
Example 33
Source File: BatchUIData.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import scala.collection.mutable import org.apache.spark.streaming.Time import org.apache.spark.streaming.scheduler.{BatchInfo, OutputOperationInfo, StreamInputInfo} import org.apache.spark.streaming.ui.StreamingJobProgressListener._ private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId) private[ui] case class BatchUIData( val batchTime: Time, val streamIdToInputInfo: Map[Int, StreamInputInfo], val submissionTime: Long, val processingStartTime: Option[Long], val processingEndTime: Option[Long], val outputOperations: mutable.HashMap[OutputOpId, OutputOperationUIData] = mutable.HashMap(), var outputOpIdSparkJobIdPairs: Iterable[OutputOpIdAndSparkJobId] = Seq.empty) { def isFailed: Boolean = numFailedOutputOp != 0 } private[ui] object BatchUIData { def apply(batchInfo: BatchInfo): BatchUIData = { val outputOperations = mutable.HashMap[OutputOpId, OutputOperationUIData]() outputOperations ++= batchInfo.outputOperationInfos.mapValues(OutputOperationUIData.apply) new BatchUIData( batchInfo.batchTime, batchInfo.streamIdToInputInfo, batchInfo.submissionTime, batchInfo.processingStartTime, batchInfo.processingEndTime, outputOperations ) } } private[ui] case class OutputOperationUIData( id: OutputOpId, name: String, description: String, startTime: Option[Long], endTime: Option[Long], failureReason: Option[String]) { def duration: Option[Long] = for (s <- startTime; e <- endTime) yield e - s } private[ui] object OutputOperationUIData { def apply(outputOperationInfo: OutputOperationInfo): OutputOperationUIData = { OutputOperationUIData( outputOperationInfo.id, outputOperationInfo.name, outputOperationInfo.description, outputOperationInfo.startTime, outputOperationInfo.endTime, outputOperationInfo.failureReason ) } }
Example 34
Source File: JobSet.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import org.apache.spark.streaming.Time private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = processingEndTime - time.milliseconds def toBatchInfo: BatchInfo = { BatchInfo( time, streamIdToInputInfo, submissionTime, if (hasStarted) Some(processingStartTime) else None, if (hasCompleted) Some(processingEndTime) else None, jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap ) } }
Example 35
Source File: Job.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.util.{Failure, Try} import org.apache.spark.streaming.Time import org.apache.spark.util.{CallSite, Utils} def outputOpId: Int = { if (!isSet) { throw new IllegalStateException("Cannot access number before calling setId") } _outputOpId } def setOutputOpId(outputOpId: Int) { if (isSet) { throw new IllegalStateException("Cannot call setOutputOpId more than once") } isSet = true _id = s"streaming job $time.$outputOpId" _outputOpId = outputOpId } def setCallSite(callSite: CallSite): Unit = { _callSite = callSite } def callSite: CallSite = _callSite def setStartTime(startTime: Long): Unit = { _startTime = Some(startTime) } def setEndTime(endTime: Long): Unit = { _endTime = Some(endTime) } def toOutputOperationInfo: OutputOperationInfo = { val failureReason = if (_result != null && _result.isFailure) { Some(Utils.exceptionString(_result.asInstanceOf[Failure[_]].exception)) } else { None } OutputOperationInfo( time, outputOpId, callSite.shortForm, callSite.longForm, _startTime, _endTime, failureReason) } override def toString: String = id }
Example 36
Source File: InputInfoTrackerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.streaming.{Duration, StreamingContext, Time} class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter { private var ssc: StreamingContext = _ before { val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker") if (ssc == null) { ssc = new StreamingContext(conf, Duration(1000)) } } after { if (ssc != null) { ssc.stop() ssc = null } } test("test report and get InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val streamId2 = 1 val time = Time(0L) val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId2, 300L) inputInfoTracker.reportInfo(time, inputInfo1) inputInfoTracker.reportInfo(time, inputInfo2) val batchTimeToInputInfos = inputInfoTracker.getInfo(time) assert(batchTimeToInputInfos.size == 2) assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2)) assert(batchTimeToInputInfos(streamId1) === inputInfo1) assert(batchTimeToInputInfos(streamId2) === inputInfo2) assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1) } test("test cleanup InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId1, 300L) inputInfoTracker.reportInfo(Time(0), inputInfo1) inputInfoTracker.reportInfo(Time(1), inputInfo2) inputInfoTracker.cleanup(Time(0)) assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) inputInfoTracker.cleanup(Time(1)) assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) } }
Example 37
Source File: SavingStream.scala From cuesheet with Apache License 2.0 | 5 votes |
package com.kakao.cuesheet.convert

import java.util.concurrent.{Future => JFuture}

import scala.reflect.runtime.universe.TypeTag

import com.kakao.mango.concurrent.{NamedExecutors, RichExecutorService}
import com.kakao.mango.text.ThreadSafeDateFormat

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.{Row, DataFrame}
import org.apache.spark.sql.hive.HiveContext
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

object SavingStream {
  val yyyyMMdd = ThreadSafeDateFormat("yyyy-MM-dd")
  val hh = ThreadSafeDateFormat("HH")
  val mm = ThreadSafeDateFormat("mm")
  val m0 = (ms: Long) => mm(ms).charAt(0) + "0"
}

// NOTE: the declaration of the abstract SavingStream[T] class is elided in this excerpt
// (presumably taking the DStream and an abstract toDF(rdd: RDD[T]): DataFrame, which the
// subclasses below override); the members that follow belong to that class.

  @transient var executor: RichExecutorService = _

  def ex: RichExecutorService = {
    if (executor == null) {
      this.synchronized {
        if (executor == null) {
          executor = new RichExecutorService(es.get())
        }
      }
    }
    executor
  }

  def saveAsPartitionedTable(table: String, path: String, format: String = "orc")(
      toPartition: Time => Seq[(String, String)]): Unit = {
    stream.foreachRDD { (rdd, time) =>
      ex.submit {
        toDF(rdd).appendToExternalTablePartition(table, path, format, toPartition(time): _*)
      }
    }
  }

  def saveAsDailyPartitionedTable(table: String, path: String,
      dateColumn: String = "date", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms))
    }
  }

  def saveAsHourlyPartitionedTable(table: String, path: String,
      dateColumn: String = "date", hourColumn: String = "hour", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms))
    }
  }

  def saveAsTenMinutelyPartitionedTable(table: String, path: String,
      dateColumn: String = "date", hourColumn: String = "hour",
      minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> m0(ms))
    }
  }

  def saveAsMinutelyPartitionedTable(table: String, path: String,
      dateColumn: String = "date", hourColumn: String = "hour",
      minuteColumn: String = "minute", format: String = "orc"): Unit = {
    saveAsPartitionedTable(table, path, format) { time =>
      val ms = time.milliseconds
      Seq(dateColumn -> yyyyMMdd(ms), hourColumn -> hh(ms), minuteColumn -> mm(ms))
    }
  }
}

class ProductStream[T <: Product : TypeTag](stream: DStream[T])(
    implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[T](stream) {
  override def toDF(rdd: RDD[T]) = ctx.createDataFrame(rdd)
}

class JsonStream(stream: DStream[String])(
    implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[String](stream) {
  override def toDF(rdd: RDD[String]) = ctx.read.json(rdd)
}

class MapStream[T](stream: DStream[Map[String, T]])(
    implicit ctx: HiveContext, es: ExecutorSupplier) extends SavingStream[Map[String, T]](stream) {
  import com.kakao.mango.json._

  override def toDF(rdd: RDD[Map[String, T]]) = ctx.read.json(rdd.map(toJson))
}

class RowStream(stream: DStream[Row])(
    implicit ctx: HiveContext, es: ExecutorSupplier, schema: StructType)
    extends SavingStream[Row](stream) {
  override def toDF(rdd: RDD[Row]): DataFrame = ctx.createDataFrame(rdd, schema)
}
Example 38
Source File: ExistingDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming

import org.apache.spark.rdd.{EmptyRDD, RDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.analysis.MultiInstanceRelation
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.plans.logical.{LogicalPlan, Statistics}
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming]
case class PhysicalDStream(output: Seq[Attribute], @transient stream: DStream[InternalRow])
    extends SparkPlan with StreamPlan {

  def children = Nil

  override def doExecute() = {
    assert(validTime != null)
    Utils.invoke(classOf[DStream[InternalRow]], stream, "getOrCompute", (classOf[Time], validTime))
      .asInstanceOf[Option[RDD[InternalRow]]]
      .getOrElse(new EmptyRDD[InternalRow](sparkContext))
  }
}
Example 39
Source File: StreamPlan.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming

import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

private[streaming] object StreamPlan {
  val currentContext = new ThreadLocal[StreamSQLContext]()
}

trait StreamPlan {
  protected var validTime: Time = null

  def streamSqlContext = StreamPlan.currentContext.get()
  def stream: DStream[InternalRow]
  def setValidTime(time: Time): Unit = {
    validTime = time
  }
}
Example 40
Source File: SimpleJsonFileInputDStream.scala From spark-cep with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.streaming

import scala.io.Source

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.streaming.dstream.InputDStream
import org.apache.spark.streaming.{StreamingContext, Time}

class SimpleJsonFileInputDStream (
    sqlc: SQLContext,
    @transient ssc: StreamingContext,
    path: String) extends InputDStream[String](ssc) {
  val jsons = Source.fromFile(path).getLines().toList
  var index = 0

  override def start(): Unit = {
  }

  override def stop(): Unit = {
  }

  override def compute(validTime: Time): Option[RDD[String]] = {
    val rddOption = Option(ssc.sparkContext.parallelize(List(jsons(index % jsons.size))))
    index = index + 1
    rddOption
  }
}
Example 41
Source File: CloudantStreaming.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.sql.cloudant

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

import org.apache.bahir.cloudant.CloudantReceiver

object CloudantStreaming {
  def main(args: Array[String]) {
    val spark = SparkSession.builder()
      .appName("Cloudant Spark SQL External Datasource in Scala")
      .master("local[*]")
      .getOrCreate()

    // Create the context with a 10 seconds batch size
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    import spark.implicits._

    val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map(
      "cloudant.host" -> "examples.cloudant.com",
      "database" -> "sales")))

    changes.foreachRDD((rdd: RDD[String], time: Time) => {
      // Get the singleton instance of SparkSession
      println(s"========= $time =========")// scalastyle:ignore

      // Convert RDD[String] to Dataset[String]
      val changesDataFrame = spark.read.json(rdd.toDS())
      if (changesDataFrame.schema.nonEmpty) {
        changesDataFrame.printSchema()

        var hasDelRecord = false
        var hasMonth = false
        for (field <- changesDataFrame.schema.fieldNames) {
          if ("_deleted".equals(field)) {
            hasDelRecord = true
          }
          if ("month".equals(field)) {
            hasMonth = true
          }
        }
        if (hasDelRecord) {
          changesDataFrame.filter(changesDataFrame("_deleted")).select("*").show()
        }

        if (hasMonth) {
          changesDataFrame.filter(changesDataFrame("month") === "May").select("*").show(5)
          changesDataFrame.createOrReplaceTempView("sales")
          val salesInMayCountsDataFrame = spark.sql(
            s"""
               |select rep, amount
               |from sales
               |where month = "May"
            """.stripMargin)
          salesInMayCountsDataFrame.show(5)
        }
      }
    })
    ssc.start()
    // run streaming for 60 secs
    Thread.sleep(60000L)
    ssc.stop(true)
  }
}
Example 42
Source File: CloudantStreamingSelector.scala From bahir with Apache License 2.0 | 5 votes |
package org.apache.spark.examples.sql.cloudant

import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.SparkSession
import org.apache.spark.streaming.{ Seconds, StreamingContext, Time }

import org.apache.bahir.cloudant.CloudantReceiver

object CloudantStreamingSelector {
  def main(args: Array[String]) {
    val spark = SparkSession.builder()
      .appName("Cloudant Spark SQL External Datasource in Scala")
      .master("local[*]")
      .getOrCreate()

    import spark.implicits._

    // Create the context with a 10 seconds batch size
    val ssc = new StreamingContext(spark.sparkContext, Seconds(10))
    val curTotalAmount = new AtomicLong(0)
    val curSalesCount = new AtomicLong(0)
    var batchAmount = 0L

    val changes = ssc.receiverStream(new CloudantReceiver(spark.sparkContext.getConf, Map(
      "cloudant.host" -> "examples.cloudant.com",
      "database" -> "sales",
      "selector" -> "{\"month\":\"May\", \"rep\":\"John\"}")))

    changes.foreachRDD((rdd: RDD[String], time: Time) => {
      // Get the singleton instance of SQLContext
      println(s"========= $time =========") // scalastyle:ignore
      val changesDataFrame = spark.read.json(rdd.toDS())
      if (changesDataFrame.schema.nonEmpty) {
        changesDataFrame.select("*").show()
        batchAmount = changesDataFrame.groupBy().sum("amount").collect()(0).getLong(0)
        curSalesCount.getAndAdd(changesDataFrame.count())
        curTotalAmount.getAndAdd(batchAmount)
        println("Current sales count:" + curSalesCount)// scalastyle:ignore
        println("Current total amount:" + curTotalAmount)// scalastyle:ignore
      } else {
        ssc.stop()
      }
    })

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 43
Source File: SqlNetworkWordCount.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
Example 44
Source File: KinesisInputDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 45
Source File: UnionDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 46
Source File: ForEachDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.streaming.scheduler.Job private[streaming] class ForEachDStream[T: ClassTag] ( parent: DStream[T], foreachFunc: (RDD[T], Time) => Unit, displayInnerRDDOps: Boolean ) extends DStream[Unit](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[Unit]] = None override def generateJob(time: Time): Option[Job] = { parent.getOrCompute(time) match { case Some(rdd) => val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) { foreachFunc(rdd, time) } Some(new Job(time, jobFunc)) case None => None } } }
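A ForEachDStream is what foreachRDD registers behind the scenes, and the two-argument foreachRDD overload is the usual way user code gets hold of the batch Time. A hedged usage sketch, assuming an existing DStream[String] named lines (the helper name is hypothetical):

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

// Hypothetical helper: `lines` is assumed to be an existing DStream[String].
def logBatchSizes(lines: DStream[String]): Unit = {
  lines.foreachRDD { (rdd: RDD[String], time: Time) =>
    // `time` is the batch Time the scheduler passes to ForEachDStream.generateJob.
    println(s"Batch at ${time.milliseconds} ms had ${rdd.getNumPartitions} partitions")
  }
}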
Example 47
Source File: QueueInputDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{StreamingContext, Time} private[streaming] class QueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
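QueueInputDStream instances are normally created via ssc.queueStream. A minimal sketch follows; the queue contents, partition counts and batch interval are illustrative assumptions, and the checkpointing restriction noted in the code above still applies.

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStreamSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("QueueStreamSketch")
    val ssc = new StreamingContext(conf, Seconds(1))

    // Each queued RDD is consumed one batch at a time because oneAtATime = true.
    val rddQueue = new Queue[RDD[Int]]()
    val stream = ssc.queueStream(rddQueue, oneAtATime = true)
    stream.count().print()

    ssc.start()
    // Feed a few RDDs into the queue; do not enable checkpointing with queueStream.
    for (_ <- 1 to 3) {
      rddQueue.synchronized { rddQueue += ssc.sparkContext.makeRDD(1 to 100, 2) }
      Thread.sleep(1000)
    }
    ssc.stop(stopSparkContext = true, stopGracefully = true)
  }
}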
Example 48
Source File: FlatMappedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FlatMappedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], flatMapFunc: T => TraversableOnce[U] ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.flatMap(flatMapFunc)) } }
Example 49
Source File: ShuffledDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 50
Source File: FilteredDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FilteredDStream[T: ClassTag]( parent: DStream[T], filterFunc: T => Boolean ) extends DStream[T](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { parent.getOrCompute(validTime).map(_.filter(filterFunc)) } }
Example 51
Source File: FlatMapValuedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FlatMapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], flatMapValueFunc: V => TraversableOnce[U] ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.flatMapValues[U](flatMapValueFunc)) } }
Example 52
Source File: MapValuedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], mapValueFunc: V => U ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.mapValues[U](mapValueFunc)) } }
Example 53
Source File: TransformedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.nonEmpty, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map { parent => parent.getOrCompute(validTime).getOrElse( // Guard out against parent DStream that return None instead of Some(rdd) to avoid NPE throw new SparkException(s"Couldn't generate RDD from parent at time $validTime")) } val transformedRDD = transformFunc(parentRDDs, validTime) if (transformedRDD == null) { throw new SparkException("Transform function must not return null. " + "Return SparkContext.emptyRDD() instead to represent no element " + "as the result of transformation.") } Some(transformedRDD) } override protected[streaming] def createRDDWithLocalProperties[U]( time: Time, displayInnerRDDOps: Boolean)(body: => U): U = { super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body) } }
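TransformedDStream backs the public transform operator, which hands both the parent RDD and the batch Time to user code. A hedged sketch under the assumption that a DStream[String] named lines already exists (the function name is hypothetical):

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

// Tag every record with the batch time it arrived in; `lines` is assumed to exist.
def tagWithBatchTime(lines: DStream[String]): DStream[(Long, String)] = {
  lines.transform { (rdd: RDD[String], time: Time) =>
    val batchMs = time.milliseconds
    rdd.map(record => (batchMs, record))
  }
}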
Example 54
Source File: MappedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MappedDStream[T: ClassTag, U: ClassTag] ( parent: DStream[T], mapFunc: T => U ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.map[U](mapFunc)) } }
Example 55
Source File: MapPartitionedDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MapPartitionedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], mapPartFunc: Iterator[T] => Iterator[U], preservePartitioning: Boolean ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.mapPartitions[U](mapPartFunc, preservePartitioning)) } }
Example 56
Source File: BatchUIData.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import scala.collection.mutable import org.apache.spark.streaming.Time import org.apache.spark.streaming.scheduler.{BatchInfo, OutputOperationInfo, StreamInputInfo} import org.apache.spark.streaming.ui.StreamingJobProgressListener._ private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId) private[ui] case class BatchUIData( val batchTime: Time, val streamIdToInputInfo: Map[Int, StreamInputInfo], val submissionTime: Long, val processingStartTime: Option[Long], val processingEndTime: Option[Long], val outputOperations: mutable.HashMap[OutputOpId, OutputOperationUIData] = mutable.HashMap(), var outputOpIdSparkJobIdPairs: Iterable[OutputOpIdAndSparkJobId] = Seq.empty) { def isFailed: Boolean = numFailedOutputOp != 0 } private[ui] object BatchUIData { def apply(batchInfo: BatchInfo): BatchUIData = { val outputOperations = mutable.HashMap[OutputOpId, OutputOperationUIData]() outputOperations ++= batchInfo.outputOperationInfos.mapValues(OutputOperationUIData.apply) new BatchUIData( batchInfo.batchTime, batchInfo.streamIdToInputInfo, batchInfo.submissionTime, batchInfo.processingStartTime, batchInfo.processingEndTime, outputOperations ) } } private[ui] case class OutputOperationUIData( id: OutputOpId, name: String, description: String, startTime: Option[Long], endTime: Option[Long], failureReason: Option[String]) { def duration: Option[Long] = for (s <- startTime; e <- endTime) yield e - s } private[ui] object OutputOperationUIData { def apply(outputOperationInfo: OutputOperationInfo): OutputOperationUIData = { OutputOperationUIData( outputOperationInfo.id, outputOperationInfo.name, outputOperationInfo.description, outputOperationInfo.startTime, outputOperationInfo.endTime, outputOperationInfo.failureReason ) } }
Example 57
Source File: JobSet.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import org.apache.spark.streaming.Time private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = processingEndTime - time.milliseconds def toBatchInfo: BatchInfo = { BatchInfo( time, streamIdToInputInfo, submissionTime, if (hasStarted) Some(processingStartTime) else None, if (hasCompleted) Some(processingEndTime) else None, jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap ) } }
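The delay bookkeeping in JobSet is plain Time arithmetic: totalDelay measures from the batch time itself, while processingDelay measures only from when the first job started running. A small sketch of the same calculation outside the scheduler, using made-up timestamps purely for illustration:

import org.apache.spark.streaming.Time

// Illustrative values only: a batch stamped at t, with processing bracketed by start/end.
val batchTime = Time(1000L)
val processingStartTime = 1250L
val processingEndTime = 1900L

val schedulingDelay = processingStartTime - batchTime.milliseconds // 250 ms waiting in the queue
val processingDelay = processingEndTime - processingStartTime      // 650 ms doing work
val totalDelay = processingEndTime - batchTime.milliseconds        // 900 ms end to end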
Example 58
Source File: Job.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.util.{Failure, Try} import org.apache.spark.streaming.Time import org.apache.spark.util.{CallSite, Utils} def outputOpId: Int = { if (!isSet) { throw new IllegalStateException("Cannot access number before calling setId") } _outputOpId } def setOutputOpId(outputOpId: Int) { if (isSet) { throw new IllegalStateException("Cannot call setOutputOpId more than once") } isSet = true _id = s"streaming job $time.$outputOpId" _outputOpId = outputOpId } def setCallSite(callSite: CallSite): Unit = { _callSite = callSite } def callSite: CallSite = _callSite def setStartTime(startTime: Long): Unit = { _startTime = Some(startTime) } def setEndTime(endTime: Long): Unit = { _endTime = Some(endTime) } def toOutputOperationInfo: OutputOperationInfo = { val failureReason = if (_result != null && _result.isFailure) { Some(Utils.exceptionString(_result.asInstanceOf[Failure[_]].exception)) } else { None } OutputOperationInfo( time, outputOpId, callSite.shortForm, callSite.longForm, _startTime, _endTime, failureReason) } override def toString: String = id }
Example 59
Source File: InputInfoTrackerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.streaming.{Duration, StreamingContext, Time} class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter { private var ssc: StreamingContext = _ before { val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker") if (ssc == null) { ssc = new StreamingContext(conf, Duration(1000)) } } after { if (ssc != null) { ssc.stop() ssc = null } } test("test report and get InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val streamId2 = 1 val time = Time(0L) val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId2, 300L) inputInfoTracker.reportInfo(time, inputInfo1) inputInfoTracker.reportInfo(time, inputInfo2) val batchTimeToInputInfos = inputInfoTracker.getInfo(time) assert(batchTimeToInputInfos.size == 2) assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2)) assert(batchTimeToInputInfos(streamId1) === inputInfo1) assert(batchTimeToInputInfos(streamId2) === inputInfo2) assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1) } test("test cleanup InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val inputInfo1 = StreamInputInfo(streamId1, 100L) val inputInfo2 = StreamInputInfo(streamId1, 300L) inputInfoTracker.reportInfo(Time(0), inputInfo1) inputInfoTracker.reportInfo(Time(1), inputInfo2) inputInfoTracker.cleanup(Time(0)) assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) inputInfoTracker.cleanup(Time(1)) assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) } }
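The suite above only constructs batch keys with Time(0L) and Time(1). Beyond construction, Time also supports ordering and Duration arithmetic, which is what batch bookkeeping and cleanup rely on. A brief sketch of those operations with arbitrary values:

import org.apache.spark.streaming.{Duration, Time}

val t0 = Time(0L)
val t1 = t0 + Duration(1000)                    // Time(1000): advance by one second
val gap: Duration = t1 - t0                     // Duration(1000)
val isAligned = t1.isMultipleOf(Duration(500))  // true: 1000 ms is a multiple of 500 ms
val earlier = t0 < t1                           // true: Time is ordered by milliseconds
val floored = t1.floor(Duration(300))           // Time(900): round down to a batch boundary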
Example 60
Source File: UnionDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new Exception("Could not generate RDD from a parent for unifying at time " + validTime) } if (rdds.size > 0) { Some(new UnionRDD(ssc.sc, rdds)) } else { None } } }
Example 61
Source File: ForEachDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.streaming.scheduler.Job import scala.reflect.ClassTag private[streaming] class ForEachDStream[T: ClassTag] ( parent: DStream[T], foreachFunc: (RDD[T], Time) => Unit ) extends DStream[Unit](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[Unit]] = None override def generateJob(time: Time): Option[Job] = { parent.getOrCompute(time) match { case Some(rdd) => val jobFunc = () => createRDDWithLocalProperties(time) { ssc.sparkContext.setCallSite(creationSite) foreachFunc(rdd, time) } Some(new Job(time, jobFunc)) case None => None } } }
Example 62
Source File: QueueInputDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{Time, StreamingContext} private[streaming] class QueueInputDStream[T: ClassTag]( @transient ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() if (oneAtATime && queue.size > 0) { buffer += queue.dequeue() } else { buffer ++= queue.dequeueAll(_ => true) } if (buffer.size > 0) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(ssc.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { None } } }
Example 63
Source File: FlatMappedDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class FlatMappedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], flatMapFunc: T => Traversable[U] ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.flatMap(flatMapFunc)) } }
Example 64
Source File: ShuffledDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 65
Source File: FilteredDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class FilteredDStream[T: ClassTag]( parent: DStream[T], filterFunc: T => Boolean ) extends DStream[T](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { parent.getOrCompute(validTime).map(_.filter(filterFunc)) } }
Example 66
Source File: FlatMapValuedDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import scala.reflect.ClassTag private[streaming] class FlatMapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], flatMapValueFunc: V => TraversableOnce[U] ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.flatMapValues[U](flatMapValueFunc)) } }
Example 67
Source File: MapValuedDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import scala.reflect.ClassTag private[streaming] class MapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], mapValueFunc: V => U ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.mapValues[U](mapValueFunc)) } }
Example 68
Source File: TransformedDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.rdd.{PairRDDFunctions, RDD} import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.length > 0, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq Some(transformFunc(parentRDDs, validTime)) } }
Example 69
Source File: MappedDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class MappedDStream[T: ClassTag, U: ClassTag] ( parent: DStream[T], mapFunc: T => U ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.map[U](mapFunc)) } }
Example 70
Source File: MapPartitionedDStream.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class MapPartitionedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], mapPartFunc: Iterator[T] => Iterator[U], preservePartitioning: Boolean ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.mapPartitions[U](mapPartFunc, preservePartitioning)) } }
Example 71
Source File: BatchUIData.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.streaming.Time import org.apache.spark.streaming.scheduler.BatchInfo import org.apache.spark.streaming.ui.StreamingJobProgressListener._ private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId) private[ui] case class BatchUIData( val batchTime: Time, val streamIdToNumRecords: Map[Int, Long], val submissionTime: Long, val processingStartTime: Option[Long], val processingEndTime: Option[Long], var outputOpIdSparkJobIdPairs: Seq[OutputOpIdAndSparkJobId] = Seq.empty) { def numRecords: Long = streamIdToNumRecords.values.sum } private[ui] object BatchUIData { def apply(batchInfo: BatchInfo): BatchUIData = { new BatchUIData( batchInfo.batchTime, batchInfo.streamIdToNumRecords, batchInfo.submissionTime, batchInfo.processingStartTime, batchInfo.processingEndTime ) } }
Example 72
Source File: JobSet.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import org.apache.spark.streaming.Time private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToNumRecords: Map[Int, Long] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = { processingEndTime - time.milliseconds } def toBatchInfo: BatchInfo = { new BatchInfo( time, streamIdToNumRecords, submissionTime, if (processingStartTime >= 0 ) Some(processingStartTime) else None, if (processingEndTime >= 0 ) Some(processingEndTime) else None ) } }
Example 73
package org.apache.spark.streaming.scheduler import org.apache.spark.streaming.Time import scala.util.Try def outputOpId: Int = { if (!isSet) { throw new IllegalStateException("Cannot access number before calling setId") } _outputOpId } def setOutputOpId(outputOpId: Int) { if (isSet) { throw new IllegalStateException("Cannot call setOutputOpId more than once") } isSet = true _id = s"streaming job $time.$outputOpId" _outputOpId = outputOpId } override def toString: String = id }
Example 74
Source File: InputInfoTrackerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import org.scalatest.BeforeAndAfter import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.streaming.{Time, Duration, StreamingContext} class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter { private var ssc: StreamingContext = _ before { val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTacker") if (ssc == null) { ssc = new StreamingContext(conf, Duration(1000)) } } after { if (ssc != null) { ssc.stop() ssc = null } } test("test report and get InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val streamId2 = 1 val time = Time(0L) val inputInfo1 = InputInfo(streamId1, 100L) val inputInfo2 = InputInfo(streamId2, 300L) inputInfoTracker.reportInfo(time, inputInfo1) inputInfoTracker.reportInfo(time, inputInfo2) val batchTimeToInputInfos = inputInfoTracker.getInfo(time) assert(batchTimeToInputInfos.size == 2) assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2)) assert(batchTimeToInputInfos(streamId1) === inputInfo1) assert(batchTimeToInputInfos(streamId2) === inputInfo2) assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1) } test("test cleanup InputInfo from InputInfoTracker") { val inputInfoTracker = new InputInfoTracker(ssc) val streamId1 = 0 val inputInfo1 = InputInfo(streamId1, 100L) val inputInfo2 = InputInfo(streamId1, 300L) inputInfoTracker.reportInfo(Time(0), inputInfo1) inputInfoTracker.reportInfo(Time(1), inputInfo2) inputInfoTracker.cleanup(Time(0)) assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) inputInfoTracker.cleanup(Time(1)) assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None) assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2) } }
Example 75
Source File: SqlNetworkWordCount.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Time, Seconds, StreamingContext} import org.apache.spark.util.IntParam import org.apache.spark.sql.SQLContext import org.apache.spark.storage.StorageLevel object SQLContextSingleton { @transient private var instance: SQLContext = _ def getInstance(sparkContext: SparkContext): SQLContext = { if (instance == null) { instance = new SQLContext(sparkContext) } instance } } // scalastyle:on println
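Only the singleton helper survives in this extract; in the full SqlNetworkWordCount example the batch Time is used inside foreachRDD to label the per-batch SQL output. The following is a hedged reconstruction of that usage pattern built on the SQLContextSingleton helper above; the stream, case class and table name are assumptions based on the example's intent, not a verbatim copy.

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.Time
import org.apache.spark.streaming.dstream.DStream

case class Record(word: String)

// Assumes a DStream[String] of words; runs one SQL query per batch and labels it with the Time.
def countWordsPerBatch(words: DStream[String]): Unit = {
  words.foreachRDD { (rdd: RDD[String], time: Time) =>
    val sqlContext = SQLContextSingleton.getInstance(rdd.sparkContext)
    import sqlContext.implicits._

    val wordsDataFrame = rdd.map(w => Record(w)).toDF()
    wordsDataFrame.registerTempTable("words")
    val wordCountsDataFrame =
      sqlContext.sql("select word, count(*) as total from words group by word")
    println(s"========= $time =========")
    wordCountsDataFrame.show()
  }
}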
Example 76
Source File: KinesisInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo import org.apache.spark.streaming.{Duration, StreamingContext, Time} private[kinesis] class KinesisInputDStream( @transient _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[Array[Byte]](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[Array[Byte]] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[Array[Byte]] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, awsCredentialsOption) } }
Example 77
Source File: UnionDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.rdd.UnionRDD import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new Exception("Could not generate RDD from a parent for unifying at time " + validTime) } if (rdds.size > 0) { Some(new UnionRDD(ssc.sc, rdds)) } else { None } } }
Example 78
Source File: ForEachDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.streaming.scheduler.Job
import scala.reflect.ClassTag

// One per DStream output operation
private[streaming]
class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
    foreachFunc: (RDD[T], Time) => Unit
  ) extends DStream[Unit](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  // Slide duration
  override def slideDuration: Duration = parent.slideDuration

  // None is declared as an object, not a class: use None when there is no value, and Some to wrap
  // a value when there is one; both are subtypes of Option.
  override def compute(validTime: Time): Option[RDD[Unit]] = None

  override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time) match {
      case Some(rdd) =>
        val jobFunc = () => createRDDWithLocalProperties(time) {
          ssc.sparkContext.setCallSite(creationSite)
          foreachFunc(rdd, time)
        }
        Some(new Job(time, jobFunc))
      // None is returned when the parent produced no RDD for this batch.
      case None => None
    }
  }
}
Example 79
Source File: QueueInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{Time, StreamingContext} private[streaming] class QueueInputDStream[T: ClassTag]( @transient ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() if (oneAtATime && queue.size > 0) { buffer += queue.dequeue() } else { buffer ++= queue.dequeueAll(_ => true) } if (buffer.size > 0) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(ssc.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { None } } }
Example 80
Source File: FlatMappedDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class FlatMappedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], flatMapFunc: T => Traversable[U] ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.flatMap(flatMapFunc)) } }
Example 81
Source File: ShuffledDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 82
Source File: FilteredDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class FilteredDStream[T: ClassTag]( parent: DStream[T], filterFunc: T => Boolean ) extends DStream[T](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { parent.getOrCompute(validTime).map(_.filter(filterFunc)) } }
Example 83
Source File: FlatMapValuedDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import scala.reflect.ClassTag private[streaming] class FlatMapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], flatMapValueFunc: V => TraversableOnce[U] ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.flatMapValues[U](flatMapValueFunc)) } }
Example 84
Source File: MapValuedDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext._ import scala.reflect.ClassTag private[streaming] class MapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], mapValueFunc: V => U ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.mapValues[U](mapValueFunc)) } }
Example 85
Source File: TransformedDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.rdd.{PairRDDFunctions, RDD} import org.apache.spark.streaming.{Duration, Time} import scala.reflect.ClassTag private[streaming] class TransformedDStream[U: ClassTag] ( parents: Seq[DStream[_]], transformFunc: (Seq[RDD[_]], Time) => RDD[U] ) extends DStream[U](parents.head.ssc) { require(parents.length > 0, "List of DStreams to transform is empty") require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.size == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { val parentRDDs = parents.map(_.getOrCompute(validTime).orNull).toSeq Some(transformFunc(parentRDDs, validTime)) } }
Example 86
Source File: MappedDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class MappedDStream[T: ClassTag, U: ClassTag] ( parent: DStream[T], mapFunc: T => U ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.map[U](mapFunc)) } }
Example 87
Source File: MapPartitionedDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.rdd.RDD import scala.reflect.ClassTag private[streaming] class MapPartitionedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], mapPartFunc: Iterator[T] => Iterator[U], preservePartitioning: Boolean ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.mapPartitions[U](mapPartFunc, preservePartitioning)) } }
Example 88
Source File: BatchUIData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui import org.apache.spark.streaming.Time import org.apache.spark.streaming.scheduler.{BatchInfo, StreamInputInfo} import org.apache.spark.streaming.ui.StreamingJobProgressListener._ private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId) private[ui] case class BatchUIData( val batchTime: Time, val streamIdToInputInfo: Map[Int, StreamInputInfo], val submissionTime: Long, val processingStartTime: Option[Long], val processingEndTime: Option[Long], val numOutputOp: Int, val failureReason: Map[Int, String], var outputOpIdSparkJobIdPairs: Seq[OutputOpIdAndSparkJobId] = Seq.empty) { def numRecords: Long = streamIdToInputInfo.values.map(_.numRecords).sum } private[ui] object BatchUIData { def apply(batchInfo: BatchInfo): BatchUIData = { new BatchUIData( batchInfo.batchTime, batchInfo.streamIdToInputInfo, batchInfo.submissionTime, batchInfo.processingStartTime, batchInfo.processingEndTime, batchInfo.numOutputOp, batchInfo.failureReasons ) } }
Example 89
Source File: JobSet.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet
import scala.util.Failure

import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = {
    processingEndTime - time.milliseconds
  }

  def toBatchInfo: BatchInfo = {
    val failureReasons: Map[Int, String] = {
      if (hasCompleted) {
        jobs.filter(_.result.isFailure).map { job =>
          (job.outputOpId, Utils.exceptionString(job.result.asInstanceOf[Failure[_]].exception))
        }.toMap
      } else {
        Map.empty
      }
    }
    val binfo = new BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (processingStartTime >= 0) Some(processingStartTime) else None,
      if (processingEndTime >= 0) Some(processingEndTime) else None
    )
    binfo.setFailureReason(failureReasons)
    binfo.setNumOutputOp(jobs.size)
    binfo
  }
}
Example 90
package org.apache.spark.streaming.scheduler import org.apache.spark.streaming.Time import scala.util.Try def outputOpId: Int = { if (!isSet) { throw new IllegalStateException("Cannot access number before calling setId") } _outputOpId } def setOutputOpId(outputOpId: Int) { if (isSet) { throw new IllegalStateException("Cannot call setOutputOpId more than once") } isSet = true _id = s"streaming job $time.$outputOpId" _outputOpId = outputOpId } override def toString: String = id }
Example 91
Source File: SqlNetworkWordCount.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples.streaming import org.apache.spark.SparkConf import org.apache.spark.rdd.RDD import org.apache.spark.sql.SparkSession import org.apache.spark.storage.StorageLevel import org.apache.spark.streaming.{Seconds, StreamingContext, Time} object SparkSessionSingleton { @transient private var instance: SparkSession = _ def getInstance(sparkConf: SparkConf): SparkSession = { if (instance == null) { instance = SparkSession .builder .config(sparkConf) .getOrCreate() } instance } } // scalastyle:on println
Example 92
Source File: UnionDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import org.apache.spark.SparkException import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class UnionDStream[T: ClassTag](parents: Array[DStream[T]]) extends DStream[T](parents.head.ssc) { require(parents.length > 0, "List of DStreams to union is empty") require(parents.map(_.ssc).distinct.length == 1, "Some of the DStreams have different contexts") require(parents.map(_.slideDuration).distinct.length == 1, "Some of the DStreams have different slide durations") override def dependencies: List[DStream[_]] = parents.toList override def slideDuration: Duration = parents.head.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { val rdds = new ArrayBuffer[RDD[T]]() parents.map(_.getOrCompute(validTime)).foreach { case Some(rdd) => rdds += rdd case None => throw new SparkException("Could not generate RDD from a parent for unifying at" + s" time $validTime") } if (rdds.nonEmpty) { Some(ssc.sc.union(rdds)) } else { None } } }
Example 93
Source File: ForEachDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} import org.apache.spark.streaming.scheduler.Job private[streaming] class ForEachDStream[T: ClassTag] ( parent: DStream[T], foreachFunc: (RDD[T], Time) => Unit, displayInnerRDDOps: Boolean ) extends DStream[Unit](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[Unit]] = None override def generateJob(time: Time): Option[Job] = { parent.getOrCompute(time) match { case Some(rdd) => val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) { foreachFunc(rdd, time) } Some(new Job(time, jobFunc)) case None => None } } }
Example 94
Source File: QueueInputDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.{ArrayBuffer, Queue} import scala.reflect.ClassTag import org.apache.spark.rdd.{RDD, UnionRDD} import org.apache.spark.streaming.{StreamingContext, Time} private[streaming] class QueueInputDStream[T: ClassTag]( ssc: StreamingContext, val queue: Queue[RDD[T]], oneAtATime: Boolean, defaultRDD: RDD[T] ) extends InputDStream[T](ssc) { override def start() { } override def stop() { } private def readObject(in: ObjectInputStream): Unit = { throw new NotSerializableException("queueStream doesn't support checkpointing. " + "Please don't use queueStream when checkpointing is enabled.") } private def writeObject(oos: ObjectOutputStream): Unit = { logWarning("queueStream doesn't support checkpointing") } override def compute(validTime: Time): Option[RDD[T]] = { val buffer = new ArrayBuffer[RDD[T]]() queue.synchronized { if (oneAtATime && queue.nonEmpty) { buffer += queue.dequeue() } else { buffer ++= queue queue.clear() } } if (buffer.nonEmpty) { if (oneAtATime) { Some(buffer.head) } else { Some(new UnionRDD(context.sc, buffer.toSeq)) } } else if (defaultRDD != null) { Some(defaultRDD) } else { Some(ssc.sparkContext.emptyRDD) } } }
Example 95
Source File: FlatMappedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FlatMappedDStream[T: ClassTag, U: ClassTag]( parent: DStream[T], flatMapFunc: T => TraversableOnce[U] ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.flatMap(flatMapFunc)) } }
Example 96
Source File: ShuffledDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.Partitioner import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag]( parent: DStream[(K, V)], createCombiner: V => C, mergeValue: (C, V) => C, mergeCombiner: (C, C) => C, partitioner: Partitioner, mapSideCombine: Boolean = true ) extends DStream[(K, C)] (parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, C)]] = { parent.getOrCompute(validTime) match { case Some(rdd) => Some(rdd.combineByKey[C]( createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine)) case None => None } } }
Example 97
Source File: FilteredDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FilteredDStream[T: ClassTag]( parent: DStream[T], filterFunc: T => Boolean ) extends DStream[T](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[T]] = { parent.getOrCompute(validTime).map(_.filter(filterFunc)) } }
Example 98
Source File: FlatMapValuedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class FlatMapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], flatMapValueFunc: V => TraversableOnce[U] ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.flatMapValues[U](flatMapValueFunc)) } }
Example 99
Source File: MapValuedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag]( parent: DStream[(K, V)], mapValueFunc: V => U ) extends DStream[(K, U)](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[(K, U)]] = { parent.getOrCompute(validTime).map(_.mapValues[U](mapValueFunc)) } }
Example 100
Source File: TransformedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class TransformedDStream[U: ClassTag] (
    parents: Seq[DStream[_]],
    transformFunc: (Seq[RDD[_]], Time) => RDD[U]
  ) extends DStream[U](parents.head.ssc) {

  require(parents.nonEmpty, "List of DStreams to transform is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    // For each parent stream, get its RDD for the current batch time.
    val parentRDDs = parents.map { parent =>
      parent.getOrCompute(validTime).getOrElse(
        // Guard against parent DStreams that return None instead of Some(rdd), to avoid NPE
        throw new SparkException(s"Couldn't generate RDD from parent at time $validTime"))
    }
    val transformedRDD = transformFunc(parentRDDs, validTime)
    if (transformedRDD == null) {
      throw new SparkException("Transform function must not return null. " +
        "Return SparkContext.emptyRDD() instead to represent no element " +
        "as the result of transformation.")
    }
    Some(transformedRDD)
  }

  override protected[streaming] def createRDDWithLocalProperties[U](
      time: Time,
      displayInnerRDDOps: Boolean)(body: => U): U = {
    super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body)
  }
}
Example 101
Source File: MappedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import scala.reflect.ClassTag import org.apache.spark.rdd.RDD import org.apache.spark.streaming.{Duration, Time} private[streaming] class MappedDStream[T: ClassTag, U: ClassTag] ( parent: DStream[T], mapFunc: T => U ) extends DStream[U](parent.ssc) { override def dependencies: List[DStream[_]] = List(parent) override def slideDuration: Duration = parent.slideDuration override def compute(validTime: Time): Option[RDD[U]] = { parent.getOrCompute(validTime).map(_.map[U](mapFunc)) } }
Example 102
Source File: MapPartitionedDStream.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class MapPartitionedDStream[T: ClassTag, U: ClassTag](
    parent: DStream[T],
    mapPartFunc: Iterator[T] => Iterator[U],
    preservePartitioning: Boolean
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    parent.getOrCompute(validTime).map(_.mapPartitions[U](mapPartFunc, preservePartitioning))
  }
}
Example 103
Source File: BatchUIData.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui

import scala.collection.mutable

import org.apache.spark.streaming.Time
import org.apache.spark.streaming.scheduler.{BatchInfo, OutputOperationInfo, StreamInputInfo}
import org.apache.spark.streaming.ui.StreamingJobProgressListener._

private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId)

private[ui] case class BatchUIData(
    val batchTime: Time,
    val streamIdToInputInfo: Map[Int, StreamInputInfo],
    val submissionTime: Long,
    val processingStartTime: Option[Long],
    val processingEndTime: Option[Long],
    val outputOperations: mutable.HashMap[OutputOpId, OutputOperationUIData] = mutable.HashMap(),
    var outputOpIdSparkJobIdPairs: Iterable[OutputOpIdAndSparkJobId] = Seq.empty) {

  def isFailed: Boolean = numFailedOutputOp != 0
}

private[ui] object BatchUIData {

  def apply(batchInfo: BatchInfo): BatchUIData = {
    val outputOperations = mutable.HashMap[OutputOpId, OutputOperationUIData]()
    outputOperations ++= batchInfo.outputOperationInfos.mapValues(OutputOperationUIData.apply)
    new BatchUIData(
      batchInfo.batchTime,
      batchInfo.streamIdToInputInfo,
      batchInfo.submissionTime,
      batchInfo.processingStartTime,
      batchInfo.processingEndTime,
      outputOperations
    )
  }
}

private[ui] case class OutputOperationUIData(
    id: OutputOpId,
    name: String,
    description: String,
    startTime: Option[Long],
    endTime: Option[Long],
    failureReason: Option[String]) {

  def duration: Option[Long] = for (s <- startTime; e <- endTime) yield e - s
}

private[ui] object OutputOperationUIData {

  def apply(outputOperationInfo: OutputOperationInfo): OutputOperationUIData = {
    OutputOperationUIData(
      outputOperationInfo.id,
      outputOperationInfo.name,
      outputOperationInfo.description,
      outputOperationInfo.startTime,
      outputOperationInfo.endTime,
      outputOperationInfo.failureReason
    )
  }
}
Example 104
Source File: JobSet.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
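The delay metrics in JobSet are all anchored on the batch Time: processing delay measures only the work, while total delay is measured from the batch time itself. A small arithmetic sketch of that relationship (the numbers are illustrative; the scheduling-delay line follows BatchInfo's definition and is shown only for context):

import org.apache.spark.streaming.Time

object BatchDelaySketch {
  def main(args: Array[String]): Unit = {
    val batchTime = Time(1000L)      // when the batch was due
    val submissionTime = 1200L       // when the JobSet was handed to the scheduler
    val processingStartTime = 1300L  // when the first job started
    val processingEndTime = 1900L    // when the last job finished

    val schedulingDelay = processingStartTime - submissionTime    // 100 ms waiting in the queue
    val processingDelay = processingEndTime - processingStartTime // 600 ms of actual work
    val totalDelay = processingEndTime - batchTime.milliseconds   // 900 ms end to end

    println(s"scheduling=$schedulingDelay ms, processing=$processingDelay ms, total=$totalDelay ms")
  }
}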
Example 105
Source File: Job.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.util.{Failure, Try}

import org.apache.spark.streaming.Time
import org.apache.spark.util.{CallSite, Utils}

  def outputOpId: Int = {
    if (!isSet) {
      throw new IllegalStateException("Cannot access number before calling setId")
    }
    _outputOpId
  }

  def setOutputOpId(outputOpId: Int) {
    if (isSet) {
      throw new IllegalStateException("Cannot call setOutputOpId more than once")
    }
    isSet = true
    _id = s"streaming job $time.$outputOpId"
    _outputOpId = outputOpId
  }

  def setCallSite(callSite: CallSite): Unit = {
    _callSite = callSite
  }

  def callSite: CallSite = _callSite

  def setStartTime(startTime: Long): Unit = {
    _startTime = Some(startTime)
  }

  def setEndTime(endTime: Long): Unit = {
    _endTime = Some(endTime)
  }

  def toOutputOperationInfo: OutputOperationInfo = {
    val failureReason = if (_result != null && _result.isFailure) {
      Some(Utils.exceptionString(_result.asInstanceOf[Failure[_]].exception))
    } else {
      None
    }
    OutputOperationInfo(
      time, outputOpId, callSite.shortForm, callSite.longForm, _startTime, _endTime, failureReason)
  }

  override def toString: String = id
Example 106
Source File: InputInfoTrackerSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import org.scalatest.BeforeAndAfter

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter {

  private var ssc: StreamingContext = _

  before {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTracker")
    if (ssc == null) {
      ssc = new StreamingContext(conf, Duration(1000))
    }
  }

  after {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
  }

  test("test report and get InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val streamId2 = 1
    val time = Time(0L)
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId2, 300L)
    inputInfoTracker.reportInfo(time, inputInfo1)
    inputInfoTracker.reportInfo(time, inputInfo2)

    val batchTimeToInputInfos = inputInfoTracker.getInfo(time)
    assert(batchTimeToInputInfos.size == 2)
    assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2))
    assert(batchTimeToInputInfos(streamId1) === inputInfo1)
    assert(batchTimeToInputInfos(streamId2) === inputInfo2)
    assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1)
  }

  test("test cleanup InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId1, 300L)
    inputInfoTracker.reportInfo(Time(0), inputInfo1)
    inputInfoTracker.reportInfo(Time(1), inputInfo2)

    inputInfoTracker.cleanup(Time(0))
    assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)

    inputInfoTracker.cleanup(Time(1))
    assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)
  }
}
Example 107
Source File: SqlNetworkWordCount.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples.streaming

import org.apache.spark.SparkConf
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Time, Seconds, StreamingContext}
import org.apache.spark.util.IntParam
import org.apache.spark.sql.SQLContext
import org.apache.spark.storage.StorageLevel

object SQLContextSingleton {

  @transient private var instance: SQLContext = _

  def getInstance(sparkContext: SparkContext): SQLContext = {
    if (instance == null) {
      instance = new SQLContext(sparkContext)
    }
    instance
  }
}
// scalastyle:on println
Example 108
Source File: KinesisInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis

import scala.reflect.ClassTag

import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream
import com.amazonaws.services.kinesis.model.Record

import org.apache.spark.rdd.RDD
import org.apache.spark.storage.{BlockId, StorageLevel}
import org.apache.spark.streaming.dstream.ReceiverInputDStream
import org.apache.spark.streaming.receiver.Receiver
import org.apache.spark.streaming.scheduler.ReceivedBlockInfo
import org.apache.spark.streaming.{Duration, StreamingContext, Time}

private[kinesis] class KinesisInputDStream[T: ClassTag](
    @transient _ssc: StreamingContext,
    streamName: String,
    endpointUrl: String,
    regionName: String,
    initialPositionInStream: InitialPositionInStream,
    checkpointAppName: String,
    checkpointInterval: Duration,
    storageLevel: StorageLevel,
    messageHandler: Record => T,
    awsCredentialsOption: Option[SerializableAWSCredentials]
  ) extends ReceiverInputDStream[T](_ssc) {

  private[streaming]
  override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = {

    // This returns true even for when blockInfos is empty
    val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty)

    if (allBlocksHaveRanges) {
      // Create a KinesisBackedBlockRDD, even when there are no blocks
      val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray
      val seqNumRanges = blockInfos.map {
        _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray
      val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray
      logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " +
        s"seq number ranges: ${seqNumRanges.mkString(", ")} ")
      new KinesisBackedBlockRDD(
        context.sc, regionName, endpointUrl, blockIds, seqNumRanges,
        isBlockIdValid = isBlockIdValid,
        retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt,
        messageHandler = messageHandler,
        awsCredentialsOption = awsCredentialsOption)
    } else {
      logWarning("Kinesis sequence number information was not present with some block metadata," +
        " it may not be possible to recover from failures")
      super.createBlockRDD(time, blockInfos)
    }
  }

  override def getReceiver(): Receiver[T] = {
    new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream,
      checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption)
  }
}
Example 109
Source File: UnionDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.UnionRDD

private[streaming]
class UnionDStream[T: ClassTag](parents: Array[DStream[T]])
  extends DStream[T](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to union is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[T]] = {
    val rdds = new ArrayBuffer[RDD[T]]()
    parents.map(_.getOrCompute(validTime)).foreach {
      case Some(rdd) => rdds += rdd
      case None => throw new SparkException("Could not generate RDD from a parent for unifying at" +
        s" time $validTime")
    }
    if (rdds.size > 0) {
      Some(new UnionRDD(ssc.sc, rdds))
    } else {
      None
    }
  }
}
Example 110
Source File: ForEachDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.streaming.scheduler.Job

import scala.reflect.ClassTag

private[streaming]
class ForEachDStream[T: ClassTag] (
    parent: DStream[T],
    foreachFunc: (RDD[T], Time) => Unit,
    displayInnerRDDOps: Boolean
  ) extends DStream[Unit](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[Unit]] = None

  override def generateJob(time: Time): Option[Job] = {
    parent.getOrCompute(time) match {
      case Some(rdd) =>
        val jobFunc = () => createRDDWithLocalProperties(time, displayInnerRDDOps) {
          foreachFunc(rdd, time)
        }
        Some(new Job(time, jobFunc))
      case None => None
    }
  }
}
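ForEachDStream is the output operator that the scheduler turns into one Job per batch; application code creates it indirectly through DStream.foreachRDD. Below is a minimal sketch of the two-argument foreachRDD form, which exposes the batch Time to the output function. The socket source, port, and object name are illustrative assumptions; only the foreachRDD((RDD, Time) => Unit) signature is taken from the Spark Streaming API:

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}

object ForeachRDDWithTimeSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("ForeachRDDWithTimeSketch")
    val ssc = new StreamingContext(conf, Seconds(1))

    val words = ssc.socketTextStream("localhost", 9999).flatMap(_.split(" "))

    // The (RDD, Time) variant of foreachRDD receives the batch time,
    // which is useful for time-stamped sinks or per-batch logging.
    words.foreachRDD { (rdd: RDD[String], time: Time) =>
      println(s"Batch at $time contained ${rdd.count()} words")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}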
Example 111
Source File: QueueInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import java.io.{NotSerializableException, ObjectInputStream, ObjectOutputStream}

import scala.collection.mutable.{ArrayBuffer, Queue}
import scala.reflect.ClassTag

import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.streaming.{Time, StreamingContext}

private[streaming]
class QueueInputDStream[T: ClassTag](
    ssc: StreamingContext,
    val queue: Queue[RDD[T]],
    oneAtATime: Boolean,
    defaultRDD: RDD[T]
  ) extends InputDStream[T](ssc) {

  override def start() { }

  override def stop() { }

  private def readObject(in: ObjectInputStream): Unit = {
    throw new NotSerializableException("queueStream doesn't support checkpointing. " +
      "Please don't use queueStream when checkpointing is enabled.")
  }

  private def writeObject(oos: ObjectOutputStream): Unit = {
    logWarning("queueStream doesn't support checkpointing")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    val buffer = new ArrayBuffer[RDD[T]]()
    if (oneAtATime && queue.size > 0) {
      buffer += queue.dequeue()
    } else {
      buffer ++= queue.dequeueAll(_ => true)
    }
    if (buffer.size > 0) {
      if (oneAtATime) {
        Some(buffer.head)
      } else {
        Some(new UnionRDD(context.sc, buffer.toSeq))
      }
    } else if (defaultRDD != null) {
      Some(defaultRDD)
    } else {
      Some(ssc.sparkContext.emptyRDD)
    }
  }
}
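QueueInputDStream backs StreamingContext.queueStream, which is mostly useful in tests because each queued RDD is served as the data for one batch. A minimal usage sketch, assuming a local StreamingContext; the queue contents and object name are illustrative:

import scala.collection.mutable.Queue

import org.apache.spark.SparkConf
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Seconds, StreamingContext}

object QueueStreamSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setMaster("local[2]").setAppName("QueueStreamSketch")
    val ssc = new StreamingContext(conf, Seconds(1))

    // With oneAtATime = true, each dequeued RDD becomes one batch interval's data.
    val rddQueue = new Queue[RDD[Int]]()
    val stream = ssc.queueStream(rddQueue, oneAtATime = true)
    stream.map(_ * 2).print()

    ssc.start()
    for (_ <- 1 to 3) {
      rddQueue.synchronized {
        rddQueue += ssc.sparkContext.parallelize(1 to 10)
      }
      Thread.sleep(1000)
    }
    ssc.awaitTerminationOrTimeout(5000)
    ssc.stop()
  }
}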
Example 112
Source File: FlatMappedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

private[streaming]
class FlatMappedDStream[T: ClassTag, U: ClassTag](
    parent: DStream[T],
    flatMapFunc: T => Traversable[U]
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    parent.getOrCompute(validTime).map(_.flatMap(flatMapFunc))
  }
}
Example 113
Source File: ShuffledDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._
import org.apache.spark.streaming.{Duration, Time}

import scala.reflect.ClassTag

private[streaming]
class ShuffledDStream[K: ClassTag, V: ClassTag, C: ClassTag](
    parent: DStream[(K, V)],
    createCombiner: V => C,
    mergeValue: (C, V) => C,
    mergeCombiner: (C, C) => C,
    partitioner: Partitioner,
    mapSideCombine: Boolean = true
  ) extends DStream[(K, C)] (parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, C)]] = {
    parent.getOrCompute(validTime) match {
      case Some(rdd) => Some(rdd.combineByKey[C](
          createCombiner, mergeValue, mergeCombiner, partitioner, mapSideCombine))
      case None => None
    }
  }
}
Example 114
Source File: FilteredDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

private[streaming]
class FilteredDStream[T: ClassTag](
    parent: DStream[T],
    filterFunc: T => Boolean
  ) extends DStream[T](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[T]] = {
    parent.getOrCompute(validTime).map(_.filter(filterFunc))
  }
}
Example 115
Source File: FlatMapValuedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

import scala.reflect.ClassTag

private[streaming]
class FlatMapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag](
    parent: DStream[(K, V)],
    flatMapValueFunc: V => TraversableOnce[U]
  ) extends DStream[(K, U)](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, U)]] = {
    parent.getOrCompute(validTime).map(_.flatMapValues[U](flatMapValueFunc))
  }
}
Example 116
Source File: MapValuedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

import scala.reflect.ClassTag

private[streaming]
class MapValuedDStream[K: ClassTag, V: ClassTag, U: ClassTag](
    parent: DStream[(K, V)],
    mapValueFunc: V => U
  ) extends DStream[(K, U)](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[(K, U)]] = {
    parent.getOrCompute(validTime).map(_.mapValues[U](mapValueFunc))
  }
}
Example 117
Source File: TransformedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import scala.reflect.ClassTag

import org.apache.spark.SparkException
import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.{Duration, Time}

private[streaming]
class TransformedDStream[U: ClassTag] (
    parents: Seq[DStream[_]],
    transformFunc: (Seq[RDD[_]], Time) => RDD[U]
  ) extends DStream[U](parents.head.ssc) {

  require(parents.length > 0, "List of DStreams to transform is empty")
  require(parents.map(_.ssc).distinct.size == 1, "Some of the DStreams have different contexts")
  require(parents.map(_.slideDuration).distinct.size == 1,
    "Some of the DStreams have different slide durations")

  override def dependencies: List[DStream[_]] = parents.toList

  override def slideDuration: Duration = parents.head.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    val parentRDDs = parents.map { parent =>
      parent.getOrCompute(validTime).getOrElse(
        // Guard against parent DStreams that return None instead of Some(rdd), to avoid NPE
        throw new SparkException(s"Couldn't generate RDD from parent at time $validTime"))
    }
    val transformedRDD = transformFunc(parentRDDs, validTime)
    if (transformedRDD == null) {
      throw new SparkException("Transform function must not return null. " +
        "Return SparkContext.emptyRDD() instead to represent no element " +
        "as the result of transformation.")
    }
    Some(transformedRDD)
  }

  override protected[streaming] def createRDDWithLocalProperties[U](
      time: Time,
      displayInnerRDDOps: Boolean)(body: => U): U = {
    super.createRDDWithLocalProperties(time, displayInnerRDDOps = true)(body)
  }
}
Example 118
Source File: MappedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

private[streaming]
class MappedDStream[T: ClassTag, U: ClassTag] (
    parent: DStream[T],
    mapFunc: T => U
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    parent.getOrCompute(validTime).map(_.map[U](mapFunc))
  }
}
Example 119
Source File: MapPartitionedDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream

import org.apache.spark.streaming.{Duration, Time}
import org.apache.spark.rdd.RDD

import scala.reflect.ClassTag

private[streaming]
class MapPartitionedDStream[T: ClassTag, U: ClassTag](
    parent: DStream[T],
    mapPartFunc: Iterator[T] => Iterator[U],
    preservePartitioning: Boolean
  ) extends DStream[U](parent.ssc) {

  override def dependencies: List[DStream[_]] = List(parent)

  override def slideDuration: Duration = parent.slideDuration

  override def compute(validTime: Time): Option[RDD[U]] = {
    parent.getOrCompute(validTime).map(_.mapPartitions[U](mapPartFunc, preservePartitioning))
  }
}
Example 120
Source File: BatchUIData.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.ui

import scala.collection.mutable

import org.apache.spark.streaming.Time
import org.apache.spark.streaming.scheduler.{BatchInfo, OutputOperationInfo, StreamInputInfo}
import org.apache.spark.streaming.ui.StreamingJobProgressListener._

private[ui] case class OutputOpIdAndSparkJobId(outputOpId: OutputOpId, sparkJobId: SparkJobId)

private[ui] case class BatchUIData(
    val batchTime: Time,
    val streamIdToInputInfo: Map[Int, StreamInputInfo],
    val submissionTime: Long,
    val processingStartTime: Option[Long],
    val processingEndTime: Option[Long],
    val outputOperations: mutable.HashMap[OutputOpId, OutputOperationUIData] = mutable.HashMap(),
    var outputOpIdSparkJobIdPairs: Seq[OutputOpIdAndSparkJobId] = Seq.empty) {

  def isFailed: Boolean = numFailedOutputOp != 0
}

private[ui] object BatchUIData {

  def apply(batchInfo: BatchInfo): BatchUIData = {
    val outputOperations = mutable.HashMap[OutputOpId, OutputOperationUIData]()
    outputOperations ++= batchInfo.outputOperationInfos.mapValues(OutputOperationUIData.apply)
    new BatchUIData(
      batchInfo.batchTime,
      batchInfo.streamIdToInputInfo,
      batchInfo.submissionTime,
      batchInfo.processingStartTime,
      batchInfo.processingEndTime,
      outputOperations
    )
  }
}

private[ui] case class OutputOperationUIData(
    id: OutputOpId,
    name: String,
    description: String,
    startTime: Option[Long],
    endTime: Option[Long],
    failureReason: Option[String]) {

  def duration: Option[Long] = for (s <- startTime; e <- endTime) yield e - s
}

private[ui] object OutputOperationUIData {

  def apply(outputOperationInfo: OutputOperationInfo): OutputOperationUIData = {
    OutputOperationUIData(
      outputOperationInfo.id,
      outputOperationInfo.name,
      outputOperationInfo.description,
      outputOperationInfo.startTime,
      outputOperationInfo.endTime,
      outputOperationInfo.failureReason
    )
  }
}
Example 121
Source File: JobSet.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet
import scala.util.Failure

import org.apache.spark.streaming.Time
import org.apache.spark.util.Utils

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = {
    processingEndTime - time.milliseconds
  }

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (processingStartTime >= 0) Some(processingStartTime) else None,
      if (processingEndTime >= 0) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
Example 122
Source File: Job.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.util.{Failure, Try}

import org.apache.spark.streaming.Time
import org.apache.spark.util.{Utils, CallSite}

  def outputOpId: Int = {
    if (!isSet) {
      throw new IllegalStateException("Cannot access number before calling setId")
    }
    _outputOpId
  }

  def setOutputOpId(outputOpId: Int) {
    if (isSet) {
      throw new IllegalStateException("Cannot call setOutputOpId more than once")
    }
    isSet = true
    _id = s"streaming job $time.$outputOpId"
    _outputOpId = outputOpId
  }

  def setCallSite(callSite: CallSite): Unit = {
    _callSite = callSite
  }

  def callSite: CallSite = _callSite

  def setStartTime(startTime: Long): Unit = {
    _startTime = Some(startTime)
  }

  def setEndTime(endTime: Long): Unit = {
    _endTime = Some(endTime)
  }

  def toOutputOperationInfo: OutputOperationInfo = {
    val failureReason = if (_result != null && _result.isFailure) {
      Some(Utils.exceptionString(_result.asInstanceOf[Failure[_]].exception))
    } else {
      None
    }
    OutputOperationInfo(
      time, outputOpId, callSite.shortForm, callSite.longForm, _startTime, _endTime, failureReason)
  }

  override def toString: String = id
Example 123
Source File: InputInfoTrackerSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import org.scalatest.BeforeAndAfter

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.streaming.{Time, Duration, StreamingContext}

class InputInfoTrackerSuite extends SparkFunSuite with BeforeAndAfter {

  private var ssc: StreamingContext = _

  before {
    val conf = new SparkConf().setMaster("local[2]").setAppName("DirectStreamTracker")
    if (ssc == null) {
      ssc = new StreamingContext(conf, Duration(1000))
    }
  }

  after {
    if (ssc != null) {
      ssc.stop()
      ssc = null
    }
  }

  test("test report and get InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val streamId2 = 1
    val time = Time(0L)
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId2, 300L)
    inputInfoTracker.reportInfo(time, inputInfo1)
    inputInfoTracker.reportInfo(time, inputInfo2)

    val batchTimeToInputInfos = inputInfoTracker.getInfo(time)
    assert(batchTimeToInputInfos.size == 2)
    assert(batchTimeToInputInfos.keys === Set(streamId1, streamId2))
    assert(batchTimeToInputInfos(streamId1) === inputInfo1)
    assert(batchTimeToInputInfos(streamId2) === inputInfo2)
    assert(inputInfoTracker.getInfo(time)(streamId1) === inputInfo1)
  }

  test("test cleanup InputInfo from InputInfoTracker") {
    val inputInfoTracker = new InputInfoTracker(ssc)

    val streamId1 = 0
    val inputInfo1 = StreamInputInfo(streamId1, 100L)
    val inputInfo2 = StreamInputInfo(streamId1, 300L)
    inputInfoTracker.reportInfo(Time(0), inputInfo1)
    inputInfoTracker.reportInfo(Time(1), inputInfo2)

    inputInfoTracker.cleanup(Time(0))
    assert(inputInfoTracker.getInfo(Time(0))(streamId1) === inputInfo1)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)

    inputInfoTracker.cleanup(Time(1))
    assert(inputInfoTracker.getInfo(Time(0)).get(streamId1) === None)
    assert(inputInfoTracker.getInfo(Time(1))(streamId1) === inputInfo2)
  }
}
Example 124
Source File: 7_RecoverableNetworkWordCount.scala From wow-spark with MIT License | 5 votes |
package com.sev7e0.wow.spark_streaming

import org.apache.spark.broadcast.Broadcast
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel
import org.apache.spark.streaming.{Seconds, StreamingContext, Time}
import org.apache.spark.util.LongAccumulator
import org.apache.spark.{SparkConf, SparkContext}

object RecoverableNetworkWordCount {

  def main(args: Array[String]): Unit = {
    StreamingLogger.setLoggerLevel()

    val conf = new SparkConf().setMaster("local").setAppName(RecoverableNetworkWordCount.getClass.getName)
    val context = new StreamingContext(conf, Seconds(1))

    val linesDS = context.socketTextStream("localhost", 9999, StorageLevel.MEMORY_AND_DISK_2)
    val wordsCounts = linesDS.flatMap(_.split(" ")).map(word => (word, 1)).reduceByKey(_ + _)

    wordsCounts.foreachRDD((rdd: RDD[(String, Int)], time: Time) => {
      val blackList = WordBlackList.getInstance(context.sparkContext)
      val accumulator = DropWordCounter.getInstance(context.sparkContext)
      val str = rdd.filter { case (word, count) =>
        if (blackList.value.contains(word)) {
          accumulator.add(count)
          false
        } else {
          true
        }
      }.collect().mkString("[", ", ", "]")
      println(s"str = $str")
    })

    // Start the streaming computation and block until it terminates,
    // so the foreachRDD output above actually runs on each batch.
    context.start()
    context.awaitTermination()
  }
}

object WordBlackList {

  @volatile private var instance: Broadcast[Seq[String]] = _

  def getInstance(context: SparkContext): Broadcast[Seq[String]] = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          val blackList = Seq("a", "b", "c")
          instance = context.broadcast(blackList)
        }
      }
    }
    instance
  }
}

object DropWordCounter {

  @volatile private var instance: LongAccumulator = _

  def getInstance(context: SparkContext): LongAccumulator = {
    if (instance == null) {
      synchronized {
        if (instance == null) {
          instance = context.longAccumulator("WordCount")
        }
      }
    }
    instance
  }
}
Example 125
Source File: DStreamMatcher.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.cep.spark

import org.apache.spark.rdd.RDD
import org.apache.spark.streaming.dstream._
import org.apache.spark.streaming.Time

import scala.reflect.ClassTag

import dbis.piglet.cep.ops.SelectionStrategy._
import dbis.piglet.cep.ops.OutputStrategy._
import dbis.piglet.cep.nfa.NFAController
import dbis.piglet.cep.engines._
import dbis.piglet.backends.{SchemaClass => Event}
import dbis.piglet.cep.ops.MatchCollector
import dbis.piglet.cep.ops.SelectionStrategy

class DStreamMatcher[T <: Event: ClassTag](
    parent: DStream[T],
    nfa: NFAController[T],
    sstr: SelectionStrategy = SelectionStrategy.FirstMatch,
    out: OutputStrategy = Combined) extends DStream[T](parent.context) {

  val collector: MatchCollector[T] = new MatchCollector()
  val engine: CEPEngine[T] = sstr match {
    case SelectionStrategy.FirstMatch        => new FirstMatch(nfa, collector)
    case SelectionStrategy.AllMatches        => new AnyMatch(nfa, collector)
    case SelectionStrategy.NextMatches       => new NextMatch(nfa, collector)
    case SelectionStrategy.ContiguityMatches => new ContiguityMatch(nfa, collector)
    case _ => throw new Exception("The Strategy is not supported")
  }

  override def compute(validTime: Time): Option[RDD[T]] = {
    println("The matcher receives an event")
    parent.compute(validTime) match {
      case Some(rdd) => rdd.foreach(event => engine.runEngine(event))
      case None => null
    }
    parent.compute(validTime)
    // val data = Array(new SalesRecord("1","2","3",4))
    // val x = Some(parent.context.sparkContext.parallelize(data))
    // println(x.get.collect().toList)
    // x
  }
}