org.apache.spark.scheduler.SparkListener Scala Examples
The following examples show how to use org.apache.spark.scheduler.SparkListener.
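Before the project-specific examples, here is a minimal sketch of the pattern they all share: subclass SparkListener, override only the callbacks you need, and register the listener on the SparkContext. This is not taken from any of the projects below; the class name, app name, and counter are illustrative only.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart}

// A minimal listener that logs job boundaries; names and the counter are illustrative.
class JobLoggingListener extends SparkListener {
  @volatile private var jobsSeen = 0

  override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
    jobsSeen += 1
    println(s"Job ${jobStart.jobId} started with ${jobStart.stageInfos.size} stage(s)")
  }

  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    println(s"Job ${jobEnd.jobId} ended with result ${jobEnd.jobResult}; seen $jobsSeen job(s) so far")
  }
}

object JobLoggingExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("listener-demo").setMaster("local[2]"))
    sc.addSparkListener(new JobLoggingListener)  // register before running jobs
    sc.parallelize(1 to 100, 4).count()          // trigger a job so the callbacks fire
    sc.stop()
  }
}

Listeners can also be wired in without code changes through the spark.extraListeners configuration, as several of the test suites below do.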
Example 1
Source File: CleanupUtil.scala From hazelcast-spark with Apache License 2.0
package com.hazelcast.spark.connector.util

import com.hazelcast.spark.connector.util.ConnectionUtil.closeAll
import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart}

object CleanupUtil {

  val jobIds: collection.mutable.Map[Int, Seq[Int]] = collection.mutable.Map[Int, Seq[Int]]()
  val cleanupJobRddName: String = "HazelcastResourceCleanupJob"

  def addCleanupListener(sc: SparkContext): Unit = {
    sc.addSparkListener(new SparkListener {
      override def onJobStart(jobStart: SparkListenerJobStart): Unit = {
        this.synchronized {
          jobStart.stageInfos.foreach(info => {
            info.rddInfos.foreach(rdd => {
              if (!cleanupJobRddName.equals(rdd.name)) {
                val ids: Seq[Int] = info.rddInfos.map(_.id)
                val maybeIds: Option[Seq[Int]] = jobIds.get(jobStart.jobId)
                if (maybeIds.isDefined) {
                  jobIds.put(jobStart.jobId, ids ++ maybeIds.get)
                } else {
                  jobIds.put(jobStart.jobId, ids)
                }
              }
            })
          })
        }
      }

      override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
        this.synchronized {
          if (jobIds.contains(jobEnd.jobId)) {
            try {
              val workers = sc.getConf.getInt("spark.executor.instances", sc.getExecutorStorageStatus.length)
              val rddId: Option[Seq[Int]] = jobIds.get(jobEnd.jobId)
              if (rddId.isDefined) {
                sc.parallelize(1 to workers, workers).setName(cleanupJobRddName).foreachPartition(it ⇒ closeAll(rddId.get))
              }
              jobIds -= jobEnd.jobId
            } catch {
              case e: Exception =>
            }
          }
        }
      }
    })
  }
}
Example 2
Source File: OapListener.scala From OAP with Apache License 2.0
package org.apache.spark.sql.oap.listener

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.oap.OapRuntime

@DeveloperApi
case class SparkListenerCustomInfoUpdate(
    hostName: String,
    executorId: String,
    clazzName: String,
    customizedInfo: String) extends SparkListenerEvent {
  override def logEvent: Boolean = false
}

class OapListener extends SparkListener {
  override def onOtherEvent(event: SparkListenerEvent): Unit = event match {
    case customInfo: SparkListenerCustomInfoUpdate =>
      if (customInfo.clazzName.contains("OapFiberCacheHeartBeatMessager")) {
        OapRuntime.getOrCreate.fiberSensor.updateLocations(customInfo)
      } else if (customInfo.clazzName.contains("FiberCacheManagerMessager")) {
        OapRuntime.getOrCreate.fiberSensor.updateMetrics(customInfo)
      }
    case _ =>
  }
}
Example 3
Source File: ProfilerListener.scala From carbondata with Apache License 2.0
package org.apache.spark.sql.profiler

import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent, SparkListenerTaskEnd, SparkListenerTaskGettingResult, SparkListenerTaskStart}
import org.apache.spark.sql.execution.ui.{SparkListenerSQLExecutionEnd, SparkListenerSQLExecutionStart}

private[profiler] class ProfilerListener extends SparkListener {

  override def onOtherEvent(event: SparkListenerEvent): Unit = {
    Profiler.invokeIfEnable {
      event match {
        case executionStart: SparkListenerSQLExecutionStart =>
          Profiler.addExecutionMessage(
            executionStart.executionId,
            ExecutionStart(
              executionStart.executionId,
              executionStart.time,
              executionStart.physicalPlanDescription
            ))
        case executionEnd: SparkListenerSQLExecutionEnd =>
          Profiler.send(
            ExecutionEnd(
              executionEnd.executionId,
              executionEnd.time
            )
          )
        case _ =>
      }
    }
  }
}
Example 4
Source File: OutputMetricsTest.scala From memsql-spark-connector with Apache License 2.0
package com.memsql.spark

import com.github.mrpowers.spark.daria.sql.SparkSessionExt._
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
import org.apache.spark.sql.types.{IntegerType, StringType}

class OutputMetricsTest extends IntegrationSuiteBase {
  it("records written") {
    var outputWritten = 0L
    spark.sparkContext.addSparkListener(new SparkListener() {
      override def onTaskEnd(taskEnd: SparkListenerTaskEnd) {
        val metrics = taskEnd.taskMetrics
        outputWritten += metrics.outputMetrics.recordsWritten
      }
    })

    val numRows = 100000
    val df1 = spark.createDF(
      List.range(0, numRows),
      List(("id", IntegerType, true))
    )
    df1.repartition(30)
    df1.write
      .format("memsql")
      .save("metricsInts")
    assert(outputWritten == numRows)

    outputWritten = 0
    val df2 = spark.createDF(
      List("st1", "", null),
      List(("st", StringType, true))
    )
    df2.write
      .format("memsql")
      .save("metricsStrings")
    assert(outputWritten == 3)
  }
}
Example 5
Source File: JobProcessor.scala From sparkplug with MIT License
package springnz.sparkplug.executor

import akka.actor._
import org.apache.spark.SparkContext
import org.apache.spark.scheduler.SparkListener
import springnz.sparkplug.core.SparkPlugin
import springnz.sparkplug.executor.InternalMessageTypes.RoutedRequest
import springnz.sparkplug.executor.MessageTypes.{ JobFailure, JobRequest, JobSuccess }
import springnz.sparkplug.util.Logging

import scala.concurrent.Future
import scala.util._

object JobProcessor {
  // Uses its own dispatcher for bulkheading
  def props(implicit sparkContext: SparkContext): Props =
    Props(new JobProcessor).withDispatcher("akka.sparkplug-job-dispatcher")
}

class JobProcessor(implicit sparkContext: SparkContext) extends Actor with Logging {

  def executeJob(job: JobRequest, originator: ActorRef) = {
    val factoryName = job.factoryClassName
    implicit val ec = context.dispatcher
    val f: Future[Any] = Future {
      log.info(s"Loading and instantiating job '$factoryName'.")
      val factoryAny = Class.forName(factoryName).newInstance()
      val operation = factoryAny.asInstanceOf[SparkPlugin].apply(job.data)
      log.info(s"Executing job '$factoryName'.")
      val result = operation.run(sparkContext)
      log.info(s"Job '$factoryName' finished.")

      // TODO: do something about the SparkListener (not sure what)
      val listener = new SparkListener {}
      sparkContext.addSparkListener(listener)
      result
    }
    f.onComplete {
      case Success(result) ⇒
        log.info(s"Successfully processed job $job. Notifying client.")
        log.debug(s"Job result: $result")
        originator ! JobSuccess(job, result)
      case Failure(reason) ⇒
        log.error(s"Job processing of job $job failed. Notifying client", reason)
        originator ! JobFailure(job, reason)
    }
  }

  def receive = {
    case RoutedRequest(job, originator: ActorRef) ⇒
      log.info(s"JobProcessor ${self.path.toString} routed request to process job ${job}")
      executeJob(job, originator)
  }
}
Example 6
Source File: PulsarContinuousTest.scala From pulsar-spark with Apache License 2.0
package org.apache.spark.sql.pulsar

import java.util.concurrent.atomic.AtomicInteger

import scala.language.reflectiveCalls

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

trait PulsarContinuousTest extends PulsarSourceTest {

  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
}
Example 7
Source File: QueryGuardListener.scala From gimel with Apache License 2.0
package com.paypal.gimel.common.query.guard

import java.util.concurrent.atomic.AtomicBoolean

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd, SparkListenerJobStart, SparkListenerStageCompleted}
import org.apache.spark.sql.SparkSession
import org.joda.time.{DateTime, Instant}

import com.paypal.gimel.common.conf.{QueryGuardConfigs, QueryGuardConstants}
import com.paypal.gimel.common.utilities.GenericUtils
import com.paypal.gimel.logger.Logger

class QueryGuardListener[E >: QueryGuardDelayedEvent](spark: SparkSession,
                                                      discoveryType: String = "job")
    extends SparkListener
    with Producer[E] {
  private val logger = new Logger(this.getClass.getName)
  private val stopped = new AtomicBoolean(true)
  private val HEADER: String = "[DISCOVERY] "
  private var _consumers: Seq[Consumer[E]] = Seq.empty

  override def onJobStart(jobStart: SparkListenerJobStart) {
    logger.info(
      s"${HEADER}Job[${jobStart.jobId}] started with ${jobStart.stageInfos.size} stages @ ${Instant.now()}"
    )
    if (!stopped.get) {
      val job = JobSubmitted(
        jobStart.jobId,
        discoveryType,
        System.currentTimeMillis(),
        jobTtl = GenericUtils.getValue(
          spark.conf,
          QueryGuardConfigs.JOB_TTL,
          QueryGuardConstants.DEFAULT_JOB_TTL
        ),
        delayTtl = GenericUtils.getValue(
          spark.conf,
          QueryGuardConfigs.DELAY_TTL,
          QueryGuardConstants.DEFAULT_DELAY_TTL
        )
      )
      logger.info(
        s"${HEADER}Proceeding to queue in Job[${jobStart.jobId}] onto QueryGuard"
      )
      publish(job)
    } else {
      logger.info(
        s"${HEADER}As QueryGuardListener is ${stopped.get()}," +
          s" unable to queue in Job[${jobStart.jobId}]"
      )
    }
  }

  override def publish(queryGuardEvent: E): Unit = {
    for (consumer <- _consumers) {
      consumer.consume(queryGuardEvent)
    }
  }

  override def onStageCompleted(
      stageCompleted: SparkListenerStageCompleted
  ): Unit = {
    logger.info(
      s"Stage ${stageCompleted.stageInfo.stageId} completed with ${stageCompleted.stageInfo.numTasks} tasks."
    )
  }

  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit = {
    logger.info(
      s"Job[${jobEnd.jobId}] completed at ${new DateTime(jobEnd.time)}" +
        s" with result -> ${jobEnd.jobResult}"
    )
    super.onJobEnd(jobEnd)
  }

  override def registerConsumers(consumers: Seq[Consumer[E]]): Unit = {
    _consumers = consumers
  }

  def start(): Unit = {
    // toggle stopped to false so that incoming jobs are tracked
    stopped.set(false)
    logger.info(s"${HEADER}Started QueryGuardListener: $stopped")
  }

  def stop(): Unit = {
    // toggle stopped to true
    stopped.compareAndSet(false, true)
    logger.info(s"${HEADER}Stopped QueryGuardListener: $stopped")
  }
}
Example 8
Source File: package.scala From infinispan-spark with Apache License 2.0
package org.infinispan.spark

import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd}
import org.apache.spark.streaming.dstream.DStream
import org.infinispan.client.hotrod.RemoteCacheManager
import org.infinispan.spark.config.ConnectorConfiguration
import org.infinispan.spark.rdd.RemoteCacheManagerBuilder

package object stream {

  implicit class InfinispanDStream[K, V](stream: DStream[(K, V)]) {

    private def getCacheManager(configuration: ConnectorConfiguration): RemoteCacheManager = {
      val rcm = RemoteCacheManagerBuilder.create(configuration)
      stream.context.sparkContext.addSparkListener(new SparkListener {
        override def onJobEnd(jobEnd: SparkListenerJobEnd) = rcm.stop()
      })
      rcm
    }

    def writeToInfinispan(configuration: ConnectorConfiguration) = {
      val rcm = getCacheManager(configuration)
      val cache = getCache(configuration, rcm)
      val topologyConfig = getCacheTopology(cache.getCacheTopologyInfo)
      configuration.setServerList(topologyConfig)
      stream.foreachRDD(_.writeToInfinispan(configuration))
    }
  }
}
Example 9
Source File: LogUrlsStandaloneSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.deploy

import java.net.URL

import scala.collection.mutable
import scala.io.Source

import org.apache.spark.scheduler.cluster.ExecutorInfo
import org.apache.spark.scheduler.{SparkListenerExecutorAdded, SparkListener}
import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.util.SparkConfWithEnv

class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {

  private val WAIT_TIMEOUT_MILLIS = 10000

  test("verify that correct log urls get propagated from workers") {
    sc = new SparkContext("local-cluster[2,1,1024]", "test")

    val listener = new SaveExecutorInfo
    sc.addSparkListener(listener)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      // Browse to each URL to check that it's valid
      info.logUrlMap.foreach { case (logType, logUrl) =>
        val html = Source.fromURL(logUrl).mkString
        assert(html.contains(s"$logType log page"))
      }
    }
  }

  test("verify that log urls reflect SPARK_PUBLIC_DNS (SPARK-6175)") {
    val SPARK_PUBLIC_DNS = "public_dns"
    val conf = new SparkConfWithEnv(Map("SPARK_PUBLIC_DNS" -> SPARK_PUBLIC_DNS)).set(
      "spark.extraListeners", classOf[SaveExecutorInfo].getName)
    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
    val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo]
    assert(listeners.size === 1)
    val listener = listeners(0)
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      info.logUrlMap.values.foreach { logUrl =>
        assert(new URL(logUrl).getHost === SPARK_PUBLIC_DNS)
      }
    }
  }
}

private[spark] class SaveExecutorInfo extends SparkListener {
  val addedExecutorInfos = mutable.Map[String, ExecutorInfo]()

  override def onExecutorAdded(executor: SparkListenerExecutorAdded) {
    addedExecutorInfos(executor.executorId) = executor.executorInfo
  }
}
Example 10
Source File: KinesisContinuousTest.scala From kinesis-sql with Apache License 2.0
package org.apache.spark.sql.kinesis

import java.util.concurrent.atomic.AtomicInteger

import org.scalatest.time.SpanSugar._

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

trait KinesisContinuousTest extends KinesisSourceTest {
  override val defaultTrigger = Trigger.Continuous("1 hour")
  override val defaultUseV2Sink = true

  override val streamingTimeout = 120.seconds

  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }
}
Example 11
Source File: LogUrlsStandaloneSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.deploy

import java.net.URL

import scala.collection.mutable
import scala.io.Source

import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.{SparkListener, SparkListenerExecutorAdded}
import org.apache.spark.scheduler.cluster.ExecutorInfo
import org.apache.spark.util.SparkConfWithEnv

class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {

  private val WAIT_TIMEOUT_MILLIS = 10000

  test("verify that correct log urls get propagated from workers") {
    sc = new SparkContext("local-cluster[2,1,1024]", "test")

    val listener = new SaveExecutorInfo
    sc.addSparkListener(listener)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      // Browse to each URL to check that it's valid
      info.logUrlMap.foreach { case (logType, logUrl) =>
        val html = Source.fromURL(logUrl).mkString
        assert(html.contains(s"$logType log page"))
      }
    }
  }

  test("verify that log urls reflect SPARK_PUBLIC_DNS (SPARK-6175)") {
    val SPARK_PUBLIC_DNS = "public_dns"
    val conf = new SparkConfWithEnv(Map("SPARK_PUBLIC_DNS" -> SPARK_PUBLIC_DNS)).set(
      "spark.extraListeners", classOf[SaveExecutorInfo].getName)
    sc = new SparkContext("local-cluster[2,1,1024]", "test", conf)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
    val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo]
    assert(listeners.size === 1)
    val listener = listeners(0)
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      info.logUrlMap.values.foreach { logUrl =>
        assert(new URL(logUrl).getHost === SPARK_PUBLIC_DNS)
      }
    }
  }
}

private[spark] class SaveExecutorInfo extends SparkListener {
  val addedExecutorInfos = mutable.Map[String, ExecutorInfo]()

  override def onExecutorAdded(executor: SparkListenerExecutorAdded) {
    addedExecutorInfos(executor.executorId) = executor.executorInfo
  }
}
Example 12
Source File: SQLHistoryServerPlugin.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.ui

import org.apache.spark.SparkConf
import org.apache.spark.scheduler.SparkListener
import org.apache.spark.status.{AppHistoryServerPlugin, ElementTrackingStore}
import org.apache.spark.ui.SparkUI

class SQLHistoryServerPlugin extends AppHistoryServerPlugin {
  override def createListeners(conf: SparkConf, store: ElementTrackingStore): Seq[SparkListener] = {
    Seq(new SQLAppStatusListener(conf, store, live = false))
  }

  override def setupUI(ui: SparkUI): Unit = {
    val sqlStatusStore = new SQLAppStatusStore(ui.store.store)
    if (sqlStatusStore.executionsCount() > 0) {
      new SQLTab(sqlStatusStore, ui)
    }
  }
}
Example 13
Source File: KafkaContinuousTest.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.kafka010

import java.util.concurrent.atomic.AtomicInteger

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd, SparkListenerTaskStart}
import org.apache.spark.sql.execution.datasources.v2.DataSourceV2Relation
import org.apache.spark.sql.execution.streaming.StreamExecution
import org.apache.spark.sql.execution.streaming.continuous.ContinuousExecution
import org.apache.spark.sql.streaming.Trigger
import org.apache.spark.sql.test.TestSparkSession

// Trait to configure StreamTest for kafka continuous execution tests.
trait KafkaContinuousTest extends KafkaSourceTest {
  override val defaultTrigger = Trigger.Continuous(1000)
  override val defaultUseV2Sink = true

  // We need more than the default local[2] to be able to schedule all partitions simultaneously.
  override protected def createSparkSession = new TestSparkSession(
    new SparkContext(
      "local[10]",
      "continuous-stream-test-sql-context",
      sparkConf.set("spark.sql.testkey", "true")))

  // In addition to setting the partitions in Kafka, we have to wait until the query has
  // reconfigured to the new count so the test framework can hook in properly.
  override protected def setTopicPartitions(
      topic: String, newCount: Int, query: StreamExecution) = {
    testUtils.addPartitions(topic, newCount)
    eventually(timeout(streamingTimeout)) {
      assert(
        query.lastExecution.logical.collectFirst {
          case DataSourceV2Relation(_, r: KafkaContinuousReader) => r
        }.exists(_.knownPartitions.size == newCount),
        s"query never reconfigured to $newCount partitions")
    }
  }

  // Continuous processing tasks end asynchronously, so test that they actually end.
  private val tasksEndedListener = new SparkListener() {
    val activeTaskIdCount = new AtomicInteger(0)

    override def onTaskStart(start: SparkListenerTaskStart): Unit = {
      activeTaskIdCount.incrementAndGet()
    }

    override def onTaskEnd(end: SparkListenerTaskEnd): Unit = {
      activeTaskIdCount.decrementAndGet()
    }
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    spark.sparkContext.addSparkListener(tasksEndedListener)
  }

  override def afterEach(): Unit = {
    eventually(timeout(streamingTimeout)) {
      assert(tasksEndedListener.activeTaskIdCount.get() == 0)
    }
    spark.sparkContext.removeSparkListener(tasksEndedListener)
    super.afterEach()
  }

  test("ensure continuous stream is being used") {
    val query = spark.readStream
      .format("rate")
      .option("numPartitions", "1")
      .option("rowsPerSecond", "1")
      .load()

    testStream(query)(
      Execute(q => assert(q.isInstanceOf[ContinuousExecution]))
    )
  }
}
Example 14
Source File: LogUrlsStandaloneSuite.scala From iolap with Apache License 2.0
package org.apache.spark.deploy

import java.net.URL

import scala.collection.JavaConversions._
import scala.collection.mutable
import scala.io.Source

import org.apache.spark.scheduler.cluster.ExecutorInfo
import org.apache.spark.scheduler.{SparkListenerExecutorAdded, SparkListener}
import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}

class LogUrlsStandaloneSuite extends SparkFunSuite with LocalSparkContext {

  private val WAIT_TIMEOUT_MILLIS = 10000

  test("verify that correct log urls get propagated from workers") {
    sc = new SparkContext("local-cluster[2,1,512]", "test")

    val listener = new SaveExecutorInfo
    sc.addSparkListener(listener)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      // Browse to each URL to check that it's valid
      info.logUrlMap.foreach { case (logType, logUrl) =>
        val html = Source.fromURL(logUrl).mkString
        assert(html.contains(s"$logType log page"))
      }
    }
  }

  test("verify that log urls reflect SPARK_PUBLIC_DNS (SPARK-6175)") {
    val SPARK_PUBLIC_DNS = "public_dns"

    class MySparkConf extends SparkConf(false) {
      override def getenv(name: String): String = {
        if (name == "SPARK_PUBLIC_DNS") SPARK_PUBLIC_DNS
        else super.getenv(name)
      }

      override def clone: SparkConf = {
        new MySparkConf().setAll(getAll)
      }
    }
    val conf = new MySparkConf().set(
      "spark.extraListeners", classOf[SaveExecutorInfo].getName)
    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS)
    val listeners = sc.listenerBus.findListenersByClass[SaveExecutorInfo]
    assert(listeners.size === 1)
    val listener = listeners(0)
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      info.logUrlMap.values.foreach { logUrl =>
        assert(new URL(logUrl).getHost === SPARK_PUBLIC_DNS)
      }
    }
  }
}

private[spark] class SaveExecutorInfo extends SparkListener {
  val addedExecutorInfos = mutable.Map[String, ExecutorInfo]()

  override def onExecutorAdded(executor: SparkListenerExecutorAdded) {
    addedExecutorInfos(executor.executorId) = executor.executorInfo
  }
}
Example 15
Source File: LogUrlsStandaloneSuite.scala From multi-tenancy-spark with Apache License 2.0
The source is identical to the LogUrlsStandaloneSuite listing in Example 11 above; only the originating project (multi-tenancy-spark) differs, so the code is not repeated here.
Example 16
Source File: MergeIntoAccumulatorSuite.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta

import java.util.concurrent.atomic.AtomicReference

import scala.collection.JavaConverters._

import org.apache.spark.sql.delta.commands.MergeIntoCommand
import org.apache.spark.sql.delta.test.DeltaSQLCommandTest

import org.apache.spark.scheduler.{SparkListener, SparkListenerTaskEnd}
import org.apache.spark.sql.QueryTest
import org.apache.spark.sql.test.SharedSparkSession
import org.apache.spark.status.TaskDataWrapper
import org.apache.spark.util.JsonProtocol

class MergeIntoAccumulatorSuite
  extends QueryTest
  with SharedSparkSession
  with DeltaSQLCommandTest {

  import testImplicits._

  private def runTestMergeCommand(): Unit = {
    // Run a simple merge command
    withTempView("source") {
      withTempDir { tempDir =>
        val tempPath = tempDir.getCanonicalPath
        Seq((1, 1), (0, 3)).toDF("key", "value").createOrReplaceTempView("source")
        Seq((2, 2), (1, 4)).toDF("key", "value").write.format("delta").save(tempPath)
        spark.sql(s"""
          |MERGE INTO delta.`$tempPath` target
          |USING source src
          |ON src.key = target.key
          |WHEN MATCHED THEN UPDATE SET *
          |WHEN NOT MATCHED THEN INSERT *
          |""".stripMargin)
      }
    }
  }

  test("accumulators used by MERGE should not be tracked by Spark UI") {
    runTestMergeCommand()

    // Make sure all Spark events generated by the above command have been processed
    spark.sparkContext.listenerBus.waitUntilEmpty(30000)

    val store = spark.sparkContext.statusStore.store
    val iter = store.view(classOf[TaskDataWrapper]).closeableIterator()
    try {
      // Collect all accumulator names tracked by Spark UI.
      val accumNames = iter.asScala.toVector.flatMap { task =>
        task.accumulatorUpdates.map(_.name)
      }.toSet
      // Verify accumulators used by MergeIntoCommand are not tracked.
      assert(!accumNames.contains(MergeIntoCommand.TOUCHED_FILES_ACCUM_NAME))
    } finally {
      iter.close()
    }
  }
}
Example 17
Source File: SparkSqlAdapter.scala From carbondata with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd}
import org.apache.spark.sql.catalyst.TableIdentifier
import org.apache.spark.sql.catalyst.expressions.{Attribute, Expression}
import org.apache.spark.sql.execution.FileSourceScanExec
import org.apache.spark.sql.execution.datasources.HadoopFsRelation
import org.apache.spark.sql.types.StructType

import org.apache.carbondata.core.util.ThreadLocalSessionInfo

object SparkSqlAdapter {

  def initSparkSQL(): Unit = {
  }

  def getScanForSegments(
      @transient relation: HadoopFsRelation,
      output: Seq[Attribute],
      outputSchema: StructType,
      partitionFilters: Seq[Expression],
      dataFilters: Seq[Expression],
      tableIdentifier: Option[TableIdentifier]
  ): FileSourceScanExec = {
    FileSourceScanExec(
      relation,
      output,
      outputSchema,
      partitionFilters,
      dataFilters,
      tableIdentifier)
  }

  def addSparkSessionListener(sparkSession: SparkSession): Unit = {
    sparkSession.sparkContext.addSparkListener(new SparkListener {
      override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
        CarbonEnv.carbonEnvMap.remove(sparkSession)
        ThreadLocalSessionInfo.unsetAll()
      }
    })
  }
}
Example 18
Source File: MVManagerInSpark.scala From carbondata with Apache License 2.0
package org.apache.carbondata.view

import java.util

import org.apache.spark.scheduler.{SparkListener, SparkListenerApplicationEnd}
import org.apache.spark.sql.{CarbonEnv, CarbonUtils, SparkSession}

import org.apache.carbondata.core.constants.CarbonCommonConstants
import org.apache.carbondata.core.util.ThreadLocalSessionInfo
import org.apache.carbondata.core.view.MVManager

class MVManagerInSpark(session: SparkSession) extends MVManager {
  override def getDatabases: util.List[String] = {
    CarbonUtils.threadSet(CarbonCommonConstants.DISABLE_SQL_REWRITE, "true")
    try {
      val databaseList = session.catalog.listDatabases()
      val databaseNameList = new util.ArrayList[String]()
      for (database <- databaseList.collect()) {
        databaseNameList.add(database.name)
      }
      databaseNameList
    } finally {
      CarbonUtils.threadUnset(CarbonCommonConstants.DISABLE_SQL_REWRITE)
    }
  }

  override def getDatabaseLocation(databaseName: String): String = {
    CarbonEnv.getDatabaseLocation(databaseName, session)
  }
}

object MVManagerInSpark {

  private val MANAGER_MAP_BY_SESSION =
    new util.HashMap[SparkSession, MVManagerInSpark]()

  def get(session: SparkSession): MVManagerInSpark = {
    var viewManager = MANAGER_MAP_BY_SESSION.get(session)
    if (viewManager == null) {
      MANAGER_MAP_BY_SESSION.synchronized {
        viewManager = MANAGER_MAP_BY_SESSION.get(session)
        if (viewManager == null) {
          viewManager = new MVManagerInSpark(session)
          MANAGER_MAP_BY_SESSION.put(session, viewManager)
          session.sparkContext.addSparkListener(new SparkListener {
            override def onApplicationEnd(applicationEnd: SparkListenerApplicationEnd): Unit = {
              CarbonEnv.carbonEnvMap.remove(session)
              ThreadLocalSessionInfo.unsetAll()
            }
          })
        }
      }
    }
    viewManager
  }
}
Example 19
Source File: LogUrlsStandaloneSuite.scala From SparkCore with Apache License 2.0
package org.apache.spark.deploy

import java.net.URL

import scala.collection.mutable
import scala.io.Source

import org.scalatest.FunSuite

import org.apache.spark.scheduler.cluster.ExecutorInfo
import org.apache.spark.scheduler.{SparkListenerExecutorAdded, SparkListener}
import org.apache.spark.{SparkConf, SparkContext, LocalSparkContext}

class LogUrlsStandaloneSuite extends FunSuite with LocalSparkContext {

  private val WAIT_TIMEOUT_MILLIS = 10000

  test("verify that correct log urls get propagated from workers") {
    sc = new SparkContext("local-cluster[2,1,512]", "test")

    val listener = new SaveExecutorInfo
    sc.addSparkListener(listener)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      // Browse to each URL to check that it's valid
      info.logUrlMap.foreach { case (logType, logUrl) =>
        val html = Source.fromURL(logUrl).mkString
        assert(html.contains(s"$logType log page"))
      }
    }
  }

  test("verify that log urls reflect SPARK_PUBLIC_DNS (SPARK-6175)") {
    val SPARK_PUBLIC_DNS = "public_dns"

    class MySparkConf extends SparkConf(false) {
      override def getenv(name: String) = {
        if (name == "SPARK_PUBLIC_DNS") SPARK_PUBLIC_DNS
        else super.getenv(name)
      }

      override def clone: SparkConf = {
        new MySparkConf().setAll(getAll)
      }
    }
    val conf = new MySparkConf()
    sc = new SparkContext("local-cluster[2,1,512]", "test", conf)

    val listener = new SaveExecutorInfo
    sc.addSparkListener(listener)

    // Trigger a job so that executors get added
    sc.parallelize(1 to 100, 4).map(_.toString).count()

    assert(sc.listenerBus.waitUntilEmpty(WAIT_TIMEOUT_MILLIS))
    listener.addedExecutorInfos.values.foreach { info =>
      assert(info.logUrlMap.nonEmpty)
      info.logUrlMap.values.foreach { logUrl =>
        assert(new URL(logUrl).getHost === SPARK_PUBLIC_DNS)
      }
    }
  }

  private class SaveExecutorInfo extends SparkListener {
    val addedExecutorInfos = mutable.Map[String, ExecutorInfo]()

    override def onExecutorAdded(executor: SparkListenerExecutorAdded) {
      addedExecutorInfos(executor.executorId) = executor.executorInfo
    }
  }
}
Example 20
Source File: LogUrlsStandaloneSuite.scala From sparkoscope with Apache License 2.0
The source is identical to the LogUrlsStandaloneSuite listing in Example 11 above; only the originating project (sparkoscope) differs, so the code is not repeated here.
Example 21
Source File: SparkAtlasEventTracker.scala From spark-atlas-connector with Apache License 2.0
package com.hortonworks.spark.atlas

import com.google.common.annotations.VisibleForTesting
import org.apache.spark.scheduler.{SparkListener, SparkListenerEvent}
import org.apache.spark.sql.catalyst.catalog.ExternalCatalogEvent
import org.apache.spark.sql.execution.QueryExecution
import org.apache.spark.sql.util.QueryExecutionListener

import com.hortonworks.spark.atlas.sql._
import com.hortonworks.spark.atlas.ml.MLPipelineEventProcessor
import com.hortonworks.spark.atlas.utils.Logging

class SparkAtlasEventTracker(atlasClient: AtlasClient, atlasClientConf: AtlasClientConf)
    extends SparkListener with QueryExecutionListener with Logging {

  def this(atlasClientConf: AtlasClientConf) = {
    this(AtlasClient.atlasClient(atlasClientConf), atlasClientConf)
  }

  def this() {
    this(new AtlasClientConf)
  }

  private val enabled: Boolean = AtlasUtils.isSacEnabled(atlasClientConf)

  // Processor to handle DDL related events
  @VisibleForTesting
  private[atlas] val catalogEventTracker =
    new SparkCatalogEventProcessor(atlasClient, atlasClientConf)
  catalogEventTracker.startThread()

  // Processor to handle DML related events
  private val executionPlanTracker =
    new SparkExecutionPlanProcessor(atlasClient, atlasClientConf)
  executionPlanTracker.startThread()

  private val mlEventTracker = new MLPipelineEventProcessor(atlasClient, atlasClientConf)
  mlEventTracker.startThread()

  override def onOtherEvent(event: SparkListenerEvent): Unit = {
    if (!enabled) {
      // No op if SAC is disabled
      return
    }

    // We only care about SQL related events.
    event match {
      case e: ExternalCatalogEvent => catalogEventTracker.pushEvent(e)
      case e: SparkListenerEvent if e.getClass.getName.contains("org.apache.spark.ml") =>
        mlEventTracker.pushEvent(e)
      case _ => // Ignore other events
    }
  }

  override def onSuccess(funcName: String, qe: QueryExecution, durationNs: Long): Unit = {
    if (!enabled) {
      // No op if SAC is disabled
      return
    }

    if (qe.logical.isStreaming) {
      // streaming query will be tracked via SparkAtlasStreamingQueryEventTracker
      return
    }

    val qd = QueryDetail.fromQueryExecutionListener(qe, durationNs)
    executionPlanTracker.pushEvent(qd)
  }

  override def onFailure(funcName: String, qe: QueryExecution, exception: Exception): Unit = {
    // No-op: SAC is one of the listener.
  }
}
Example 22
Source File: SQLHistoryServerPlugin.scala From XSQL with Apache License 2.0
The source is identical to the SQLHistoryServerPlugin listing in Example 12 above; only the originating project (XSQL) differs, so the code is not repeated here.
Example 23
Source File: PDCacheInvalidateListener.scala From tispark with Apache License 2.0
package com.pingcap.tispark.listener

import java.util.logging.Logger

import com.pingcap.tispark.accumulator.CacheInvalidateAccumulator
import com.pingcap.tispark.handler.CacheInvalidateEventHandler
import org.apache.spark.scheduler.{SparkListener, SparkListenerJobEnd}

class PDCacheInvalidateListener(
    accumulator: CacheInvalidateAccumulator,
    handler: CacheInvalidateEventHandler)
    extends SparkListener {
  private final val logger: Logger = Logger.getLogger(getClass.getName)

  override def onJobEnd(jobEnd: SparkListenerJobEnd): Unit =
    if (accumulator != null && !accumulator.isZero && handler != null) {
      synchronized {
        if (!accumulator.isZero) {
          val events = accumulator.value
          logger.info(
            s"Receiving ${events.size} cache invalidation request(s) from job ${jobEnd.jobId} at driver. " +
              s"This indicates that there's exception(s) thrown in executor node when communicating with " +
              s"TiKV, checkout executors' log for more information.")
          events.foreach(handler.handle)
        }
      }
    }
}
Example 24
Source File: LogUrlsStandaloneSuite.scala From drizzle-spark with Apache License 2.0
The source is identical to the LogUrlsStandaloneSuite listing in Example 11 above; only the originating project (drizzle-spark) differs, so the code is not repeated here.
Example 25
Source File: StreamingQueryListenerBus.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerEvent}
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.util.ListenerBus

// The listing omits the class declaration; the header and the inner import below are inferred
// from the constructor parameter and the overridden methods shown in the body.
class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus)
  extends SparkListener with ListenerBus[StreamingQueryListener, StreamingQueryListener.Event] {

  import StreamingQueryListener._

  def post(event: StreamingQueryListener.Event) {
    event match {
      case s: QueryStartedEvent =>
        postToAll(s)
      case _ =>
        sparkListenerBus.post(event)
    }
  }

  override def onOtherEvent(event: SparkListenerEvent): Unit = {
    event match {
      case e: StreamingQueryListener.Event =>
        postToAll(e)
      case _ =>
    }
  }

  override protected def doPostEvent(
      listener: StreamingQueryListener,
      event: StreamingQueryListener.Event): Unit = {
    event match {
      case queryStarted: QueryStartedEvent =>
        listener.onQueryStarted(queryStarted)
      case queryProgress: QueryProgressEvent =>
        listener.onQueryProgress(queryProgress)
      case queryTerminated: QueryTerminatedEvent =>
        listener.onQueryTerminated(queryTerminated)
      case _ =>
    }
  }
}
Example 26
Source File: StreamingQueryListenerBus.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import java.util.UUID

import scala.collection.mutable

import org.apache.spark.scheduler.{LiveListenerBus, SparkListener, SparkListenerEvent}
import org.apache.spark.sql.streaming.StreamingQueryListener
import org.apache.spark.util.ListenerBus

// The listing shows only part of this file: the class header, the inner import, and the
// activeQueryRunIds field are not included and are inferred here from the imports and the
// method body below.
class StreamingQueryListenerBus(sparkListenerBus: LiveListenerBus)
  extends SparkListener with ListenerBus[StreamingQueryListener, StreamingQueryListener.Event] {

  import StreamingQueryListener._

  private val activeQueryRunIds = new mutable.HashSet[UUID]

  override protected def doPostEvent(
      listener: StreamingQueryListener,
      event: StreamingQueryListener.Event): Unit = {
    def shouldReport(runId: UUID): Boolean = {
      activeQueryRunIds.synchronized { activeQueryRunIds.contains(runId) }
    }

    event match {
      case queryStarted: QueryStartedEvent =>
        if (shouldReport(queryStarted.runId)) {
          listener.onQueryStarted(queryStarted)
        }
      case queryProgress: QueryProgressEvent =>
        if (shouldReport(queryProgress.progress.runId)) {
          listener.onQueryProgress(queryProgress)
        }
      case queryTerminated: QueryTerminatedEvent =>
        if (shouldReport(queryTerminated.runId)) {
          listener.onQueryTerminated(queryTerminated)
        }
      case _ =>
    }
  }
}

object StreamingQueryListenerBus {
  val STREAM_EVENT_QUERY = "streams"
}