org.apache.spark.util.AccumulatorV2 Scala Examples
The following examples show how to use org.apache.spark.util.AccumulatorV2.
Each example is drawn from an open-source project; the source file, the project it comes from, and its license are noted above the code.
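Before the project-specific examples, here is a minimal, self-contained sketch of the typical AccumulatorV2 lifecycle: subclass it, register the instance with the SparkContext so that task-side copies are merged back to the driver, update it inside an action, and read value on the driver. The names StringSetAccumulator and StringSetAccumulatorDemo are illustrative and do not come from any of the projects below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.AccumulatorV2

// Illustrative accumulator: collects the distinct strings seen on executors.
class StringSetAccumulator extends AccumulatorV2[String, Set[String]] {
  private var set = Set.empty[String]

  override def isZero: Boolean = set.isEmpty
  override def copy(): StringSetAccumulator = {
    val acc = new StringSetAccumulator
    acc.set = set
    acc
  }
  override def reset(): Unit = set = Set.empty[String]
  override def add(v: String): Unit = set += v
  override def merge(other: AccumulatorV2[String, Set[String]]): Unit = set ++= other.value
  override def value: Set[String] = set
}

object StringSetAccumulatorDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("acc-demo"))

    val seen = new StringSetAccumulator
    // Registering ties the accumulator to the driver so task-side updates are merged back.
    sc.register(seen, "seenWords")

    sc.parallelize(Seq("a", "b", "a", "c")).foreach(seen.add)
    println(seen.value) // Set(a, b, c)

    sc.stop()
  }
}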
Example 1
Source File: ByKeyAdditiveAccumulator.scala From spark-records with Apache License 2.0
package com.swoop.spark.accumulators

import java.util.Collections

import org.apache.spark.util.AccumulatorV2

// Note: the class declaration and the backing `_map` field are elided in the original
// excerpt; they are reconstructed here so that the methods below compile.
class ByKeyAdditiveAccumulator[A, B: Numeric]
  extends AccumulatorV2[(A, B), java.util.Map[A, B]] {

  private val _map = new java.util.HashMap[A, B]()

  // Delaying full synchronization allows merge() to be faster as it uses unsafeAdd()
  override lazy val value: java.util.Map[A, B] = Collections.synchronizedMap(_map)

  override def isZero: Boolean = _map.isEmpty

  override def copyAndReset(): ByKeyAdditiveAccumulator[A, B] = new ByKeyAdditiveAccumulator()

  override def copy(): ByKeyAdditiveAccumulator[A, B] = {
    val newAcc = new ByKeyAdditiveAccumulator[A, B]
    _map.synchronized {
      newAcc._map.putAll(_map)
    }
    newAcc
  }

  override def reset(): Unit = _map.clear()

  override def add(v: (A, B)): Unit =
    _map.synchronized {
      unsafeAdd(v._1, v._2)
    }

  override def merge(other: AccumulatorV2[(A, B), java.util.Map[A, B]]): Unit =
    other match {
      case o: ByKeyAdditiveAccumulator[A, B] =>
        _map.synchronized {
          other.synchronized {
            import scala.collection.JavaConversions._
            o._map.foreach((unsafeAdd _).tupled)
          }
        }
      case _ => throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }

  private def unsafeAdd(k: A, v: B) = {
    val num = implicitly[Numeric[B]]
    val existing = if (_map.containsKey(k)) _map.get(k) else num.zero
    _map.put(k, num.plus(existing, v))
  }
}
Example 2
Source File: MultivariateOnlineSummarizerAccumulator.scala From sparkpipe-core with Apache License 2.0
package software.uncharted.sparkpipe.ops.core.dataframe.numeric.util

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.sql.types.StructType
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

private object MultivariateOnlineSummarizerAccumulator {
  def init(cols: Seq[_]): Seq[MultivariateOnlineSummarizer] = {
    cols.map(col => {
      new MultivariateOnlineSummarizer
    }).toSeq
  }
}

private[numeric] class MultivariateOnlineSummarizerAccumulator(
  private var result: Seq[MultivariateOnlineSummarizer],
  private var touched: Boolean = false
) extends AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] {

  def this(cols: StructType) {
    this(MultivariateOnlineSummarizerAccumulator.init(cols))
  }

  override def add(r: Row): Unit = {
    for (i <- 0 to r.length - 1) {
      if (!r.isNullAt(i)) {
        result(i).add(Vectors.dense(Array[Double](r.getDouble(i))))
        touched = true
      } else {
        // don't add a sample to the summarizer for this column
      }
    }
  }

  override def copy(): AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] = {
    new MultivariateOnlineSummarizerAccumulator(result.map(s => {
      // clone by making a new, empty summarizer and merging our data into it
      val newSummarizer = new MultivariateOnlineSummarizer()
      newSummarizer.merge(s)
      newSummarizer
    }), false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]]): Unit = {
    for (i <- 0 to other.value.length - 1) {
      result(i).merge(other.value(i))
    }
  }

  override def reset(): Unit = {
    result = MultivariateOnlineSummarizerAccumulator.init(result)
    touched = false
  }

  override def value: Seq[MultivariateOnlineSummarizer] = {
    result
  }
}
Example 3
Source File: UniqueTermAccumulator.scala From sparkpipe-core with Apache License 2.0
package software.uncharted.sparkpipe.ops.core.dataframe.text.util

import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.HashMap

private[text] class UniqueTermAccumulator(
  private var result: HashMap[String, Int],
  private var touched: Boolean = false
) extends AccumulatorV2[Seq[String], HashMap[String, Int]] {

  def this() {
    this(new HashMap[String, Int]())
  }

  override def add(in: Seq[String]): Unit = {
    in.foreach(w => {
      result.put(w, result.getOrElse(w, 0) + 1)
    })
  }

  override def copy(): AccumulatorV2[Seq[String], HashMap[String, Int]] = {
    val clone = new HashMap[String, Int]()
    result.foreach(kv => clone.put(kv._1, kv._2))
    new UniqueTermAccumulator(clone, false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Seq[String], HashMap[String, Int]]): Unit = {
    other.value.foreach(t => {
      result.put(t._1, result.getOrElse(t._1, 0) + t._2)
    })
  }

  override def reset(): Unit = {
    result.clear
    touched = false
  }

  override def value: HashMap[String, Int] = {
    result
  }
}
Example 4
Source File: CoverageUpdate.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.coverage

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.ArrayBuffer

case class RightCovEdge(contig: String, minPos: Int, startPoint: Int, cov: Array[Short], cumSum: Short)

case class ContigRange(contig: String, minPos: Int, maxPos: Int)

class CovUpdate(var right: ArrayBuffer[RightCovEdge], var left: ArrayBuffer[ContigRange])
  extends Serializable {

  def reset(): Unit = {
    right = new ArrayBuffer[RightCovEdge]()
    left = new ArrayBuffer[ContigRange]()
  }

  def add(p: CovUpdate): CovUpdate = {
    right = right ++ p.right
    left = left ++ p.left
    this
  }
}

class CoverageAccumulatorV2(var covAcc: CovUpdate)
  extends AccumulatorV2[CovUpdate, CovUpdate] {

  def reset(): Unit = {
    covAcc = new CovUpdate(new ArrayBuffer[RightCovEdge](), new ArrayBuffer[ContigRange]())
  }

  def add(v: CovUpdate): Unit = {
    covAcc.add(v)
  }

  def value(): CovUpdate = {
    covAcc
  }

  def isZero(): Boolean = {
    covAcc.right.isEmpty && covAcc.left.isEmpty
  }

  def copy(): CoverageAccumulatorV2 = {
    new CoverageAccumulatorV2(covAcc)
  }

  def merge(other: AccumulatorV2[CovUpdate, CovUpdate]): Unit = {
    covAcc.add(other.value)
  }
}
Example 5
Source File: ExternalClusterManagerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def schedulingMode: SchedulingMode = SchedulingMode.FIFO
  override def rootPool: Pool = new Pool("", schedulingMode, 0, 0)
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def killTaskAttempt(
      taskId: Long, interruptThread: Boolean, reason: String): Boolean = false
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 6
Source File: DAGSchedulerEvent.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import java.util.Properties

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.{AccumulatorV2, CallSite}

// The base event trait is elided in the original excerpt; it is restated here so the
// case classes below have something to extend.
private[scheduler] sealed trait DAGSchedulerEvent

private[scheduler] case class MapStageSubmitted(
    jobId: Int,
    dependency: ShuffleDependency[_, _, _],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties = null)
  extends DAGSchedulerEvent

private[scheduler] case class StageCancelled(
    stageId: Int,
    reason: Option[String])
  extends DAGSchedulerEvent

private[scheduler] case class JobCancelled(
    jobId: Int,
    reason: Option[String])
  extends DAGSchedulerEvent

private[scheduler] case class JobGroupCancelled(groupId: String) extends DAGSchedulerEvent

private[scheduler] case object AllJobsCancelled extends DAGSchedulerEvent

private[scheduler] case class BeginEvent(task: Task[_], taskInfo: TaskInfo) extends DAGSchedulerEvent

private[scheduler] case class GettingResultEvent(taskInfo: TaskInfo) extends DAGSchedulerEvent

private[scheduler] case class CompletionEvent(
    task: Task[_],
    reason: TaskEndReason,
    result: Any,
    accumUpdates: Seq[AccumulatorV2[_, _]],
    taskInfo: TaskInfo)
  extends DAGSchedulerEvent

private[scheduler] case class ExecutorAdded(execId: String, host: String) extends DAGSchedulerEvent

private[scheduler] case class ExecutorLost(execId: String, reason: ExecutorLossReason)
  extends DAGSchedulerEvent

private[scheduler] case class WorkerRemoved(workerId: String, host: String, message: String)
  extends DAGSchedulerEvent

private[scheduler] case class TaskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable])
  extends DAGSchedulerEvent

private[scheduler] case object ResubmitFailedStages extends DAGSchedulerEvent

private[scheduler] case class SpeculativeTaskSubmitted(task: Task[_]) extends DAGSchedulerEvent
Example 7
Source File: TaskResult.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// The method below is an excerpt from a TaskResult implementation (DirectTaskResult in
// Spark); the rest of that class is elided in the original snippet.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 8
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
}
Example 9
Source File: ExternalClusterManagerSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 10
Source File: TaskResult.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// The method below is an excerpt from a TaskResult implementation (DirectTaskResult in
// Spark); the rest of that class is elided in the original snippet. Note the
// per-user SparkEnv lookup in this multi-tenancy fork.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get(user).serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 11
Source File: EventTimeWatermarkExec.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  override def user: String = child.user

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 12
Source File: SQLMetrics.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.metric

import java.text.NumberFormat
import java.util.Locale

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils}

class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] {
  // This is a workaround for SPARK-11013.
  // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will
  // update it at the end of task and the value will be at least 0. Then we can filter out the -1
  // values before calculate max, min, etc.
  private[this] var _value = initValue
  private var _zeroValue = initValue

  override def copy(): SQLMetric = {
    val newAcc = new SQLMetric(metricType, _value)
    newAcc._zeroValue = initValue
    newAcc
  }

  override def reset(): Unit = _value = _zeroValue

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SQLMetric => _value += o.value
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def isZero(): Boolean = _value == _zeroValue

  override def add(v: Long): Unit = _value += v

  def +=(v: Long): Unit = _value += v

  override def value: Long = _value

  // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later
  override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    new AccumulableInfo(
      id, name, update, value, true, true, Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER))
  }
}

object SQLMetrics {
  private val SUM_METRIC = "sum"
  private val SIZE_METRIC = "size"
  private val TIMING_METRIC = "timing"

  def createMetric(sc: SparkContext, name: String): SQLMetric = {
    val acc = new SQLMetric(SUM_METRIC)
    acc.register(sc, name = Some(name), countFailedValues = false)
    acc
  }

  def stringValue(metricsType: String, values: Seq[Long]): String = {
    if (metricsType == SUM_METRIC) {
      val numberFormat = NumberFormat.getIntegerInstance(Locale.US)
      numberFormat.format(values.sum)
    } else {
      val strFormat: Long => String = if (metricsType == SIZE_METRIC) {
        Utils.bytesToString
      } else if (metricsType == TIMING_METRIC) {
        Utils.msDurationToString
      } else {
        throw new IllegalStateException("unexpected metrics type: " + metricsType)
      }

      val validValues = values.filter(_ >= 0)
      val Seq(sum, min, med, max) = {
        val metric = if (validValues.isEmpty) {
          Seq.fill(4)(0L)
        } else {
          val sorted = validValues.sorted
          Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1))
        }
        metric.map(strFormat)
      }
      s"\n$sum ($min, $med, $max)"
    }
  }
}
Example 13
Source File: DruidQueryExecutionMetric.scala From spark-druid-olap with Apache License 2.0
package org.apache.spark.sql.sparklinedata.execution.metrics

import java.util.{ArrayList, Collections}

import org.apache.spark.util.AccumulatorV2
import org.sparklinedata.druid.metadata.{DruidQueryExecutionView, DruidQueryHistory}

class DruidQueryExecutionMetric extends
  AccumulatorV2[DruidQueryExecutionView, java.util.List[DruidQueryExecutionView]] {

  import scala.collection.JavaConverters._

  private val _list: java.util.List[DruidQueryExecutionView] =
    Collections.synchronizedList(new ArrayList[DruidQueryExecutionView]())

  private def getList: java.util.List[DruidQueryExecutionView] = {
    if (isAtDriverSide) DruidQueryHistory.getHistory.asJava else _list
  }

  override def isZero: Boolean = {
    _list.isEmpty
  }

  override def copy(): DruidQueryExecutionMetric = {
    val newAcc = new DruidQueryExecutionMetric
    newAcc._list.addAll(_list)
    newAcc
  }

  override def reset(): Unit = {
    _list.clear()
  }

  override def add(v: DruidQueryExecutionView): Unit = {
    if (isAtDriverSide) DruidQueryHistory.add(v) else _list.add(v)
  }

  private def addAll(v: java.util.List[DruidQueryExecutionView]): Unit = {
    v.asScala.foreach(add(_))
  }

  override def merge(
      other: AccumulatorV2[DruidQueryExecutionView, java.util.List[DruidQueryExecutionView]]): Unit =
    other match {
      case o: DruidQueryExecutionMetric => {
        addAll(o._list)
      }
      case _ => throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }

  override def value = _list.synchronized {
    java.util.Collections.unmodifiableList(getList)
  }

  private[spark] def setValue(newValue: java.util.List[DruidQueryExecutionView]): Unit = {
    reset()
    addAll(newValue)
  }
}
Example 14
Source File: ArrayAccumulator.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.stats

import java.io.{ObjectInput, ObjectOutput}

import org.apache.spark.util.AccumulatorV2

class ArrayAccumulator(val size: Int) extends AccumulatorV2[(Int, Long), Array[Long]] {

  protected val counts = new Array[Long](size)

  override def isZero: Boolean = counts.forall(_ == 0)

  override def copy(): AccumulatorV2[(Int, Long), Array[Long]] = {
    val newCopy = new ArrayAccumulator(size)
    (0 until size).foreach(i => newCopy.counts(i) = counts(i))
    newCopy
  }

  override def reset(): Unit = (0 until size).foreach(counts(_) = 0)

  override def add(v: (Int, Long)): Unit = {
    if (v._2 == -1 || counts(v._1) == -1) {
      counts(v._1) = -1
    } else {
      counts(v._1) += v._2
    }
  }

  override def merge(o: AccumulatorV2[(Int, Long), Array[Long]]): Unit = {
    val other = o.asInstanceOf[ArrayAccumulator]
    assert(size == other.size)

    (0 until size).foreach(i => {
      if (counts(i) == -1 || other.counts(i) == -1) {
        counts(i) = -1
      } else {
        counts(i) += other.counts(i)
      }
    })
  }

  override def value: Array[Long] = counts
}
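A hypothetical usage sketch for the ArrayAccumulator above, assuming an existing SparkContext named sc: each (index, delta) pair updates one slot, and a delta of -1 marks that slot as unknown.

// Hypothetical usage; `sc` is an existing SparkContext.
val byteCounts = new ArrayAccumulator(3)
sc.register(byteCounts, "perFileByteCounts")

sc.parallelize(Seq((0, 10L), (1, 5L), (0, 7L), (2, -1L))).foreach(byteCounts.add)
// After the action: byteCounts.value sameElements Array(17L, 5L, -1L)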
Example 15
Source File: Accumulators.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.pos.perceptron

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.{ArrayBuffer, Map => MMap}

class TupleKeyLongDoubleMapAccumulator(defaultMap: MMap[(String, String), (Long, Double)] = MMap.empty[(String, String), (Long, Double)])
  extends AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]] {

  val mmap = defaultMap

  override def reset(): Unit = mmap.clear()

  override def add(v: ((String, String), (Long, Double))): Unit = {
    mmap(v._1) = mmap.get(v._1).map { case (v1, v2) => ((v1 + v._2._1) / 2, (v2 + v._2._2) / 2) }.getOrElse(v._2)
  }

  def updateMany(other: MMap[(String, String), (Long, Double)]): Unit = {
    other.foreach { case (k, v) =>
      this.add((k, v))
    }
  }

  override def value: Map[(String, String), (Long, Double)] = mmap.toMap

  override def copy(): AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]] = {
    val m = ArrayBuffer.empty[((String, String), (Long, Double))]
    this.mmap.copyToBuffer(m)
    new TupleKeyLongDoubleMapAccumulator(MMap(m: _*))
  }

  override def isZero: Boolean = mmap.isEmpty

  override def merge(other: AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]]): Unit = {
    other match {
      case o: TupleKeyLongDoubleMapAccumulator =>
        updateMany(o.mmap)
      case _ => throw new Exception("Cannot merge tuple key long")
    }
  }
}

class StringMapStringDoubleAccumulator(defaultMap: MMap[String, MMap[String, Double]] = MMap.empty[String, MMap[String, Double]])
  extends AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]] {

  private val mmap = defaultMap

  override def reset(): Unit = mmap.clear()

  override def add(v: (String, MMap[String, Double])): Unit = {
    v._2.foreach { case (kk, vv) =>
      val loc = mmap.getOrElse(v._1, MMap.empty[String, Double])
      val nv = if (loc.isDefinedAt(kk)) (loc.getOrElse(kk, 0.0) + vv) / 2.0 else vv
      mmap.update(v._1, loc.updated(kk, nv))
    }
  }

  override def value: Map[String, Map[String, Double]] = mmap.mapValues(_.toMap.filterNot(a => a._2 == 0)).toMap

  override def copy(): AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]] = {
    val m = ArrayBuffer.empty[(String, MMap[String, Double])]
    this.mmap.copyToBuffer(m)
    new StringMapStringDoubleAccumulator(MMap(m: _*))
  }

  override def isZero: Boolean = mmap.isEmpty

  def addMany(other: MMap[String, MMap[String, Double]]) = {
    other.foreach { case (k, v) =>
      this.add((k, v))
    }
  }

  override def merge(other: AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]]): Unit = {
    other match {
      case o: StringMapStringDoubleAccumulator =>
        addMany(o.mmap)
      case _ => throw new Exception("Wrong StringMapStringDouble merge")
    }
  }
}
Example 16
Source File: package.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution

import java.util.Collections

import scala.collection.JavaConverters._

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.util.{AccumulatorV2, LongAccumulator}

// The members below are an excerpt from the debugging exec node defined in the `debug`
// package object; the package object, the node's declaration, and its SetAccumulator
// helper are elided in the original snippet.

  case class ColumnMetrics() {
    val elementTypes = new SetAccumulator[String]
    sparkContext.register(elementTypes)
  }

  val tupleCount: LongAccumulator = sparkContext.longAccumulator

  val numColumns: Int = child.output.size
  val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

  def dumpStats(): Unit = {
    debugPrint(s"== ${child.simpleString} ==")
    debugPrint(s"Tuples output: ${tupleCount.value}")
    child.output.zip(columnStats).foreach { case (attr, metric) =>
      // This is called on driver. All accumulator updates have a fixed value. So it's safe to use
      // `asScala` which accesses the internal values using `java.util.Iterator`.
      val actualDataTypes = metric.elementTypes.value.asScala.mkString("{", ",", "}")
      debugPrint(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
    }
  }

  protected override def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      new Iterator[InternalRow] {
        def hasNext: Boolean = iter.hasNext

        def next(): InternalRow = {
          val currentRow = iter.next()
          tupleCount.add(1)
          var i = 0
          while (i < numColumns) {
            val value = currentRow.get(i, output(i).dataType)
            if (value != null) {
              columnStats(i).elementTypes.add(value.getClass.getName)
            }
            i += 1
          }
          currentRow
        }
      }
    }
  }

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def inputRDDs(): Seq[RDD[InternalRow]] = {
    child.asInstanceOf[CodegenSupport].inputRDDs()
  }

  override def doProduce(ctx: CodegenContext): String = {
    child.asInstanceOf[CodegenSupport].produce(ctx, this)
  }

  override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
    consume(ctx, input)
  }
  } // closes the (elided) exec node
} // closes the (elided) package object
Example 17
Source File: ExternalClusterManagerSuite.scala From sparkoscope with Apache License 2.0
The code in this example is identical to the version shown in Example 9 above, so it is not repeated here.
Example 18
Source File: TaskResult.scala From sparkoscope with Apache License 2.0
The code in this example is identical to the version shown in Example 7 above, so it is not repeated here.
Example 19
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 20
Source File: SQLMetrics.scala From sparkoscope with Apache License 2.0
The code in this example is identical to the version shown in Example 12 above, so it is not repeated here.
Example 21
Source File: MapAccumulator.scala From gemini with GNU General Public License v3.0
package tech.sourced.gemini.util

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable

class MapAccumulator extends AccumulatorV2[(String, Int), Map[String, Int]] {
  private val underlyingMap: mutable.HashMap[String, Int] = mutable.HashMap.empty

  override def isZero: Boolean = underlyingMap.isEmpty

  override def copy(): AccumulatorV2[(String, Int), Map[String, Int]] = {
    val newMapAccumulator = new MapAccumulator()
    underlyingMap.foreach(newMapAccumulator.add)
    newMapAccumulator
  }

  override def reset(): Unit = underlyingMap.clear

  override def value: Map[String, Int] = underlyingMap.toMap

  override def add(kv: (String, Int)): Unit = {
    val (k, v) = kv
    underlyingMap += k -> (underlyingMap.getOrElse(k, 0) + v)
  }

  override def merge(other: AccumulatorV2[(String, Int), Map[String, Int]]): Unit = other match {
    case map: MapAccumulator => map.value.foreach(this.add)
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }
}
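A hypothetical usage sketch for the MapAccumulator above, assuming an existing SparkContext named sc; it counts occurrences per key across all tasks.

// Hypothetical usage; `sc` is an existing SparkContext.
val errorsByType = new MapAccumulator
sc.register(errorsByType, "errorsByType")

sc.parallelize(Seq("timeout", "parse", "timeout")).foreach(e => errorsByType.add(e -> 1))
// After the action: errorsByType.value == Map("timeout" -> 2, "parse" -> 1)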
Example 22
Source File: ExceptionCountAccumulator.scala From spark-distcp with Apache License 2.0
package com.coxautodata.objects

import java.util
import java.util.Collections
import java.util.function.{BiConsumer, BiFunction}

import org.apache.spark.util.AccumulatorV2

class ExceptionCountAccumulator extends AccumulatorV2[String, java.util.Map[String, Long]] {

  private val _map: java.util.Map[String, Long] = Collections.synchronizedMap(new util.HashMap[String, Long]())

  override def isZero: Boolean = _map.isEmpty

  override def copyAndReset(): ExceptionCountAccumulator = new ExceptionCountAccumulator

  override def copy(): ExceptionCountAccumulator = {
    val newAcc = new ExceptionCountAccumulator
    _map.synchronized {
      newAcc._map.putAll(_map)
    }
    newAcc
  }

  override def reset(): Unit = _map.clear()

  def add(e: Throwable): Unit = add(e.getClass.getName.stripSuffix("$"))

  override def add(k: String): Unit = {
    add(k, 1)
  }

  private def add(k: String, v: Long): Unit = {
    _map.merge(k, v, CombineCounts)
  }

  override def merge(other: AccumulatorV2[String, util.Map[String, Long]]): Unit = {
    other match {
      case e: ExceptionCountAccumulator =>
        e._map.forEach {
          new BiConsumer[String, Long] {
            override def accept(k: String, v: Long): Unit = add(k, v)
          }
        }
      case _ =>
        throw new UnsupportedOperationException(
          s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }
  }

  override def value: util.Map[String, Long] = _map
}

object CombineCounts extends BiFunction[Long, Long, Long] {
  override def apply(t: Long, u: Long): Long = t + u
}
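A hypothetical usage sketch for the ExceptionCountAccumulator above, assuming an existing SparkContext named sc; exceptions caught on executors are tallied by class name.

// Hypothetical usage; `sc` is an existing SparkContext.
val failures = new ExceptionCountAccumulator
sc.register(failures, "parseFailures")

sc.parallelize(Seq("1", "x", "2", "y")).foreach { s =>
  try s.toInt catch { case e: Throwable => failures.add(e) }
}
// After the action: failures.value.get("java.lang.NumberFormatException") == 2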
Example 23
Source File: Histogram.scala From spark-util with Apache License 2.0
package org.hammerlab.spark.accumulator

import org.apache.spark.SparkContext
import org.apache.spark.util.AccumulatorV2

import scala.collection.immutable.SortedMap
import scala.collection.mutable

case class Histogram[T: Ordering](var map: mutable.Map[T, Long] = mutable.Map.empty[T, Long])
  extends AccumulatorV2[T, SortedMap[T, Long]] {

  override def isZero: Boolean = map.isEmpty

  override def copy(): AccumulatorV2[T, SortedMap[T, Long]] = Histogram(map.clone())

  override def reset(): Unit = map = mutable.Map.empty[T, Long]

  override def add(k: T): Unit =
    map.update(
      k,
      map.getOrElse(k, 0L) + 1
    )

  override def merge(other: AccumulatorV2[T, SortedMap[T, Long]]): Unit =
    for {
      (k, v) ← other.value
    } {
      map.update(k, map.getOrElse(k, 0L) + v)
    }

  override def value: SortedMap[T, Long] = SortedMap(map.toSeq: _*)
}

object Histogram {
  def apply[T: Ordering](name: String)(implicit sc: SparkContext): Histogram[T] = {
    val a = Histogram[T]()
    sc.register(a, name)
    a
  }
}
Example 24
Source File: EventTimeWatermarkExec.scala From XSQL with Apache License 2.0
The code in this example is identical to the version shown in Example 8 above, so it is not repeated here.
Example 25
Source File: CacheInvalidateAccumulator.scala From tispark with Apache License 2.0
package com.pingcap.tispark.accumulator

import java.util

import com.pingcap.tikv.event.CacheInvalidateEvent
import org.apache.spark.util.AccumulatorV2

import scala.collection.JavaConversions._

class CacheInvalidateAccumulator
  extends AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]] {

  private final val eventSet: util.Set[CacheInvalidateEvent] = new util.HashSet[CacheInvalidateEvent]

  override def isZero: Boolean = eventSet.isEmpty

  override def reset(): Unit = eventSet.clear()

  override def add(v: CacheInvalidateEvent): Unit =
    eventSet.synchronized {
      eventSet.add(v)
    }

  override def copy(): AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]] = {
    val accumulator = new CacheInvalidateAccumulator
    eventSet.synchronized {
      accumulator.eventSet.addAll(eventSet)
    }
    accumulator
  }

  override def merge(
      other: AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]]): Unit =
    eventSet.addAll(other.value)

  override def value: Seq[CacheInvalidateEvent] = eventSet.toList

  def remove(event: CacheInvalidateEvent): Boolean =
    eventSet.synchronized {
      eventSet.remove(event)
    }
}
Example 26
Source File: CustomAccumulator.scala From HadoopLearning with MIT License
package com.liumm.transform

import org.apache.commons.lang3.StringUtils
import org.apache.spark.util.AccumulatorV2

class CustomAccumulator extends AccumulatorV2[String, String] {

  var result = "" // default value

  override def isZero: Boolean = {
    result == ""
  }

  override def copy(): AccumulatorV2[String, String] = {
    val customAccumulator = new CustomAccumulator()
    customAccumulator.result = this.result
    customAccumulator
  }

  override def reset(): Unit = {
    result = ""
  }

  override def add(v: String): Unit = {
    if (StringUtils.isNoneBlank(v)) {
      if (isZero) {
        result = v
      } else {
        result += "|" + v
      }
    }
  }

  override def merge(other: AccumulatorV2[String, String]): Unit = other match {
    case newAc: CustomAccumulator =>
      if (isZero) result = newAc.value else result += "|" + newAc.value
    case _ =>
      throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}"
      )
  }

  override def value: String = {
    result
  }
}
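A hypothetical usage sketch for the CustomAccumulator above, assuming an existing SparkContext named sc; non-blank strings are concatenated with a "|" separator, and the concatenation order depends on task scheduling.

// Hypothetical usage; `sc` is an existing SparkContext.
val collected = new CustomAccumulator
sc.register(collected, "collectedIds")

sc.parallelize(Seq("id=1", "", "id=3")).foreach(collected.add)
// After the action: collected.value is e.g. "id=1|id=3" (blank inputs are skipped)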
Example 27
Source File: SparkSolrAccumulator.scala From spark-solr with Apache License 2.0
package com.lucidworks.spark

import java.lang.Long

import org.apache.spark.util.AccumulatorV2

class SparkSolrAccumulator extends AccumulatorV2[java.lang.Long, java.lang.Long] {
  private var _count = 0L

  override def isZero: Boolean = _count == 0

  override def copy(): SparkSolrAccumulator = {
    val newAcc = new SparkSolrAccumulator
    newAcc._count = this._count
    newAcc
  }

  override def reset(): Unit = {
    _count = 0L
  }

  override def add(v: Long): Unit = {
    _count += v
  }

  def count: Long = _count

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SparkSolrAccumulator => _count += o.count
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def value: Long = _count

  def inc(): Unit = _count += 1
}
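A hypothetical usage sketch for the SparkSolrAccumulator above, assuming an existing SparkContext named sc; it behaves like a distributed counter.

// Hypothetical usage; `sc` is an existing SparkContext.
val docsIndexed = new SparkSolrAccumulator
sc.register(docsIndexed, "docsIndexed")

sc.parallelize(1 to 100).foreach(_ => docsIndexed.inc())
// After the action: docsIndexed.value == 100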
Example 28
Source File: ExternalClusterManagerSuite.scala From drizzle-spark with Apache License 2.0
The code in this example is identical to the version shown in Example 9 above, so it is not repeated here.
Example 29
Source File: TaskResult.scala From drizzle-spark with Apache License 2.0
The code in this example is identical to the version shown in Example 7 above, so it is not repeated here.
Example 30
Source File: SQLMetrics.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.metric

import java.text.NumberFormat
import java.util.Locale

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils}

class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] {
  // This is a workaround for SPARK-11013.
  // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will
  // update it at the end of task and the value will be at least 0. Then we can filter out the -1
  // values before calculate max, min, etc.
  private[this] var _value = initValue
  private var _zeroValue = initValue

  override def copy(): SQLMetric = {
    val newAcc = new SQLMetric(metricType, _value)
    newAcc._zeroValue = initValue
    newAcc
  }

  override def reset(): Unit = _value = _zeroValue

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SQLMetric => _value += o.value
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def isZero(): Boolean = _value == _zeroValue

  override def add(v: Long): Unit = _value += v

  def +=(v: Long): Unit = _value += v

  override def value: Long = _value

  // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later
  override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    new AccumulableInfo(
      id, name, update, value, true, true, Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER))
  }
}

object SQLMetrics {
  private val SUM_METRIC = "sum"
  private val SIZE_METRIC = "size"
  private val TIMING_METRIC = "timing"

  def createMetric(sc: SparkContext, name: String): SQLMetric = {
    val acc = new SQLMetric(SUM_METRIC)
    acc.register(sc, name = Some(name), countFailedValues = false)
    acc
  }

  def stringValue(metricsType: String, values: Seq[Long]): String = {
    if (metricsType == SUM_METRIC) {
      val numberFormat = NumberFormat.getIntegerInstance(Locale.ENGLISH)
      numberFormat.format(values.sum)
    } else {
      val strFormat: Long => String = if (metricsType == SIZE_METRIC) {
        Utils.bytesToString
      } else if (metricsType == TIMING_METRIC) {
        Utils.msDurationToString
      } else {
        throw new IllegalStateException("unexpected metrics type: " + metricsType)
      }

      val validValues = values.filter(_ >= 0)
      val Seq(sum, min, med, max) = {
        val metric = if (validValues.isEmpty) {
          Seq.fill(4)(0L)
        } else {
          val sorted = validValues.sorted
          Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1))
        }
        metric.map(strFormat)
      }
      s"\n$sum ($min, $med, $max)"
    }
  }
}