org.apache.spark.util.AccumulatorV2 Scala Examples
The following examples show how to use org.apache.spark.util.AccumulatorV2.
Each example is drawn from an open-source project; the source file, the project it comes from, and its license are noted above the code.
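Before the project-specific examples, here is a minimal, self-contained sketch of the typical AccumulatorV2 lifecycle: subclass it, register the instance with the SparkContext so that task-side copies are merged back to the driver, update it inside an action, and read value on the driver. The names StringSetAccumulator and StringSetAccumulatorDemo are illustrative and do not come from any of the projects below.

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.util.AccumulatorV2

// Illustrative accumulator: collects the distinct strings seen on executors.
class StringSetAccumulator extends AccumulatorV2[String, Set[String]] {
  private var set = Set.empty[String]

  override def isZero: Boolean = set.isEmpty
  override def copy(): StringSetAccumulator = {
    val acc = new StringSetAccumulator
    acc.set = set
    acc
  }
  override def reset(): Unit = set = Set.empty[String]
  override def add(v: String): Unit = set += v
  override def merge(other: AccumulatorV2[String, Set[String]]): Unit = set ++= other.value
  override def value: Set[String] = set
}

object StringSetAccumulatorDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("acc-demo"))

    val seen = new StringSetAccumulator
    // Registering ties the accumulator to the driver so task-side updates are merged back.
    sc.register(seen, "seenWords")

    sc.parallelize(Seq("a", "b", "a", "c")).foreach(seen.add)
    println(seen.value) // Set(a, b, c)

    sc.stop()
  }
}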
Example 1
Source File: ByKeyAdditiveAccumulator.scala From spark-records with Apache License 2.0
package com.swoop.spark.accumulators

import java.util.Collections

import org.apache.spark.util.AccumulatorV2

// Note: the class declaration and the backing `_map` field are elided in the original
// excerpt; they are reconstructed here so that the methods below compile.
class ByKeyAdditiveAccumulator[A, B: Numeric]
  extends AccumulatorV2[(A, B), java.util.Map[A, B]] {

  private val _map = new java.util.HashMap[A, B]()

  // Delaying full synchronization allows merge() to be faster as it uses unsafeAdd()
  override lazy val value: java.util.Map[A, B] = Collections.synchronizedMap(_map)

  override def isZero: Boolean = _map.isEmpty

  override def copyAndReset(): ByKeyAdditiveAccumulator[A, B] = new ByKeyAdditiveAccumulator()

  override def copy(): ByKeyAdditiveAccumulator[A, B] = {
    val newAcc = new ByKeyAdditiveAccumulator[A, B]
    _map.synchronized {
      newAcc._map.putAll(_map)
    }
    newAcc
  }

  override def reset(): Unit = _map.clear()

  override def add(v: (A, B)): Unit =
    _map.synchronized {
      unsafeAdd(v._1, v._2)
    }

  override def merge(other: AccumulatorV2[(A, B), java.util.Map[A, B]]): Unit =
    other match {
      case o: ByKeyAdditiveAccumulator[A, B] =>
        _map.synchronized {
          other.synchronized {
            import scala.collection.JavaConversions._
            o._map.foreach((unsafeAdd _).tupled)
          }
        }
      case _ => throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }

  private def unsafeAdd(k: A, v: B) = {
    val num = implicitly[Numeric[B]]
    val existing = if (_map.containsKey(k)) _map.get(k) else num.zero
    _map.put(k, num.plus(existing, v))
  }
}
Example 2
Source File: MultivariateOnlineSummarizerAccumulator.scala From sparkpipe-core with Apache License 2.0
package software.uncharted.sparkpipe.ops.core.dataframe.numeric.util

import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2
import org.apache.spark.sql.types.StructType
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer

private object MultivariateOnlineSummarizerAccumulator {
  def init(cols: Seq[_]): Seq[MultivariateOnlineSummarizer] = {
    cols.map(col => {
      new MultivariateOnlineSummarizer
    }).toSeq
  }
}

private[numeric] class MultivariateOnlineSummarizerAccumulator(
  private var result: Seq[MultivariateOnlineSummarizer],
  private var touched: Boolean = false
) extends AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] {

  def this(cols: StructType) {
    this(MultivariateOnlineSummarizerAccumulator.init(cols))
  }

  override def add(r: Row): Unit = {
    for (i <- 0 to r.length - 1) {
      if (!r.isNullAt(i)) {
        result(i).add(Vectors.dense(Array[Double](r.getDouble(i))))
        touched = true
      } else {
        // don't add a sample to the summarizer for this column
      }
    }
  }

  override def copy(): AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]] = {
    new MultivariateOnlineSummarizerAccumulator(result.map(s => {
      // clone by making a new, empty summarizer and merging our data into it
      val newSummarizer = new MultivariateOnlineSummarizer()
      newSummarizer.merge(s)
      newSummarizer
    }), false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Row, Seq[MultivariateOnlineSummarizer]]): Unit = {
    for (i <- 0 to other.value.length - 1) {
      result(i).merge(other.value(i))
    }
  }

  override def reset(): Unit = {
    result = MultivariateOnlineSummarizerAccumulator.init(result)
    touched = false
  }

  override def value: Seq[MultivariateOnlineSummarizer] = {
    result
  }
}
Example 3
Source File: UniqueTermAccumulator.scala From sparkpipe-core with Apache License 2.0
package software.uncharted.sparkpipe.ops.core.dataframe.text.util

import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.HashMap

private[text] class UniqueTermAccumulator(
  private var result: HashMap[String, Int],
  private var touched: Boolean = false
) extends AccumulatorV2[Seq[String], HashMap[String, Int]] {

  def this() {
    this(new HashMap[String, Int]())
  }

  override def add(in: Seq[String]): Unit = {
    in.foreach(w => {
      result.put(w, result.getOrElse(w, 0) + 1)
    })
  }

  override def copy(): AccumulatorV2[Seq[String], HashMap[String, Int]] = {
    val clone = new HashMap[String, Int]()
    result.foreach(kv => clone.put(kv._1, kv._2))
    new UniqueTermAccumulator(clone, false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Seq[String], HashMap[String, Int]]): Unit = {
    other.value.foreach(t => {
      result.put(t._1, result.getOrElse(t._1, 0) + t._2)
    })
  }

  override def reset(): Unit = {
    result.clear
    touched = false
  }

  override def value: HashMap[String, Int] = {
    result
  }
}
Example 4
Source File: CoverageUpdate.scala From bdg-sequila with Apache License 2.0
package org.biodatageeks.sequila.coverage

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.ArrayBuffer

case class RightCovEdge(contig: String, minPos: Int, startPoint: Int, cov: Array[Short], cumSum: Short)

case class ContigRange(contig: String, minPos: Int, maxPos: Int)

class CovUpdate(var right: ArrayBuffer[RightCovEdge], var left: ArrayBuffer[ContigRange])
  extends Serializable {

  def reset(): Unit = {
    right = new ArrayBuffer[RightCovEdge]()
    left = new ArrayBuffer[ContigRange]()
  }

  def add(p: CovUpdate): CovUpdate = {
    right = right ++ p.right
    left = left ++ p.left
    this
  }
}

class CoverageAccumulatorV2(var covAcc: CovUpdate)
  extends AccumulatorV2[CovUpdate, CovUpdate] {

  def reset(): Unit = {
    covAcc = new CovUpdate(new ArrayBuffer[RightCovEdge](), new ArrayBuffer[ContigRange]())
  }

  def add(v: CovUpdate): Unit = {
    covAcc.add(v)
  }

  def value(): CovUpdate = {
    covAcc
  }

  def isZero(): Boolean = {
    covAcc.right.isEmpty && covAcc.left.isEmpty
  }

  def copy(): CoverageAccumulatorV2 = {
    new CoverageAccumulatorV2(covAcc)
  }

  def merge(other: AccumulatorV2[CovUpdate, CovUpdate]): Unit = {
    covAcc.add(other.value)
  }
}
Example 5
Source File: ExternalClusterManagerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def schedulingMode: SchedulingMode = SchedulingMode.FIFO
  override def rootPool: Pool = new Pool("", schedulingMode, 0, 0)
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def killTaskAttempt(
      taskId: Long, interruptThread: Boolean, reason: String): Boolean = false
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 6
Source File: DAGSchedulerEvent.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import java.util.Properties

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.{AccumulatorV2, CallSite}

// The base event trait is elided in the original excerpt; it is restated here so the
// case classes below have something to extend.
private[scheduler] sealed trait DAGSchedulerEvent

private[scheduler] case class MapStageSubmitted(
    jobId: Int,
    dependency: ShuffleDependency[_, _, _],
    callSite: CallSite,
    listener: JobListener,
    properties: Properties = null)
  extends DAGSchedulerEvent

private[scheduler] case class StageCancelled(
    stageId: Int,
    reason: Option[String])
  extends DAGSchedulerEvent

private[scheduler] case class JobCancelled(
    jobId: Int,
    reason: Option[String])
  extends DAGSchedulerEvent

private[scheduler] case class JobGroupCancelled(groupId: String) extends DAGSchedulerEvent

private[scheduler] case object AllJobsCancelled extends DAGSchedulerEvent

private[scheduler] case class BeginEvent(task: Task[_], taskInfo: TaskInfo) extends DAGSchedulerEvent

private[scheduler] case class GettingResultEvent(taskInfo: TaskInfo) extends DAGSchedulerEvent

private[scheduler] case class CompletionEvent(
    task: Task[_],
    reason: TaskEndReason,
    result: Any,
    accumUpdates: Seq[AccumulatorV2[_, _]],
    taskInfo: TaskInfo)
  extends DAGSchedulerEvent

private[scheduler] case class ExecutorAdded(execId: String, host: String) extends DAGSchedulerEvent

private[scheduler] case class ExecutorLost(execId: String, reason: ExecutorLossReason)
  extends DAGSchedulerEvent

private[scheduler] case class WorkerRemoved(workerId: String, host: String, message: String)
  extends DAGSchedulerEvent

private[scheduler] case class TaskSetFailed(taskSet: TaskSet, reason: String, exception: Option[Throwable])
  extends DAGSchedulerEvent

private[scheduler] case object ResubmitFailedStages extends DAGSchedulerEvent

private[scheduler] case class SpeculativeTaskSubmitted(task: Task[_]) extends DAGSchedulerEvent
Example 7
Source File: TaskResult.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// The method below is an excerpt from a TaskResult implementation (DirectTaskResult in
// Spark); the rest of that class is elided in the original snippet.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 8
Source File: EventTimeWatermarkExec.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.{SparkPlan, UnaryExecNode}
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends UnaryExecNode {

  val eventTimeStats = new EventTimeStatsAccum()
  val delayMs = EventTimeWatermark.getDelayMs(delay)

  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delayMs)
        .build()
      a.withMetadata(updatedMetadata)
    } else if (a.metadata.contains(EventTimeWatermark.delayKey)) {
      // Remove existing watermark
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .remove(EventTimeWatermark.delayKey)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }
}
Example 9
Source File: ExternalClusterManagerSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 10
Source File: TaskResult.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.SparkEnv
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.storage.BlockId
import org.apache.spark.util.{AccumulatorV2, Utils}

// Task result. Also contains updates to accumulator variables.
private[spark] sealed trait TaskResult[T]

// The method below is an excerpt from a TaskResult implementation (DirectTaskResult in
// Spark); the rest of that class is elided in the original snippet. Note the
// per-user SparkEnv lookup in this multi-tenancy fork.
  def value(resultSer: SerializerInstance = null): T = {
    if (valueObjectDeserialized) {
      valueObject
    } else {
      // This should not run when holding a lock because it may cost dozens of seconds for a large
      // value
      val ser = if (resultSer == null) SparkEnv.get(user).serializer.newInstance() else resultSer
      valueObject = ser.deserialize(valueBytes)
      valueObjectDeserialized = true
      valueObject
    }
  }
}
Example 11
Source File: EventTimeWatermarkExec.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  override def user: String = child.user

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 12
Source File: SQLMetrics.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.sql.execution.metric

import java.text.NumberFormat
import java.util.Locale

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils}

class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] {
  // This is a workaround for SPARK-11013.
  // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will
  // update it at the end of task and the value will be at least 0. Then we can filter out the -1
  // values before calculate max, min, etc.
  private[this] var _value = initValue
  private var _zeroValue = initValue

  override def copy(): SQLMetric = {
    val newAcc = new SQLMetric(metricType, _value)
    newAcc._zeroValue = initValue
    newAcc
  }

  override def reset(): Unit = _value = _zeroValue

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SQLMetric => _value += o.value
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def isZero(): Boolean = _value == _zeroValue

  override def add(v: Long): Unit = _value += v

  def +=(v: Long): Unit = _value += v

  override def value: Long = _value

  // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later
  override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    new AccumulableInfo(
      id, name, update, value, true, true, Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER))
  }
}

object SQLMetrics {
  private val SUM_METRIC = "sum"
  private val SIZE_METRIC = "size"
  private val TIMING_METRIC = "timing"

  def createMetric(sc: SparkContext, name: String): SQLMetric = {
    val acc = new SQLMetric(SUM_METRIC)
    acc.register(sc, name = Some(name), countFailedValues = false)
    acc
  }

  def stringValue(metricsType: String, values: Seq[Long]): String = {
    if (metricsType == SUM_METRIC) {
      val numberFormat = NumberFormat.getIntegerInstance(Locale.US)
      numberFormat.format(values.sum)
    } else {
      val strFormat: Long => String = if (metricsType == SIZE_METRIC) {
        Utils.bytesToString
      } else if (metricsType == TIMING_METRIC) {
        Utils.msDurationToString
      } else {
        throw new IllegalStateException("unexpected metrics type: " + metricsType)
      }

      val validValues = values.filter(_ >= 0)
      val Seq(sum, min, med, max) = {
        val metric = if (validValues.isEmpty) {
          Seq.fill(4)(0L)
        } else {
          val sorted = validValues.sorted
          Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1))
        }
        metric.map(strFormat)
      }
      s"\n$sum ($min, $med, $max)"
    }
  }
}
Example 13
Source File: DruidQueryExecutionMetric.scala From spark-druid-olap with Apache License 2.0
package org.apache.spark.sql.sparklinedata.execution.metrics

import java.util.{ArrayList, Collections}

import org.apache.spark.util.AccumulatorV2
import org.sparklinedata.druid.metadata.{DruidQueryExecutionView, DruidQueryHistory}

class DruidQueryExecutionMetric extends
  AccumulatorV2[DruidQueryExecutionView, java.util.List[DruidQueryExecutionView]] {

  import scala.collection.JavaConverters._

  private val _list: java.util.List[DruidQueryExecutionView] =
    Collections.synchronizedList(new ArrayList[DruidQueryExecutionView]())

  private def getList: java.util.List[DruidQueryExecutionView] = {
    if (isAtDriverSide) DruidQueryHistory.getHistory.asJava else _list
  }

  override def isZero: Boolean = {
    _list.isEmpty
  }

  override def copy(): DruidQueryExecutionMetric = {
    val newAcc = new DruidQueryExecutionMetric
    newAcc._list.addAll(_list)
    newAcc
  }

  override def reset(): Unit = {
    _list.clear()
  }

  override def add(v: DruidQueryExecutionView): Unit = {
    if (isAtDriverSide) DruidQueryHistory.add(v) else _list.add(v)
  }

  private def addAll(v: java.util.List[DruidQueryExecutionView]): Unit = {
    v.asScala.foreach(add(_))
  }

  override def merge(
      other: AccumulatorV2[DruidQueryExecutionView, java.util.List[DruidQueryExecutionView]]): Unit =
    other match {
      case o: DruidQueryExecutionMetric => {
        addAll(o._list)
      }
      case _ => throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }

  override def value = _list.synchronized {
    java.util.Collections.unmodifiableList(getList)
  }

  private[spark] def setValue(newValue: java.util.List[DruidQueryExecutionView]): Unit = {
    reset()
    addAll(newValue)
  }
}
Example 14
Source File: ArrayAccumulator.scala From delta with Apache License 2.0
package org.apache.spark.sql.delta.stats

import java.io.{ObjectInput, ObjectOutput}

import org.apache.spark.util.AccumulatorV2

class ArrayAccumulator(val size: Int) extends AccumulatorV2[(Int, Long), Array[Long]] {

  protected val counts = new Array[Long](size)

  override def isZero: Boolean = counts.forall(_ == 0)

  override def copy(): AccumulatorV2[(Int, Long), Array[Long]] = {
    val newCopy = new ArrayAccumulator(size)
    (0 until size).foreach(i => newCopy.counts(i) = counts(i))
    newCopy
  }

  override def reset(): Unit = (0 until size).foreach(counts(_) = 0)

  override def add(v: (Int, Long)): Unit = {
    if (v._2 == -1 || counts(v._1) == -1) {
      counts(v._1) = -1
    } else {
      counts(v._1) += v._2
    }
  }

  override def merge(o: AccumulatorV2[(Int, Long), Array[Long]]): Unit = {
    val other = o.asInstanceOf[ArrayAccumulator]
    assert(size == other.size)

    (0 until size).foreach(i => {
      if (counts(i) == -1 || other.counts(i) == -1) {
        counts(i) = -1
      } else {
        counts(i) += other.counts(i)
      }
    })
  }

  override def value: Array[Long] = counts
}
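A hypothetical usage sketch for the ArrayAccumulator above, assuming an existing SparkContext named sc: each (index, delta) pair updates one slot, and a delta of -1 marks that slot as unknown.

// Hypothetical usage; `sc` is an existing SparkContext.
val byteCounts = new ArrayAccumulator(3)
sc.register(byteCounts, "perFileByteCounts")

sc.parallelize(Seq((0, 10L), (1, 5L), (0, 7L), (2, -1L))).foreach(byteCounts.add)
// After the action: byteCounts.value sameElements Array(17L, 5L, -1L)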
Example 15
Source File: Accumulators.scala From spark-nlp with Apache License 2.0
package com.johnsnowlabs.nlp.annotators.pos.perceptron

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.{ArrayBuffer, Map => MMap}

class TupleKeyLongDoubleMapAccumulator(defaultMap: MMap[(String, String), (Long, Double)] = MMap.empty[(String, String), (Long, Double)])
  extends AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]] {

  val mmap = defaultMap

  override def reset(): Unit = mmap.clear()

  override def add(v: ((String, String), (Long, Double))): Unit = {
    mmap(v._1) = mmap.get(v._1).map { case (v1, v2) => ((v1 + v._2._1) / 2, (v2 + v._2._2) / 2) }.getOrElse(v._2)
  }

  def updateMany(other: MMap[(String, String), (Long, Double)]): Unit = {
    other.foreach { case (k, v) =>
      this.add((k, v))
    }
  }

  override def value: Map[(String, String), (Long, Double)] = mmap.toMap

  override def copy(): AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]] = {
    val m = ArrayBuffer.empty[((String, String), (Long, Double))]
    this.mmap.copyToBuffer(m)
    new TupleKeyLongDoubleMapAccumulator(MMap(m: _*))
  }

  override def isZero: Boolean = mmap.isEmpty

  override def merge(other: AccumulatorV2[((String, String), (Long, Double)), Map[(String, String), (Long, Double)]]): Unit = {
    other match {
      case o: TupleKeyLongDoubleMapAccumulator =>
        updateMany(o.mmap)
      case _ => throw new Exception("Cannot merge tuple key long")
    }
  }
}

class StringMapStringDoubleAccumulator(defaultMap: MMap[String, MMap[String, Double]] = MMap.empty[String, MMap[String, Double]])
  extends AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]] {

  private val mmap = defaultMap

  override def reset(): Unit = mmap.clear()

  override def add(v: (String, MMap[String, Double])): Unit = {
    v._2.foreach { case (kk, vv) =>
      val loc = mmap.getOrElse(v._1, MMap.empty[String, Double])
      val nv = if (loc.isDefinedAt(kk)) (loc.getOrElse(kk, 0.0) + vv) / 2.0 else vv
      mmap.update(v._1, loc.updated(kk, nv))
    }
  }

  override def value: Map[String, Map[String, Double]] = mmap.mapValues(_.toMap.filterNot(a => a._2 == 0)).toMap

  override def copy(): AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]] = {
    val m = ArrayBuffer.empty[(String, MMap[String, Double])]
    this.mmap.copyToBuffer(m)
    new StringMapStringDoubleAccumulator(MMap(m: _*))
  }

  override def isZero: Boolean = mmap.isEmpty

  def addMany(other: MMap[String, MMap[String, Double]]) = {
    other.foreach { case (k, v) =>
      this.add((k, v))
    }
  }

  override def merge(other: AccumulatorV2[(String, MMap[String, Double]), Map[String, Map[String, Double]]]): Unit = {
    other match {
      case o: StringMapStringDoubleAccumulator =>
        addMany(o.mmap)
      case _ => throw new Exception("Wrong StringMapStringDouble merge")
    }
  }
}
Example 16
Source File: package.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution

import java.util.Collections

import scala.collection.JavaConverters._

import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeFormatter, CodegenContext, ExprCode}
import org.apache.spark.sql.catalyst.plans.physical.Partitioning
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.util.{AccumulatorV2, LongAccumulator}

// The members below are an excerpt from the debugging exec node defined in the `debug`
// package object; the package object, the node's declaration, and its SetAccumulator
// helper are elided in the original snippet.

  case class ColumnMetrics() {
    val elementTypes = new SetAccumulator[String]
    sparkContext.register(elementTypes)
  }

  val tupleCount: LongAccumulator = sparkContext.longAccumulator

  val numColumns: Int = child.output.size
  val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

  def dumpStats(): Unit = {
    debugPrint(s"== ${child.simpleString} ==")
    debugPrint(s"Tuples output: ${tupleCount.value}")
    child.output.zip(columnStats).foreach { case (attr, metric) =>
      // This is called on driver. All accumulator updates have a fixed value. So it's safe to use
      // `asScala` which accesses the internal values using `java.util.Iterator`.
      val actualDataTypes = metric.elementTypes.value.asScala.mkString("{", ",", "}")
      debugPrint(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
    }
  }

  protected override def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      new Iterator[InternalRow] {
        def hasNext: Boolean = iter.hasNext

        def next(): InternalRow = {
          val currentRow = iter.next()
          tupleCount.add(1)
          var i = 0
          while (i < numColumns) {
            val value = currentRow.get(i, output(i).dataType)
            if (value != null) {
              columnStats(i).elementTypes.add(value.getClass.getName)
            }
            i += 1
          }
          currentRow
        }
      }
    }
  }

  override def outputPartitioning: Partitioning = child.outputPartitioning

  override def inputRDDs(): Seq[RDD[InternalRow]] = {
    child.asInstanceOf[CodegenSupport].inputRDDs()
  }

  override def doProduce(ctx: CodegenContext): String = {
    child.asInstanceOf[CodegenSupport].produce(ctx, this)
  }

  override def doConsume(ctx: CodegenContext, input: Seq[ExprCode], row: ExprCode): String = {
    consume(ctx, input)
  }
  } // closes the (elided) exec node
} // closes the (elided) package object
Example 17
Source File: ExternalClusterManagerSuite.scala From sparkoscope with Apache License 2.0
The code in this example is identical to the version shown in Example 9 above, so it is not repeated here.
Example 18
Source File: TaskResult.scala From sparkoscope with Apache License 2.0
The code in this example is identical to the version shown in Example 7 above, so it is not repeated here.
Example 19
Source File: EventTimeWatermarkExec.scala From sparkoscope with Apache License 2.0
package org.apache.spark.sql.execution.streaming

import org.apache.spark.rdd.RDD
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.{Attribute, UnsafeProjection}
import org.apache.spark.sql.catalyst.plans.logical.EventTimeWatermark
import org.apache.spark.sql.execution.SparkPlan
import org.apache.spark.sql.types.MetadataBuilder
import org.apache.spark.unsafe.types.CalendarInterval
import org.apache.spark.util.AccumulatorV2

case class EventTimeWatermarkExec(
    eventTime: Attribute,
    delay: CalendarInterval,
    child: SparkPlan) extends SparkPlan {

  val eventTimeStats = new EventTimeStatsAccum()
  sparkContext.register(eventTimeStats)

  override protected def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      val getEventTime = UnsafeProjection.create(eventTime :: Nil, child.output)
      iter.map { row =>
        eventTimeStats.add(getEventTime(row).getLong(0) / 1000)
        row
      }
    }
  }

  // Update the metadata on the eventTime column to include the desired delay.
  override val output: Seq[Attribute] = child.output.map { a =>
    if (a semanticEquals eventTime) {
      val updatedMetadata = new MetadataBuilder()
        .withMetadata(a.metadata)
        .putLong(EventTimeWatermark.delayKey, delay.milliseconds)
        .build()
      a.withMetadata(updatedMetadata)
    } else {
      a
    }
  }

  override def children: Seq[SparkPlan] = child :: Nil
}
Example 20
Source File: SQLMetrics.scala From sparkoscope with Apache License 2.0
The code in this example is identical to the version shown in Example 12 above, so it is not repeated here.
Example 21
Source File: MapAccumulator.scala From gemini with GNU General Public License v3.0
package tech.sourced.gemini.util

import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable

class MapAccumulator extends AccumulatorV2[(String, Int), Map[String, Int]] {
  private val underlyingMap: mutable.HashMap[String, Int] = mutable.HashMap.empty

  override def isZero: Boolean = underlyingMap.isEmpty

  override def copy(): AccumulatorV2[(String, Int), Map[String, Int]] = {
    val newMapAccumulator = new MapAccumulator()
    underlyingMap.foreach(newMapAccumulator.add)
    newMapAccumulator
  }

  override def reset(): Unit = underlyingMap.clear

  override def value: Map[String, Int] = underlyingMap.toMap

  override def add(kv: (String, Int)): Unit = {
    val (k, v) = kv
    underlyingMap += k -> (underlyingMap.getOrElse(k, 0) + v)
  }

  override def merge(other: AccumulatorV2[(String, Int), Map[String, Int]]): Unit = other match {
    case map: MapAccumulator => map.value.foreach(this.add)
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }
}
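A hypothetical usage sketch for the MapAccumulator above, assuming an existing SparkContext named sc; it counts occurrences per key across all tasks.

// Hypothetical usage; `sc` is an existing SparkContext.
val errorsByType = new MapAccumulator
sc.register(errorsByType, "errorsByType")

sc.parallelize(Seq("timeout", "parse", "timeout")).foreach(e => errorsByType.add(e -> 1))
// After the action: errorsByType.value == Map("timeout" -> 2, "parse" -> 1)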
Example 22
Source File: ExceptionCountAccumulator.scala From spark-distcp with Apache License 2.0
package com.coxautodata.objects

import java.util
import java.util.Collections
import java.util.function.{BiConsumer, BiFunction}

import org.apache.spark.util.AccumulatorV2

class ExceptionCountAccumulator extends AccumulatorV2[String, java.util.Map[String, Long]] {

  private val _map: java.util.Map[String, Long] = Collections.synchronizedMap(new util.HashMap[String, Long]())

  override def isZero: Boolean = _map.isEmpty

  override def copyAndReset(): ExceptionCountAccumulator = new ExceptionCountAccumulator

  override def copy(): ExceptionCountAccumulator = {
    val newAcc = new ExceptionCountAccumulator
    _map.synchronized {
      newAcc._map.putAll(_map)
    }
    newAcc
  }

  override def reset(): Unit = _map.clear()

  def add(e: Throwable): Unit = add(e.getClass.getName.stripSuffix("$"))

  override def add(k: String): Unit = {
    add(k, 1)
  }

  private def add(k: String, v: Long): Unit = {
    _map.merge(k, v, CombineCounts)
  }

  override def merge(other: AccumulatorV2[String, util.Map[String, Long]]): Unit = {
    other match {
      case e: ExceptionCountAccumulator =>
        e._map.forEach {
          new BiConsumer[String, Long] {
            override def accept(k: String, v: Long): Unit = add(k, v)
          }
        }
      case _ =>
        throw new UnsupportedOperationException(
          s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
    }
  }

  override def value: util.Map[String, Long] = _map
}

object CombineCounts extends BiFunction[Long, Long, Long] {
  override def apply(t: Long, u: Long): Long = t + u
}
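A hypothetical usage sketch for the ExceptionCountAccumulator above, assuming an existing SparkContext named sc; exceptions caught on executors are tallied by class name.

// Hypothetical usage; `sc` is an existing SparkContext.
val failures = new ExceptionCountAccumulator
sc.register(failures, "parseFailures")

sc.parallelize(Seq("1", "x", "2", "y")).foreach { s =>
  try s.toInt catch { case e: Throwable => failures.add(e) }
}
// After the action: failures.value.get("java.lang.NumberFormatException") == 2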
Example 23
Source File: Histogram.scala From spark-util with Apache License 2.0
package org.hammerlab.spark.accumulator

import org.apache.spark.SparkContext
import org.apache.spark.util.AccumulatorV2

import scala.collection.immutable.SortedMap
import scala.collection.mutable

case class Histogram[T: Ordering](var map: mutable.Map[T, Long] = mutable.Map.empty[T, Long])
  extends AccumulatorV2[T, SortedMap[T, Long]] {

  override def isZero: Boolean = map.isEmpty

  override def copy(): AccumulatorV2[T, SortedMap[T, Long]] = Histogram(map.clone())

  override def reset(): Unit = map = mutable.Map.empty[T, Long]

  override def add(k: T): Unit =
    map.update(
      k,
      map.getOrElse(k, 0L) + 1
    )

  override def merge(other: AccumulatorV2[T, SortedMap[T, Long]]): Unit =
    for {
      (k, v) ← other.value
    } {
      map.update(k, map.getOrElse(k, 0L) + v)
    }

  override def value: SortedMap[T, Long] = SortedMap(map.toSeq: _*)
}

object Histogram {
  def apply[T: Ordering](name: String)(implicit sc: SparkContext): Histogram[T] = {
    val a = Histogram[T]()
    sc.register(a, name)
    a
  }
}
Example 24
Source File: EventTimeWatermarkExec.scala From XSQL with Apache License 2.0
The code in this example is identical to the version shown in Example 8 above, so it is not repeated here.
Example 25
Source File: CacheInvalidateAccumulator.scala From tispark with Apache License 2.0
package com.pingcap.tispark.accumulator

import java.util

import com.pingcap.tikv.event.CacheInvalidateEvent
import org.apache.spark.util.AccumulatorV2

import scala.collection.JavaConversions._

class CacheInvalidateAccumulator
  extends AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]] {

  private final val eventSet: util.Set[CacheInvalidateEvent] = new util.HashSet[CacheInvalidateEvent]

  override def isZero: Boolean = eventSet.isEmpty

  override def reset(): Unit = eventSet.clear()

  override def add(v: CacheInvalidateEvent): Unit =
    eventSet.synchronized {
      eventSet.add(v)
    }

  override def copy(): AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]] = {
    val accumulator = new CacheInvalidateAccumulator
    eventSet.synchronized {
      accumulator.eventSet.addAll(eventSet)
    }
    accumulator
  }

  override def merge(
      other: AccumulatorV2[CacheInvalidateEvent, Seq[CacheInvalidateEvent]]): Unit =
    eventSet.addAll(other.value)

  override def value: Seq[CacheInvalidateEvent] = eventSet.toList

  def remove(event: CacheInvalidateEvent): Boolean =
    eventSet.synchronized {
      eventSet.remove(event)
    }
}
Example 26
Source File: CustomAccumulator.scala From HadoopLearning with MIT License
package com.liumm.transform

import org.apache.commons.lang3.StringUtils
import org.apache.spark.util.AccumulatorV2

class CustomAccumulator extends AccumulatorV2[String, String] {

  var result = "" // default value

  override def isZero: Boolean = {
    result == ""
  }

  override def copy(): AccumulatorV2[String, String] = {
    val customAccumulator = new CustomAccumulator()
    customAccumulator.result = this.result
    customAccumulator
  }

  override def reset(): Unit = {
    result = ""
  }

  override def add(v: String): Unit = {
    if (StringUtils.isNoneBlank(v)) {
      if (isZero) {
        result = v
      } else {
        result += "|" + v
      }
    }
  }

  override def merge(other: AccumulatorV2[String, String]): Unit = other match {
    case newAc: CustomAccumulator =>
      if (isZero) result = newAc.value else result += "|" + newAc.value
    case _ =>
      throw new UnsupportedOperationException(
        s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}"
      )
  }

  override def value: String = {
    result
  }
}
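A hypothetical usage sketch for the CustomAccumulator above, assuming an existing SparkContext named sc; non-blank strings are concatenated with a "|" separator, and the concatenation order depends on task scheduling.

// Hypothetical usage; `sc` is an existing SparkContext.
val collected = new CustomAccumulator
sc.register(collected, "collectedIds")

sc.parallelize(Seq("id=1", "", "id=3")).foreach(collected.add)
// After the action: collected.value is e.g. "id=1|id=3" (blank inputs are skipped)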
Example 27
Source File: SparkSolrAccumulator.scala From spark-solr with Apache License 2.0
package com.lucidworks.spark

import java.lang.Long

import org.apache.spark.util.AccumulatorV2

class SparkSolrAccumulator extends AccumulatorV2[java.lang.Long, java.lang.Long] {
  private var _count = 0L

  override def isZero: Boolean = _count == 0

  override def copy(): SparkSolrAccumulator = {
    val newAcc = new SparkSolrAccumulator
    newAcc._count = this._count
    newAcc
  }

  override def reset(): Unit = {
    _count = 0L
  }

  override def add(v: Long): Unit = {
    _count += v
  }

  def count: Long = _count

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SparkSolrAccumulator => _count += o.count
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def value: Long = _count

  def inc(): Unit = _count += 1
}
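A hypothetical usage sketch for the SparkSolrAccumulator above, assuming an existing SparkContext named sc; it behaves like a distributed counter.

// Hypothetical usage; `sc` is an existing SparkContext.
val docsIndexed = new SparkSolrAccumulator
sc.register(docsIndexed, "docsIndexed")

sc.parallelize(1 to 100).foreach(_ => docsIndexed.inc())
// After the action: docsIndexed.value == 100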
Example 28
Source File: ExternalClusterManagerSuite.scala From drizzle-spark with Apache License 2.0
The code in this example is identical to the version shown in Example 9 above, so it is not repeated here.
Example 29
Source File: TaskResult.scala From drizzle-spark with Apache License 2.0
The code in this example is identical to the version shown in Example 7 above, so it is not repeated here.
Example 30
Source File: SQLMetrics.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.execution.metric

import java.text.NumberFormat
import java.util.Locale

import org.apache.spark.SparkContext
import org.apache.spark.scheduler.AccumulableInfo
import org.apache.spark.util.{AccumulatorContext, AccumulatorV2, Utils}

class SQLMetric(val metricType: String, initValue: Long = 0L) extends AccumulatorV2[Long, Long] {
  // This is a workaround for SPARK-11013.
  // We may use -1 as initial value of the accumulator, if the accumulator is valid, we will
  // update it at the end of task and the value will be at least 0. Then we can filter out the -1
  // values before calculate max, min, etc.
  private[this] var _value = initValue
  private var _zeroValue = initValue

  override def copy(): SQLMetric = {
    val newAcc = new SQLMetric(metricType, _value)
    newAcc._zeroValue = initValue
    newAcc
  }

  override def reset(): Unit = _value = _zeroValue

  override def merge(other: AccumulatorV2[Long, Long]): Unit = other match {
    case o: SQLMetric => _value += o.value
    case _ => throw new UnsupportedOperationException(
      s"Cannot merge ${this.getClass.getName} with ${other.getClass.getName}")
  }

  override def isZero(): Boolean = _value == _zeroValue

  override def add(v: Long): Unit = _value += v

  def +=(v: Long): Unit = _value += v

  override def value: Long = _value

  // Provide special identifier as metadata so we can tell that this is a `SQLMetric` later
  override def toInfo(update: Option[Any], value: Option[Any]): AccumulableInfo = {
    new AccumulableInfo(
      id, name, update, value, true, true, Some(AccumulatorContext.SQL_ACCUM_IDENTIFIER))
  }
}

object SQLMetrics {
  private val SUM_METRIC = "sum"
  private val SIZE_METRIC = "size"
  private val TIMING_METRIC = "timing"

  def createMetric(sc: SparkContext, name: String): SQLMetric = {
    val acc = new SQLMetric(SUM_METRIC)
    acc.register(sc, name = Some(name), countFailedValues = false)
    acc
  }

  def stringValue(metricsType: String, values: Seq[Long]): String = {
    if (metricsType == SUM_METRIC) {
      val numberFormat = NumberFormat.getIntegerInstance(Locale.ENGLISH)
      numberFormat.format(values.sum)
    } else {
      val strFormat: Long => String = if (metricsType == SIZE_METRIC) {
        Utils.bytesToString
      } else if (metricsType == TIMING_METRIC) {
        Utils.msDurationToString
      } else {
        throw new IllegalStateException("unexpected metrics type: " + metricsType)
      }

      val validValues = values.filter(_ >= 0)
      val Seq(sum, min, med, max) = {
        val metric = if (validValues.isEmpty) {
          Seq.fill(4)(0L)
        } else {
          val sorted = validValues.sorted
          Seq(sorted.sum, sorted(0), sorted(validValues.length / 2), sorted(validValues.length - 1))
        }
        metric.map(strFormat)
      }
      s"\n$sum ($min, $med, $max)"
    }
  }
}