org.apache.spark.storage.BlockManagerId Scala Examples
The following examples show how to use org.apache.spark.storage.BlockManagerId.
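Before the examples, here is a minimal sketch (not taken from any of the projects below) of what a BlockManagerId carries: an executor ID, a host, and a port. It only assumes the factory call and accessors that the test suites below already use (BlockManagerId(execId, host, port), executorId, host, port, hostPort). In the Spark versions shown here the factory is Spark-internal (private[spark]), which is why all of the examples live inside org.apache.spark packages.

package org.apache.spark.storage

// Minimal sketch (assumed setup, not from the examples below): construct a BlockManagerId
// the same way the test suites do, and read back the fields that identify the block manager.
object BlockManagerIdSketch {
  def main(args: Array[String]): Unit = {
    // Same factory call the suites use, e.g. BlockManagerId("a", "b", 10)
    val id: BlockManagerId = BlockManagerId("executor-1", "localhost", 12345)

    println(id.executorId)  // executor-1
    println(id.host)        // localhost
    println(id.port)        // 12345
    println(id.hostPort)    // localhost:12345
  }
}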
Example 1
Source File: ExternalClusterManagerSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 2
Source File: BlockTransferService.scala From BigDatalog with Apache License 2.0
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
}
Example 3
Source File: ExternalClusterManagerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def schedulingMode: SchedulingMode = SchedulingMode.FIFO
  override def rootPool: Pool = new Pool("", schedulingMode, 0, 0)
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def killTaskAttempt(
    taskId: Long, interruptThread: Boolean, reason: String): Boolean = false
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 4
Source File: MapStatusSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends SparkFunSuite {

  test("compressSize") { // compress size
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") { // decompress size
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  // MapStatus should never report non-empty blocks' sizes as 0
  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  // Large tasks should use HighlyCompressedMapStatus
  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  // HighlyCompressedMapStatus: the estimated size should be the average non-empty block size
  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
}
Example 5
Source File: ShuffleMapStage.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.CallSite

// (The enclosing ShuffleMapStage class declaration is omitted in this excerpt.)

  def removeOutputsOnExecutor(execId: String): Unit = {
    var becameUnavailable = false
    for (partition <- 0 until numPartitions) {
      val prevList = outputLocs(partition)
      val newList = prevList.filterNot(_.location.executorId == execId)
      outputLocs(partition) = newList
      // Nil is the empty List; :: prepends an element to the head of a list, creating a new list
      if (prevList != Nil && newList == Nil) {
        becameUnavailable = true
        numAvailableOutputs -= 1
      }
    }
    if (becameUnavailable) {
      logInfo("%s is now unavailable on executor %s (%d/%d, %s)".format(
        this, execId, numAvailableOutputs, numPartitions, isAvailable))
    }
  }
}
Example 6
Source File: HeartbeatReceiverSuite.scala From iolap with Apache License 2.0
package org.apache.spark

import scala.concurrent.duration._
import scala.language.postfixOps

import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockManagerId
import org.mockito.Mockito.{mock, spy, verify, when}
import org.mockito.Matchers
import org.mockito.Matchers._

import org.apache.spark.scheduler.TaskScheduler
import org.apache.spark.util.RpcUtils
import org.scalatest.concurrent.Eventually._

class HeartbeatReceiverSuite extends SparkFunSuite with LocalSparkContext {

  test("HeartbeatReceiver") {
    sc = spy(new SparkContext("local[2]", "test"))
    val scheduler = mock(classOf[TaskScheduler])
    when(scheduler.executorHeartbeatReceived(any(), any(), any())).thenReturn(true)
    when(sc.taskScheduler).thenReturn(scheduler)

    val heartbeatReceiver = new HeartbeatReceiver(sc)
    sc.env.rpcEnv.setupEndpoint("heartbeat", heartbeatReceiver).send(TaskSchedulerIsSet)
    eventually(timeout(5 seconds), interval(5 millis)) {
      assert(heartbeatReceiver.scheduler != null)
    }
    val receiverRef = RpcUtils.makeDriverRef("heartbeat", sc.conf, sc.env.rpcEnv)

    val metrics = new TaskMetrics
    val blockManagerId = BlockManagerId("executor-1", "localhost", 12345)
    val response = receiverRef.askWithRetry[HeartbeatResponse](
      Heartbeat("executor-1", Array(1L -> metrics), blockManagerId))

    verify(scheduler).executorHeartbeatReceived(
      Matchers.eq("executor-1"), Matchers.eq(Array(1L -> metrics)), Matchers.eq(blockManagerId))
    assert(false === response.reregisterBlockManager)
  }

  test("HeartbeatReceiver re-register") {
    sc = spy(new SparkContext("local[2]", "test"))
    val scheduler = mock(classOf[TaskScheduler])
    when(scheduler.executorHeartbeatReceived(any(), any(), any())).thenReturn(false)
    when(sc.taskScheduler).thenReturn(scheduler)

    val heartbeatReceiver = new HeartbeatReceiver(sc)
    sc.env.rpcEnv.setupEndpoint("heartbeat", heartbeatReceiver).send(TaskSchedulerIsSet)
    eventually(timeout(5 seconds), interval(5 millis)) {
      assert(heartbeatReceiver.scheduler != null)
    }
    val receiverRef = RpcUtils.makeDriverRef("heartbeat", sc.conf, sc.env.rpcEnv)

    val metrics = new TaskMetrics
    val blockManagerId = BlockManagerId("executor-1", "localhost", 12345)
    val response = receiverRef.askWithRetry[HeartbeatResponse](
      Heartbeat("executor-1", Array(1L -> metrics), blockManagerId))

    verify(scheduler).executorHeartbeatReceived(
      Matchers.eq("executor-1"), Matchers.eq(Array(1L -> metrics)), Matchers.eq(blockManagerId))
    assert(true === response.reregisterBlockManager)
  }
}
Example 7
Source File: MapStatusSuite.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends SparkFunSuite {

  test("compressSize") {
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") {
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
}
Example 8
Source File: ShuffleMapStage.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.CallSite

// (The enclosing ShuffleMapStage class declaration is omitted in this excerpt.)

  def removeOutputsOnExecutor(execId: String): Unit = {
    var becameUnavailable = false
    for (partition <- 0 until numPartitions) {
      val prevList = outputLocs(partition)
      val newList = prevList.filterNot(_.location.executorId == execId)
      outputLocs(partition) = newList
      if (prevList != Nil && newList == Nil) {
        becameUnavailable = true
        numAvailableOutputs -= 1
      }
    }
    if (becameUnavailable) {
      logInfo("%s is now unavailable on executor %s (%d/%d, %s)".format(
        this, execId, numAvailableOutputs, numPartitions, isAvailable))
    }
  }
}
Example 9
Source File: BlockStoreShuffleFetcher.scala From iolap with Apache License 2.0
package org.apache.spark.shuffle.hash

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.util.{Failure, Success, Try}

import org.apache.spark._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator

private[hash] object BlockStoreShuffleFetcher extends Logging {
  def fetch[T](
      shuffleId: Int,
      reduceId: Int,
      context: TaskContext,
      serializer: Serializer)
    : Iterator[T] = {
    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
    val blockManager = SparkEnv.get.blockManager

    val startTime = System.currentTimeMillis
    val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId)
    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
      shuffleId, reduceId, System.currentTimeMillis - startTime))

    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
    for (((address, size), index) <- statuses.zipWithIndex) {
      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
    }

    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map {
      case (address, splits) =>
        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
    }

    def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = {
      val blockId = blockPair._1
      val blockOption = blockPair._2
      blockOption match {
        case Success(block) => {
          block.asInstanceOf[Iterator[T]]
        }
        case Failure(e) => {
          blockId match {
            case ShuffleBlockId(shufId, mapId, _) =>
              val address = statuses(mapId.toInt)._1
              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
            case _ =>
              throw new SparkException(
                "Failed to get block " + blockId + ", which is not a shuffle block", e)
          }
        }
      }
    }

    val blockFetcherItr = new ShuffleBlockFetcherIterator(
      context,
      SparkEnv.get.blockManager.shuffleClient,
      blockManager,
      blocksByAddress,
      serializer,
      // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility
      SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024)
    val itr = blockFetcherItr.flatMap(unpackBlock)

    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
      context.taskMetrics.updateShuffleReadMetrics()
    })

    new InterruptibleIterator[T](context, completionIter) {
      val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
      override def next(): T = {
        readMetrics.incRecordsRead(1)
        delegate.next()
      }
    }
  }
}
Example 10
Source File: BlockTransferService.scala From iolap with Apache License 2.0
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
}
Example 11
Source File: ShuffleMapTask.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter
import org.apache.spark.storage.BlockManagerId

// (The ShuffleMapTask class declaration is omitted in this excerpt.)

  def this(partitionId: Int) {
    this(0, 0, null, new Partition { override def index: Int = 0 }, null, new Properties, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  var rdd: RDD[_] = null
  var dep: ShuffleDependency[_, _, _] = null

  override def prepTask(): Unit = {
    // Deserialize the RDD using the broadcast variable.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (_rdd, _dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    rdd = _rdd
    dep = _dep
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L
  }

  override def runTask(context: TaskContext): MapStatus = {
    if (dep == null || rdd == null) {
      prepTask()
    }
    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      val status = writer.stop(success = true).get
      FutureTaskNotifier.taskCompleted(status, partitionId, dep.shuffleId,
        dep.partitioner.numPartitions, nextStageLocs, metrics.shuffleWriteMetrics, false)
      status
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}

object ShuffleMapTask {
  def apply(
      stageId: Int,
      stageAttemptId: Int,
      partition: Partition,
      properties: Properties,
      internalAccumulatorsSer: Array[Byte],
      isFutureTask: Boolean,
      rdd: RDD[_],
      dep: ShuffleDependency[_, _, _],
      nextStageLocs: Option[Seq[BlockManagerId]]): ShuffleMapTask = {
    val smt = new ShuffleMapTask(stageId, stageAttemptId, null, partition, null, properties,
      internalAccumulatorsSer, isFutureTask, nextStageLocs)
    smt.rdd = rdd
    smt.dep = dep
    smt
  }
}
Example 12
Source File: MapStatusSuite.scala From SparkCore with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId
import org.scalatest.FunSuite
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends FunSuite {

  test("compressSize") {
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") {
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
}
Example 13
Source File: BlockStoreShuffleFetcher.scala From SparkCore with Apache License 2.0
package org.apache.spark.shuffle.hash

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.util.{Failure, Success, Try}

import org.apache.spark._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator

private[hash] object BlockStoreShuffleFetcher extends Logging {
  def fetch[T](
      shuffleId: Int,
      reduceId: Int,
      context: TaskContext,
      serializer: Serializer)
    : Iterator[T] = {
    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
    val blockManager = SparkEnv.get.blockManager

    val startTime = System.currentTimeMillis
    val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId)
    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
      shuffleId, reduceId, System.currentTimeMillis - startTime))

    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
    for (((address, size), index) <- statuses.zipWithIndex) {
      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
    }

    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map {
      case (address, splits) =>
        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
    }

    def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = {
      val blockId = blockPair._1
      val blockOption = blockPair._2
      blockOption match {
        case Success(block) => {
          block.asInstanceOf[Iterator[T]]
        }
        case Failure(e) => {
          blockId match {
            case ShuffleBlockId(shufId, mapId, _) =>
              val address = statuses(mapId.toInt)._1
              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
            case _ =>
              throw new SparkException(
                "Failed to get block " + blockId + ", which is not a shuffle block", e)
          }
        }
      }
    }

    val blockFetcherItr = new ShuffleBlockFetcherIterator(
      context,
      SparkEnv.get.blockManager.shuffleClient,
      blockManager,
      blocksByAddress,
      serializer,
      SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024)
    val itr = blockFetcherItr.flatMap(unpackBlock)

    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
      context.taskMetrics.updateShuffleReadMetrics()
    })

    new InterruptibleIterator[T](context, completionIter) {
      val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
      override def next(): T = {
        readMetrics.incRecordsRead(1)
        delegate.next()
      }
    }
  }
}
Example 14
Source File: BlockTransferService.scala From SparkCore with Apache License 2.0
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
}
Example 15
Source File: ExternalClusterManagerSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 16
Source File: OapMessages.scala From OAP with Apache License 2.0
package org.apache.spark.sql.oap.rpc

import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.storage.BlockManagerId

private[spark] object OapMessages {

  sealed trait OapMessage extends Serializable

  sealed trait ToOapRpcManagerSlave extends OapMessage
  sealed trait ToOapRpcManagerMaster extends OapMessage
  sealed trait Heartbeat extends ToOapRpcManagerMaster

  case class RegisterOapRpcManager(
      executorId: String, oapRpcManagerEndpoint: RpcEndpointRef) extends ToOapRpcManagerMaster

  case class DummyHeartbeat(someContent: String) extends Heartbeat
  case class FiberCacheHeartbeat(
      executorId: String, blockManagerId: BlockManagerId, content: String) extends Heartbeat
  case class FiberCacheMetricsHeartbeat(
      executorId: String, blockManagerId: BlockManagerId, content: String) extends Heartbeat
}
Example 17
Source File: ExternalClusterManagerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 18
Source File: BatchShuffleMapTask.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer
import java.util.Properties

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.BlockManagerId

private[spark] class BatchShuffleMapTask(
    stageId: Int,
    stageAttemptId: Int,
    taskBinaries: Broadcast[Array[Byte]],
    partitions: Array[Partition],
    partitionId: Int,
    @transient private var locs: Seq[TaskLocation],
    internalAccumulatorsSer: Array[Byte],
    localProperties: Properties,
    isFutureTask: Boolean,
    nextStageLocs: Option[Seq[BlockManagerId]] = None,
    depShuffleIds: Option[Seq[Seq[Int]]] = None,
    depShuffleNumMaps: Option[Seq[Int]] = None,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[Array[MapStatus]](stageId, stageAttemptId, partitionId, internalAccumulatorsSer,
    localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId)
  with BatchTask
  with Logging {

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  var rdds: Array[RDD[_]] = null
  var deps: Array[ShuffleDependency[_, _, _]] = null

  override def prepTask(): Unit = {
    // Deserialize the RDD using the broadcast variable.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rddI, depI) = ser.deserialize[(Array[RDD[_]], Array[ShuffleDependency[_, _, _]])](
      ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader)
    rdds = rddI
    deps = depI
  }

  def getTasks(): Seq[Task[Any]] = {
    if (deps == null || rdds == null) {
      prepTask()
    }
    (0 until partitions.length).map { i =>
      val s = ShuffleMapTask(stageId, stageAttemptId, partitions(i), localProperties,
        internalAccumulatorsSer, isFutureTask, rdds(i), deps(i), nextStageLocs)
      s.epoch = epoch
      s
    }.map(_.asInstanceOf[Task[Any]])
  }

  override def runTask(context: TaskContext): Array[MapStatus] = {
    throw new RuntimeException("BatchShuffleMapTasks should not be run!")
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "BatchShuffleMapTask(%d, %d)".format(stageId, partitionId)
}
Example 19
Source File: FutureTaskNotifier.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.storage.StorageLevel

private[spark] object FutureTaskNotifier extends Logging {

  def taskCompleted(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      shuffleWriteMetrics: ShuffleWriteMetrics,
      skipZeroByteNotifications: Boolean): Unit = {
    if (!nextStageLocs.isEmpty && numReduces == nextStageLocs.get.length) {
      val drizzleRpcsStart = System.nanoTime
      sendMapStatusToNextTaskLocations(status, mapId, shuffleId, numReduces, nextStageLocs,
        skipZeroByteNotifications)
      shuffleWriteMetrics.incWriteTime(System.nanoTime - drizzleRpcsStart)
    } else {
      logInfo(
        s"No taskCompletion next: ${nextStageLocs.map(_.length).getOrElse(0)} r: $numReduces")
    }
  }

  // Push metadata saying that this map task finished, so that the tasks in the next stage
  // know they can begin pulling the data.
  private def sendMapStatusToNextTaskLocations(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      skipZeroByteNotifications: Boolean) {
    val numReduces = nextStageLocs.get.length
    val uniqueLocations = if (skipZeroByteNotifications) {
      nextStageLocs.get.zipWithIndex.filter { x =>
        status.getSizeForBlock(x._2) != 0L
      }.map(_._1).toSet
    } else {
      nextStageLocs.get.toSet
    }
    uniqueLocations.foreach { blockManagerId =>
      try {
        SparkEnv.get.blockManager.blockTransferService.mapOutputReady(
          blockManagerId.host, blockManagerId.port, shuffleId, mapId, numReduces, status)
      } catch {
        case e: Exception =>
          logWarning(s"Failed to send map outputs to $blockManagerId", e)
      }
    }
  }
}