org.apache.spark.storage.BlockManagerId Scala Examples
The following examples show how to use org.apache.spark.storage.BlockManagerId.
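Before the examples, here is a minimal sketch (not taken from any of the projects below) of what a BlockManagerId carries: an executor ID, a host, and a port. It only assumes the factory call and accessors that the test suites below already use (BlockManagerId(execId, host, port), executorId, host, port, hostPort). In the Spark versions shown here the factory is Spark-internal (private[spark]), which is why all of the examples live inside org.apache.spark packages.

package org.apache.spark.storage

// Minimal sketch (assumed setup, not from the examples below): construct a BlockManagerId
// the same way the test suites do, and read back the fields that identify the block manager.
object BlockManagerIdSketch {
  def main(args: Array[String]): Unit = {
    // Same factory call the suites use, e.g. BlockManagerId("a", "b", 10)
    val id: BlockManagerId = BlockManagerId("executor-1", "localhost", 12345)

    println(id.executorId)  // executor-1
    println(id.host)        // localhost
    println(id.port)        // 12345
    println(id.hostPort)    // localhost:12345
  }
}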
Example 1
Source File: ExternalClusterManagerSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 2
Source File: BlockTransferService.scala From BigDatalog with Apache License 2.0
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
}
Example 3
Source File: ExternalClusterManagerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def schedulingMode: SchedulingMode = SchedulingMode.FIFO
  override def rootPool: Pool = new Pool("", schedulingMode, 0, 0)
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def killTaskAttempt(
    taskId: Long, interruptThread: Boolean, reason: String): Boolean = false
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def workerRemoved(workerId: String, host: String, message: String): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 4
Source File: MapStatusSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends SparkFunSuite {

  test("compressSize") { // compress size
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") { // decompress size
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  // MapStatus should never report non-empty blocks' sizes as 0
  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  // Large tasks should use HighlyCompressedMapStatus
  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  // HighlyCompressedMapStatus: the estimated size should be the average non-empty block size
  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
}
Example 5
Source File: ShuffleMapStage.scala From spark1.52 with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.CallSite

// (The enclosing ShuffleMapStage class declaration is omitted in this excerpt.)

  def removeOutputsOnExecutor(execId: String): Unit = {
    var becameUnavailable = false
    for (partition <- 0 until numPartitions) {
      val prevList = outputLocs(partition)
      val newList = prevList.filterNot(_.location.executorId == execId)
      outputLocs(partition) = newList
      // Nil is the empty List; :: prepends an element to the head of a list, creating a new list
      if (prevList != Nil && newList == Nil) {
        becameUnavailable = true
        numAvailableOutputs -= 1
      }
    }
    if (becameUnavailable) {
      logInfo("%s is now unavailable on executor %s (%d/%d, %s)".format(
        this, execId, numAvailableOutputs, numPartitions, isAvailable))
    }
  }
}
Example 6
Source File: HeartbeatReceiverSuite.scala From iolap with Apache License 2.0
package org.apache.spark

import scala.concurrent.duration._
import scala.language.postfixOps

import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.BlockManagerId
import org.mockito.Mockito.{mock, spy, verify, when}
import org.mockito.Matchers
import org.mockito.Matchers._

import org.apache.spark.scheduler.TaskScheduler
import org.apache.spark.util.RpcUtils
import org.scalatest.concurrent.Eventually._

class HeartbeatReceiverSuite extends SparkFunSuite with LocalSparkContext {

  test("HeartbeatReceiver") {
    sc = spy(new SparkContext("local[2]", "test"))
    val scheduler = mock(classOf[TaskScheduler])
    when(scheduler.executorHeartbeatReceived(any(), any(), any())).thenReturn(true)
    when(sc.taskScheduler).thenReturn(scheduler)

    val heartbeatReceiver = new HeartbeatReceiver(sc)
    sc.env.rpcEnv.setupEndpoint("heartbeat", heartbeatReceiver).send(TaskSchedulerIsSet)
    eventually(timeout(5 seconds), interval(5 millis)) {
      assert(heartbeatReceiver.scheduler != null)
    }
    val receiverRef = RpcUtils.makeDriverRef("heartbeat", sc.conf, sc.env.rpcEnv)

    val metrics = new TaskMetrics
    val blockManagerId = BlockManagerId("executor-1", "localhost", 12345)
    val response = receiverRef.askWithRetry[HeartbeatResponse](
      Heartbeat("executor-1", Array(1L -> metrics), blockManagerId))

    verify(scheduler).executorHeartbeatReceived(
      Matchers.eq("executor-1"), Matchers.eq(Array(1L -> metrics)), Matchers.eq(blockManagerId))
    assert(false === response.reregisterBlockManager)
  }

  test("HeartbeatReceiver re-register") {
    sc = spy(new SparkContext("local[2]", "test"))
    val scheduler = mock(classOf[TaskScheduler])
    when(scheduler.executorHeartbeatReceived(any(), any(), any())).thenReturn(false)
    when(sc.taskScheduler).thenReturn(scheduler)

    val heartbeatReceiver = new HeartbeatReceiver(sc)
    sc.env.rpcEnv.setupEndpoint("heartbeat", heartbeatReceiver).send(TaskSchedulerIsSet)
    eventually(timeout(5 seconds), interval(5 millis)) {
      assert(heartbeatReceiver.scheduler != null)
    }
    val receiverRef = RpcUtils.makeDriverRef("heartbeat", sc.conf, sc.env.rpcEnv)

    val metrics = new TaskMetrics
    val blockManagerId = BlockManagerId("executor-1", "localhost", 12345)
    val response = receiverRef.askWithRetry[HeartbeatResponse](
      Heartbeat("executor-1", Array(1L -> metrics), blockManagerId))

    verify(scheduler).executorHeartbeatReceived(
      Matchers.eq("executor-1"), Matchers.eq(Array(1L -> metrics)), Matchers.eq(blockManagerId))
    assert(true === response.reregisterBlockManager)
  }
}
Example 7
Source File: MapStatusSuite.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends SparkFunSuite {

  test("compressSize") {
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") {
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
}
Example 8
Source File: ShuffleMapStage.scala From iolap with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.ShuffleDependency
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.CallSite

// (The enclosing ShuffleMapStage class declaration is omitted in this excerpt.)

  def removeOutputsOnExecutor(execId: String): Unit = {
    var becameUnavailable = false
    for (partition <- 0 until numPartitions) {
      val prevList = outputLocs(partition)
      val newList = prevList.filterNot(_.location.executorId == execId)
      outputLocs(partition) = newList
      if (prevList != Nil && newList == Nil) {
        becameUnavailable = true
        numAvailableOutputs -= 1
      }
    }
    if (becameUnavailable) {
      logInfo("%s is now unavailable on executor %s (%d/%d, %s)".format(
        this, execId, numAvailableOutputs, numPartitions, isAvailable))
    }
  }
}
Example 9
Source File: BlockStoreShuffleFetcher.scala From iolap with Apache License 2.0
package org.apache.spark.shuffle.hash

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.util.{Failure, Success, Try}

import org.apache.spark._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator

private[hash] object BlockStoreShuffleFetcher extends Logging {
  def fetch[T](
      shuffleId: Int,
      reduceId: Int,
      context: TaskContext,
      serializer: Serializer)
    : Iterator[T] = {
    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
    val blockManager = SparkEnv.get.blockManager

    val startTime = System.currentTimeMillis
    val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId)
    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
      shuffleId, reduceId, System.currentTimeMillis - startTime))

    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
    for (((address, size), index) <- statuses.zipWithIndex) {
      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
    }

    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map {
      case (address, splits) =>
        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
    }

    def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = {
      val blockId = blockPair._1
      val blockOption = blockPair._2
      blockOption match {
        case Success(block) => {
          block.asInstanceOf[Iterator[T]]
        }
        case Failure(e) => {
          blockId match {
            case ShuffleBlockId(shufId, mapId, _) =>
              val address = statuses(mapId.toInt)._1
              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
            case _ =>
              throw new SparkException(
                "Failed to get block " + blockId + ", which is not a shuffle block", e)
          }
        }
      }
    }

    val blockFetcherItr = new ShuffleBlockFetcherIterator(
      context,
      SparkEnv.get.blockManager.shuffleClient,
      blockManager,
      blocksByAddress,
      serializer,
      // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility
      SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024)
    val itr = blockFetcherItr.flatMap(unpackBlock)

    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
      context.taskMetrics.updateShuffleReadMetrics()
    })

    new InterruptibleIterator[T](context, completionIter) {
      val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
      override def next(): T = {
        readMetrics.incRecordsRead(1)
        delegate.next()
      }
    }
  }
}
Example 10
Source File: BlockTransferService.scala From iolap with Apache License 2.0
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
}
Example 11
Source File: ShuffleMapTask.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.lang.management.ManagementFactory
import java.nio.ByteBuffer
import java.util.Properties

import scala.language.existentials

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.shuffle.ShuffleWriter
import org.apache.spark.storage.BlockManagerId

// (The ShuffleMapTask class declaration is omitted in this excerpt.)

  def this(partitionId: Int) {
    this(0, 0, null, new Partition { override def index: Int = 0 }, null, new Properties, null)
  }

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  var rdd: RDD[_] = null
  var dep: ShuffleDependency[_, _, _] = null

  override def prepTask(): Unit = {
    // Deserialize the RDD using the broadcast variable.
    val threadMXBean = ManagementFactory.getThreadMXBean
    val deserializeStartTime = System.currentTimeMillis()
    val deserializeStartCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime
    } else 0L
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (_rdd, _dep) = ser.deserialize[(RDD[_], ShuffleDependency[_, _, _])](
      ByteBuffer.wrap(taskBinary.value), Thread.currentThread.getContextClassLoader)
    rdd = _rdd
    dep = _dep
    _executorDeserializeTime = System.currentTimeMillis() - deserializeStartTime
    _executorDeserializeCpuTime = if (threadMXBean.isCurrentThreadCpuTimeSupported) {
      threadMXBean.getCurrentThreadCpuTime - deserializeStartCpuTime
    } else 0L
  }

  override def runTask(context: TaskContext): MapStatus = {
    if (dep == null || rdd == null) {
      prepTask()
    }
    var writer: ShuffleWriter[Any, Any] = null
    try {
      val manager = SparkEnv.get.shuffleManager
      writer = manager.getWriter[Any, Any](dep.shuffleHandle, partitionId, context)
      writer.write(rdd.iterator(partition, context).asInstanceOf[Iterator[_ <: Product2[Any, Any]]])
      val status = writer.stop(success = true).get
      FutureTaskNotifier.taskCompleted(status, partitionId, dep.shuffleId,
        dep.partitioner.numPartitions, nextStageLocs, metrics.shuffleWriteMetrics, false)
      status
    } catch {
      case e: Exception =>
        try {
          if (writer != null) {
            writer.stop(success = false)
          }
        } catch {
          case e: Exception =>
            log.debug("Could not stop writer", e)
        }
        throw e
    }
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "ShuffleMapTask(%d, %d)".format(stageId, partitionId)
}

object ShuffleMapTask {
  def apply(
      stageId: Int,
      stageAttemptId: Int,
      partition: Partition,
      properties: Properties,
      internalAccumulatorsSer: Array[Byte],
      isFutureTask: Boolean,
      rdd: RDD[_],
      dep: ShuffleDependency[_, _, _],
      nextStageLocs: Option[Seq[BlockManagerId]]): ShuffleMapTask = {
    val smt = new ShuffleMapTask(stageId, stageAttemptId, null, partition, null, properties,
      internalAccumulatorsSer, isFutureTask, nextStageLocs)
    smt.rdd = rdd
    smt.dep = dep
    smt
  }
}
Example 12
Source File: MapStatusSuite.scala From SparkCore with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId
import org.scalatest.FunSuite
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends FunSuite {

  test("compressSize") {
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") {
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
}
Example 13
Source File: BlockStoreShuffleFetcher.scala From SparkCore with Apache License 2.0
package org.apache.spark.shuffle.hash

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.util.{Failure, Success, Try}

import org.apache.spark._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator

private[hash] object BlockStoreShuffleFetcher extends Logging {
  def fetch[T](
      shuffleId: Int,
      reduceId: Int,
      context: TaskContext,
      serializer: Serializer)
    : Iterator[T] = {
    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
    val blockManager = SparkEnv.get.blockManager

    val startTime = System.currentTimeMillis
    val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId)
    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
      shuffleId, reduceId, System.currentTimeMillis - startTime))

    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
    for (((address, size), index) <- statuses.zipWithIndex) {
      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
    }

    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map {
      case (address, splits) =>
        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
    }

    def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = {
      val blockId = blockPair._1
      val blockOption = blockPair._2
      blockOption match {
        case Success(block) => {
          block.asInstanceOf[Iterator[T]]
        }
        case Failure(e) => {
          blockId match {
            case ShuffleBlockId(shufId, mapId, _) =>
              val address = statuses(mapId.toInt)._1
              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
            case _ =>
              throw new SparkException(
                "Failed to get block " + blockId + ", which is not a shuffle block", e)
          }
        }
      }
    }

    val blockFetcherItr = new ShuffleBlockFetcherIterator(
      context,
      SparkEnv.get.blockManager.shuffleClient,
      blockManager,
      blocksByAddress,
      serializer,
      SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024)
    val itr = blockFetcherItr.flatMap(unpackBlock)

    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
      context.taskMetrics.updateShuffleReadMetrics()
    })

    new InterruptibleIterator[T](context, completionIter) {
      val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
      override def next(): T = {
        readMetrics.incRecordsRead(1)
        delegate.next()
      }
    }
  }
}
Example 14
Source File: BlockTransferService.scala From SparkCore with Apache License 2.0
package org.apache.spark.network

import java.io.Closeable
import java.nio.ByteBuffer

import scala.concurrent.{Promise, Await, Future}
import scala.concurrent.duration.Duration

import org.apache.spark.Logging
import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer}
import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener}
import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel}

private[spark]
abstract class BlockTransferService extends ShuffleClient with Closeable with Logging {

  def uploadBlockSync(
      hostname: String,
      port: Int,
      execId: String,
      blockId: BlockId,
      blockData: ManagedBuffer,
      level: StorageLevel): Unit = {
    Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf)
  }
}
Example 15
Source File: ExternalClusterManagerSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 16
Source File: OapMessages.scala From OAP with Apache License 2.0
package org.apache.spark.sql.oap.rpc

import org.apache.spark.rpc.RpcEndpointRef
import org.apache.spark.storage.BlockManagerId

private[spark] object OapMessages {

  sealed trait OapMessage extends Serializable

  sealed trait ToOapRpcManagerSlave extends OapMessage
  sealed trait ToOapRpcManagerMaster extends OapMessage
  sealed trait Heartbeat extends ToOapRpcManagerMaster

  case class RegisterOapRpcManager(
      executorId: String, oapRpcManagerEndpoint: RpcEndpointRef) extends ToOapRpcManagerMaster

  case class DummyHeartbeat(someContent: String) extends Heartbeat
  case class FiberCacheHeartbeat(
      executorId: String, blockManagerId: BlockManagerId, content: String) extends Heartbeat
  case class FiberCacheMetricsHeartbeat(
      executorId: String, blockManagerId: BlockManagerId, content: String) extends Heartbeat
}
Example 17
Source File: ExternalClusterManagerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.SchedulingMode.SchedulingMode
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.util.AccumulatorV2

class ExternalClusterManagerSuite extends SparkFunSuite with LocalSparkContext {
  test("launch of backend and scheduler") {
    val conf = new SparkConf().setMaster("myclusterManager").
      setAppName("testcm").set("spark.driver.allowMultipleContexts", "true")
    sc = new SparkContext(conf)
    // check if the scheduler components are created and initialized
    sc.schedulerBackend match {
      case dummy: DummySchedulerBackend => assert(dummy.initialized)
      case other => fail(s"wrong scheduler backend: ${other}")
    }
    sc.taskScheduler match {
      case dummy: DummyTaskScheduler => assert(dummy.initialized)
      case other => fail(s"wrong task scheduler: ${other}")
    }
  }
}

private class DummyExternalClusterManager extends ExternalClusterManager {

  def canCreate(masterURL: String): Boolean = masterURL == "myclusterManager"

  def createTaskScheduler(sc: SparkContext, masterURL: String): TaskScheduler =
    new DummyTaskScheduler

  def createSchedulerBackend(
      sc: SparkContext,
      masterURL: String,
      scheduler: TaskScheduler): SchedulerBackend = new DummySchedulerBackend()

  def initialize(scheduler: TaskScheduler, backend: SchedulerBackend): Unit = {
    scheduler.asInstanceOf[DummyTaskScheduler].initialized = true
    backend.asInstanceOf[DummySchedulerBackend].initialized = true
  }
}

private class DummySchedulerBackend extends SchedulerBackend {
  var initialized = false
  def start() {}
  def stop() {}
  def reviveOffers() {}
  def defaultParallelism(): Int = 1
}

private class DummyTaskScheduler extends TaskScheduler {
  var initialized = false
  override def rootPool: Pool = null
  override def schedulingMode: SchedulingMode = SchedulingMode.NONE
  override def start(): Unit = {}
  override def stop(): Unit = {}
  override def submitTasks(taskSet: TaskSet): Unit = {}
  override def cancelTasks(stageId: Int, interruptThread: Boolean): Unit = {}
  override def setDAGScheduler(dagScheduler: DAGScheduler): Unit = {}
  override def defaultParallelism(): Int = 2
  override def executorLost(executorId: String, reason: ExecutorLossReason): Unit = {}
  override def applicationAttemptId(): Option[String] = None
  def executorHeartbeatReceived(
      execId: String,
      accumUpdates: Array[(Long, Seq[AccumulatorV2[_, _]])],
      blockManagerId: BlockManagerId): Boolean = true
}
Example 18
Source File: BatchShuffleMapTask.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import java.io._
import java.nio.ByteBuffer
import java.util.Properties

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.broadcast.Broadcast
import org.apache.spark.internal.Logging
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.BlockManagerId

private[spark] class BatchShuffleMapTask(
    stageId: Int,
    stageAttemptId: Int,
    taskBinaries: Broadcast[Array[Byte]],
    partitions: Array[Partition],
    partitionId: Int,
    @transient private var locs: Seq[TaskLocation],
    internalAccumulatorsSer: Array[Byte],
    localProperties: Properties,
    isFutureTask: Boolean,
    nextStageLocs: Option[Seq[BlockManagerId]] = None,
    depShuffleIds: Option[Seq[Seq[Int]]] = None,
    depShuffleNumMaps: Option[Seq[Int]] = None,
    jobId: Option[Int] = None,
    appId: Option[String] = None,
    appAttemptId: Option[String] = None)
  extends Task[Array[MapStatus]](stageId, stageAttemptId, partitionId, internalAccumulatorsSer,
    localProperties, isFutureTask, depShuffleIds, depShuffleNumMaps, jobId, appId, appAttemptId)
  with BatchTask
  with Logging {

  @transient private val preferredLocs: Seq[TaskLocation] = {
    if (locs == null) Nil else locs.toSet.toSeq
  }

  var rdds: Array[RDD[_]] = null
  var deps: Array[ShuffleDependency[_, _, _]] = null

  override def prepTask(): Unit = {
    // Deserialize the RDD using the broadcast variable.
    val ser = SparkEnv.get.closureSerializer.newInstance()
    val (rddI, depI) = ser.deserialize[(Array[RDD[_]], Array[ShuffleDependency[_, _, _]])](
      ByteBuffer.wrap(taskBinaries.value), Thread.currentThread.getContextClassLoader)
    rdds = rddI
    deps = depI
  }

  def getTasks(): Seq[Task[Any]] = {
    if (deps == null || rdds == null) {
      prepTask()
    }
    (0 until partitions.length).map { i =>
      val s = ShuffleMapTask(stageId, stageAttemptId, partitions(i), localProperties,
        internalAccumulatorsSer, isFutureTask, rdds(i), deps(i), nextStageLocs)
      s.epoch = epoch
      s
    }.map(_.asInstanceOf[Task[Any]])
  }

  override def runTask(context: TaskContext): Array[MapStatus] = {
    throw new RuntimeException("BatchShuffleMapTasks should not be run!")
  }

  override def preferredLocations: Seq[TaskLocation] = preferredLocs

  override def toString: String = "BatchShuffleMapTask(%d, %d)".format(stageId, partitionId)
}
Example 19
Source File: FutureTaskNotifier.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.storage.StorageLevel

private[spark] object FutureTaskNotifier extends Logging {

  def taskCompleted(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      shuffleWriteMetrics: ShuffleWriteMetrics,
      skipZeroByteNotifications: Boolean): Unit = {
    if (!nextStageLocs.isEmpty && numReduces == nextStageLocs.get.length) {
      val drizzleRpcsStart = System.nanoTime
      sendMapStatusToNextTaskLocations(status, mapId, shuffleId, numReduces, nextStageLocs,
        skipZeroByteNotifications)
      shuffleWriteMetrics.incWriteTime(System.nanoTime - drizzleRpcsStart)
    } else {
      logInfo(
        s"No taskCompletion next: ${nextStageLocs.map(_.length).getOrElse(0)} r: $numReduces")
    }
  }

  // Push metadata saying that this map task finished, so that the tasks in the next stage
  // know they can begin pulling the data.
  private def sendMapStatusToNextTaskLocations(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      skipZeroByteNotifications: Boolean) {
    val numReduces = nextStageLocs.get.length
    val uniqueLocations = if (skipZeroByteNotifications) {
      nextStageLocs.get.zipWithIndex.filter { x =>
        status.getSizeForBlock(x._2) != 0L
      }.map(_._1).toSet
    } else {
      nextStageLocs.get.toSet
    }
    uniqueLocations.foreach { blockManagerId =>
      try {
        SparkEnv.get.blockManager.blockTransferService.mapOutputReady(
          blockManagerId.host, blockManagerId.port, shuffleId, mapId, numReduces, status)
      } catch {
        case e: Exception =>
          logWarning(s"Failed to send map outputs to $blockManagerId", e)
      }
    }
  }
}