org.apache.spark.storage.BlockManager Scala Examples
The following examples show how to use org.apache.spark.storage.BlockManager.
Each example notes its source file and the open-source project it comes from, so you can trace it back to the original code.
Example 1
Source File: FutureTaskWaiter.scala, from drizzle-spark (Apache License 2.0)
package org.apache.spark.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.internal.Logging
import org.apache.spark.MapOutputTracker
import org.apache.spark.SparkConf
import org.apache.spark.storage.BlockManager
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.TimeStampedHashMap

private[spark] case class FutureTaskInfo(
    shuffleId: Int,
    numMaps: Int,
    reduceId: Int,
    taskId: Long,
    nonZeroPartitions: Option[Array[Int]],
    taskCb: () => Unit)

private[spark] class FutureTaskWaiter(
    conf: SparkConf,
    blockManager: BlockManager,
    mapOutputTracker: MapOutputTracker) extends Logging {

  // Key is (shuffleId, reduceId)
  private val futureTaskInfo = new TimeStampedHashMap[(Int, Int), FutureTaskInfo]
  // Key is (shuffleId, reduceId), value is the set of blockIds we are waiting for
  private val futureTasksBlockWait = new TimeStampedHashMap[(Int, Int), HashSet[Int]]

  def submitFutureTask(info: FutureTaskInfo) {
    futureTasksBlockWait.synchronized {
      val blocksToWaitFor = if (info.nonZeroPartitions.isDefined) {
        info.nonZeroPartitions.get.toSet
      } else {
        (0 until info.numMaps).toArray.toSet
      }

      // Check if all the blocks already exist. If so, just trigger taskCb.
      // Count how many outputs have been registered with the MapOutputTracker for this shuffle
      // and intersect with blocksToWaitFor to only get how many for this reduce are available.
      val availableBlocks =
        mapOutputTracker.getAvailableMapOutputs(info.shuffleId).intersect(blocksToWaitFor)
      val mapsToWait = blocksToWaitFor.size
      val numMapsPending = blocksToWaitFor.size - availableBlocks.size

      if (availableBlocks.size >= mapsToWait) {
        info.taskCb()
      } else {
        futureTaskInfo.put((info.shuffleId, info.reduceId), info)
        // NOTE: It's fine not to synchronize here as two future tasks shouldn't be submitted at
        // the same time. Calculate the number of blocks to wait for before starting the future
        // task.
        val waitForBlocks = blocksToWaitFor.diff(availableBlocks)
        futureTasksBlockWait.put(
          (info.shuffleId, info.reduceId), new HashSet[Int]() ++ waitForBlocks)
      }
    }
  }

  def shuffleBlockReady(shuffleBlockId: ShuffleBlockId): Unit = {
    val key = (shuffleBlockId.shuffleId, shuffleBlockId.reduceId)
    futureTasksBlockWait.synchronized {
      if (futureTaskInfo.contains(key)) {
        if (futureTasksBlockWait.contains(key)) {
          futureTasksBlockWait(key) -= shuffleBlockId.mapId
          // If we have all the blocks, run the callback.
          if (futureTasksBlockWait(key).size <= 0) {
            val cb = futureTaskInfo(key).taskCb
            futureTasksBlockWait.remove(key)
            futureTaskInfo.remove(key)
            cb()
          }
        }
      }
    }
  }

  def addMapStatusAvailable(shuffleId: Int, mapId: Int, numReduces: Int, mapStatus: MapStatus) {
    // NOTE: This should be done before we trigger future tasks.
    mapOutputTracker.addStatus(shuffleId, mapId, mapStatus)
    futureTasksBlockWait.synchronized {
      // Register the output for each reduce task.
      (0 until numReduces).foreach { reduceId =>
        shuffleBlockReady(new ShuffleBlockId(shuffleId, mapId, reduceId))
      }
    }
  }
}
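At its core, FutureTaskWaiter is bookkeeping: each pending reduce task waits on a set of map-output ids, and its callback fires once that set has drained. The standalone sketch below reproduces just that pattern with no Spark dependencies; the names (Waiter, Key, submit, blockReady) are illustrative and are not part of the drizzle-spark API.

import scala.collection.mutable

object WaiterSketch {

  // (shuffleId, reduceId) identifies one pending reduce task.
  final case class Key(shuffleId: Int, reduceId: Int)

  final class Waiter {
    private val callbacks = mutable.Map.empty[Key, () => Unit]
    private val pending = mutable.Map.empty[Key, mutable.Set[Int]]

    // Register a task that should run once every map output in mapsToWaitFor exists.
    def submit(key: Key, mapsToWaitFor: Set[Int], cb: () => Unit): Unit = synchronized {
      if (mapsToWaitFor.isEmpty) {
        cb()
      } else {
        callbacks(key) = cb
        pending(key) = mutable.Set.empty[Int] ++ mapsToWaitFor
      }
    }

    // Called whenever one map output (shuffle block) becomes available.
    def blockReady(key: Key, mapId: Int): Unit = synchronized {
      pending.get(key).foreach { waiting =>
        waiting -= mapId
        if (waiting.isEmpty) {
          pending.remove(key)
          callbacks.remove(key).foreach(_.apply())
        }
      }
    }
  }

  def main(args: Array[String]): Unit = {
    val waiter = new Waiter
    waiter.submit(Key(0, 0), Set(0, 1), () => println("reduce (0, 0) can start"))
    waiter.blockReady(Key(0, 0), 0)
    waiter.blockReady(Key(0, 0), 1) // the callback fires here
  }
}

As in FutureTaskWaiter, all mutation happens under a single lock, so a block arriving concurrently with a submission cannot lose the callback.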
Example 2
Source File: BlockRDD.scala, from drizzle-spark (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
  @volatile private var _isValid = true

  def isValid: Boolean = _isValid

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.length).map { i =>
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val blockManager = SparkEnv.get.blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get[T](blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception("Could not compute split, block " + blockId + " not found")
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    _locations(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    _locations
  }
}
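BlockRDD simply re-reads blocks that the BlockManager already holds, so one way to exercise it is to persist an RDD and point a BlockRDD at its rdd_<rddId>_<partitionIndex> blocks. The driver program below is a hypothetical sketch of that idea, written against a Spark 2.x-style API rather than taken from drizzle-spark, and it has to sit under the org.apache.spark package tree because BlockRDD is private[spark].

package org.apache.spark.rdd

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.{BlockId, RDDBlockId, StorageLevel}

object BlockRDDSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setAppName("block-rdd-sketch").setMaster("local[2]"))

    // Persist an RDD so its partitions are registered with the local BlockManager.
    val cached = sc.parallelize(1 to 100, numSlices = 2).persist(StorageLevel.MEMORY_ONLY)
    cached.count() // materialize the blocks

    // Cached RDD blocks are keyed as rdd_<rddId>_<partitionIndex>.
    val blockIds: Array[BlockId] =
      (0 until cached.getNumPartitions).map(i => RDDBlockId(cached.id, i): BlockId).toArray

    // Re-read those blocks directly through the BlockManager.
    val fromBlocks = new BlockRDD[Int](sc, blockIds)
    println(fromBlocks.collect().length) // expected: 100

    sc.stop()
  }
}

In practice BlockRDDs are constructed by Spark internals (for example by Spark Streaming over received blocks) rather than by user code, so treat this purely as a way to see compute() and getPreferredLocations() in action.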
Example 3
Source File: OapRpcManagerSlave.scala, from OAP (Apache License 2.0)
package org.apache.spark.sql.oap.rpc

import java.util.concurrent.TimeUnit

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv, ThreadSafeRpcEndpoint}
import org.apache.spark.sql.execution.datasources.oap.filecache.{CacheStats, FiberCacheManager}
import org.apache.spark.sql.internal.oap.OapConf
import org.apache.spark.sql.oap.adapter.RpcEndpointRefAdapter
import org.apache.spark.sql.oap.rpc.OapMessages._
import org.apache.spark.storage.BlockManager
import org.apache.spark.util.{ThreadUtils, Utils}

private[spark] class OapRpcManagerSlave(
    rpcEnv: RpcEnv,
    val driverEndpoint: RpcEndpointRef,
    executorId: String,
    blockManager: BlockManager,
    fiberCacheManager: FiberCacheManager,
    conf: SparkConf) extends OapRpcManager {

  // Periodically sends heartbeat messages to the driver.
  private val oapHeartbeater =
    ThreadUtils.newDaemonSingleThreadScheduledExecutor("driver-heartbeater")

  private val slaveEndpoint = rpcEnv.setupEndpoint(
    s"OapRpcManagerSlave_$executorId", new OapRpcManagerSlaveEndpoint(rpcEnv, fiberCacheManager))

  initialize()
  startOapHeartbeater()

  protected def heartbeatMessages: Array[() => Heartbeat] = {
    Array(
      () => FiberCacheHeartbeat(
        executorId, blockManager.blockManagerId, fiberCacheManager.status()),
      () => FiberCacheMetricsHeartbeat(executorId, blockManager.blockManagerId,
        CacheStats.status(fiberCacheManager.cacheStats, conf)))
  }

  private def initialize() = {
    RpcEndpointRefAdapter.askSync[Boolean](
      driverEndpoint, RegisterOapRpcManager(executorId, slaveEndpoint))
  }

  override private[spark] def send(message: OapMessage): Unit = {
    driverEndpoint.send(message)
  }

  private[sql] def startOapHeartbeater(): Unit = {

    def reportHeartbeat(): Unit = {
      // OapRpcManagerSlave is created in SparkEnv. Before we start the heartbeat, we need to
      // make sure the SparkEnv has been created and the block manager has been initialized. We
      // check blockManagerId as it will be set after initialization.
      if (blockManager.blockManagerId != null) {
        heartbeatMessages.map(_.apply()).foreach(send)
      }
    }

    val intervalMs = conf.getTimeAsMs(
      OapConf.OAP_HEARTBEAT_INTERVAL.key, OapConf.OAP_HEARTBEAT_INTERVAL.defaultValue.get)

    // Wait a random interval so the heartbeats don't end up in sync
    val initialDelay = intervalMs + (math.random * intervalMs).asInstanceOf[Int]

    val heartbeatTask = new Runnable() {
      override def run(): Unit = Utils.logUncaughtExceptions(reportHeartbeat())
    }
    oapHeartbeater.scheduleAtFixedRate(
      heartbeatTask, initialDelay, intervalMs, TimeUnit.MILLISECONDS)
  }

  override private[spark] def stop(): Unit = {
    oapHeartbeater.shutdown()
  }
}

private[spark] class OapRpcManagerSlaveEndpoint(
    override val rpcEnv: RpcEnv, fiberCacheManager: FiberCacheManager)
  extends ThreadSafeRpcEndpoint with Logging {

  override def receive: PartialFunction[Any, Unit] = {
    case message: OapMessage => handleOapMessage(message)
    case _ =>
  }

  private def handleOapMessage(message: OapMessage): Unit = message match {
    case CacheDrop(indexName) => fiberCacheManager.releaseIndexCache(indexName)
    case _ =>
  }
}
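The heartbeat wiring above reduces to three pieces: a single daemon scheduler thread, a randomized initial delay so executors do not all report in lockstep, and a task that only sends once the BlockManager is initialized. Below is a minimal, Spark-free sketch of that scheduling pattern; the names are illustrative and none of them are OAP or Spark APIs.

import java.util.concurrent.{Executors, ThreadFactory, TimeUnit}

object HeartbeatSketch {
  def main(args: Array[String]): Unit = {
    // One daemon thread, mirroring ThreadUtils.newDaemonSingleThreadScheduledExecutor.
    val scheduler = Executors.newSingleThreadScheduledExecutor(new ThreadFactory {
      override def newThread(r: Runnable): Thread = {
        val t = new Thread(r, "heartbeat-sketch")
        t.setDaemon(true)
        t
      }
    })

    val intervalMs = 1000L
    // Stagger the first report by up to one extra interval, as OapRpcManagerSlave does.
    val initialDelayMs = intervalMs + (scala.util.Random.nextDouble() * intervalMs).toLong

    val heartbeatTask = new Runnable {
      override def run(): Unit = println(s"heartbeat at ${System.currentTimeMillis()}")
    }
    scheduler.scheduleAtFixedRate(heartbeatTask, initialDelayMs, intervalMs, TimeUnit.MILLISECONDS)

    Thread.sleep(5000) // let a few heartbeats fire, then shut down
    scheduler.shutdown()
  }
}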
Example 4
Source File: BlockRDD.scala, from sparkoscope (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
  @volatile private var _isValid = true

  def isValid: Boolean = _isValid

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.length).map { i =>
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val blockManager = SparkEnv.get.blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get[T](blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception("Could not compute split, block " + blockId + " not found")
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    _locations(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    _locations
  }
}
Example 5
Source File: BufferMessage.scala, from SparkCore (Apache License 2.0)
package org.apache.spark.network.nio

import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.storage.BlockManager

private[nio] class BufferMessage(id_ : Int, val buffers: ArrayBuffer[ByteBuffer], var ackId: Int)
  extends Message(Message.BUFFER_MESSAGE, id_) {

  val initialSize = currentSize()
  var gotChunkForSendingOnce = false

  def size = initialSize

  def currentSize() = {
    if (buffers == null || buffers.isEmpty) {
      0
    } else {
      buffers.map(_.remaining).reduceLeft(_ + _)
    }
  }

  def getChunkForSending(maxChunkSize: Int): Option[MessageChunk] = {
    if (maxChunkSize <= 0) {
      throw new Exception("Max chunk size is " + maxChunkSize)
    }

    val security = if (isSecurityNeg) 1 else 0
    if (size == 0 && !gotChunkForSendingOnce) {
      val newChunk = new MessageChunk(
        new MessageChunkHeader(typ, id, 0, 0, ackId, hasError, security, senderAddress), null)
      gotChunkForSendingOnce = true
      return Some(newChunk)
    }

    while (!buffers.isEmpty) {
      val buffer = buffers(0)
      if (buffer.remaining == 0) {
        BlockManager.dispose(buffer)
        buffers -= buffer
      } else {
        val newBuffer = if (buffer.remaining <= maxChunkSize) {
          buffer.duplicate()
        } else {
          buffer.slice().limit(maxChunkSize).asInstanceOf[ByteBuffer]
        }
        buffer.position(buffer.position + newBuffer.remaining)
        val newChunk = new MessageChunk(new MessageChunkHeader(
          typ, id, size, newBuffer.remaining, ackId, hasError, security, senderAddress), newBuffer)
        gotChunkForSendingOnce = true
        return Some(newChunk)
      }
    }
    None
  }

  def getChunkForReceiving(chunkSize: Int): Option[MessageChunk] = {
    // STRONG ASSUMPTION: BufferMessage created when receiving data has ONLY ONE data buffer
    if (buffers.size > 1) {
      throw new Exception("Attempting to get chunk from message with multiple data buffers")
    }
    val buffer = buffers(0)
    val security = if (isSecurityNeg) 1 else 0
    if (buffer.remaining > 0) {
      if (buffer.remaining < chunkSize) {
        throw new Exception("Not enough space in data buffer for receiving chunk")
      }
      val newBuffer = buffer.slice().limit(chunkSize).asInstanceOf[ByteBuffer]
      buffer.position(buffer.position + newBuffer.remaining)
      val newChunk = new MessageChunk(new MessageChunkHeader(
        typ, id, size, newBuffer.remaining, ackId, hasError, security, senderAddress), newBuffer)
      return Some(newChunk)
    }
    None
  }

  def flip() {
    buffers.foreach(_.flip)
  }

  def hasAckId() = (ackId != 0)

  def isCompletelyReceived() = !buffers(0).hasRemaining

  override def toString = {
    if (hasAckId) {
      "BufferAckMessage(aid = " + ackId + ", id = " + id + ", size = " + size + ")"
    } else {
      "BufferMessage(id = " + id + ", size = " + size + ")"
    }
  }
}
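getChunkForSending hands out at most maxChunkSize bytes per call: it slices the head buffer and advances that buffer's position by the size of the slice it emits. The same slicing loop can be exercised on plain ByteBuffers with no Spark or nio-module types involved; chunkBuffer below is a hypothetical helper written for illustration, not part of BufferMessage.

import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

object ChunkingSketch {

  // Split a buffer into slices of at most maxChunkSize bytes, advancing the source
  // buffer's position the same way BufferMessage.getChunkForSending does.
  def chunkBuffer(buffer: ByteBuffer, maxChunkSize: Int): Seq[ByteBuffer] = {
    val chunks = ArrayBuffer.empty[ByteBuffer]
    while (buffer.remaining() > 0) {
      val slice =
        if (buffer.remaining() <= maxChunkSize) buffer.duplicate()
        else buffer.slice().limit(maxChunkSize).asInstanceOf[ByteBuffer]
      buffer.position(buffer.position() + slice.remaining())
      chunks += slice
    }
    chunks.toSeq
  }

  def main(args: Array[String]): Unit = {
    val data = ByteBuffer.wrap(Array.fill[Byte](10)(1))
    val sizes = chunkBuffer(data, maxChunkSize = 4).map(_.remaining())
    println(sizes) // chunk sizes: 4, 4, 2
  }
}

Running main splits a 10-byte buffer at 4 bytes into chunks of 4, 4 and 2 bytes, mirroring how a large message is drained one chunk per call.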
Example 6
Source File: BlockRDD.scala, from SparkCore (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}

import scala.Some

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](
    @transient sc: SparkContext,
    @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val locations_ = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
  @volatile private var _isValid = true

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.size).map(i => {
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }).toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val blockManager = SparkEnv.get.blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get(blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception("Could not compute split, block " + blockId + " not found")
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    locations_(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!_isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    locations_
  }
}
Example 7
Source File: BlockRDD.scala, from multi-tenancy-spark (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}
import org.apache.spark.util.Utils

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](
    @transient sc: SparkContext,
    @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val _locations =
    BlockManager.blockIdsToHosts(blockIds, SparkEnv.get(sc._sparkUser))
  @volatile private var _isValid = true

  def isValid: Boolean = _isValid

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.length).map { i =>
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val user = Utils.getCurrentUserName()
    val blockManager = SparkEnv.get(user).blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get[T](blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception("Could not compute split, block " + blockId + " not found")
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    _locations(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    _locations
  }
}
Example 8
Source File: BufferMessage.scala, from iolap (Apache License 2.0)
package org.apache.spark.network.nio

import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.storage.BlockManager

private[nio] class BufferMessage(id_ : Int, val buffers: ArrayBuffer[ByteBuffer], var ackId: Int)
  extends Message(Message.BUFFER_MESSAGE, id_) {

  val initialSize = currentSize()
  var gotChunkForSendingOnce = false

  def size: Int = initialSize

  def currentSize(): Int = {
    if (buffers == null || buffers.isEmpty) {
      0
    } else {
      buffers.map(_.remaining).reduceLeft(_ + _)
    }
  }

  def getChunkForSending(maxChunkSize: Int): Option[MessageChunk] = {
    if (maxChunkSize <= 0) {
      throw new Exception("Max chunk size is " + maxChunkSize)
    }

    val security = if (isSecurityNeg) 1 else 0
    if (size == 0 && !gotChunkForSendingOnce) {
      val newChunk = new MessageChunk(
        new MessageChunkHeader(typ, id, 0, 0, ackId, hasError, security, senderAddress), null)
      gotChunkForSendingOnce = true
      return Some(newChunk)
    }

    while (!buffers.isEmpty) {
      val buffer = buffers(0)
      if (buffer.remaining == 0) {
        BlockManager.dispose(buffer)
        buffers -= buffer
      } else {
        val newBuffer = if (buffer.remaining <= maxChunkSize) {
          buffer.duplicate()
        } else {
          buffer.slice().limit(maxChunkSize).asInstanceOf[ByteBuffer]
        }
        buffer.position(buffer.position + newBuffer.remaining)
        val newChunk = new MessageChunk(new MessageChunkHeader(
          typ, id, size, newBuffer.remaining, ackId, hasError, security, senderAddress), newBuffer)
        gotChunkForSendingOnce = true
        return Some(newChunk)
      }
    }
    None
  }

  def getChunkForReceiving(chunkSize: Int): Option[MessageChunk] = {
    // STRONG ASSUMPTION: BufferMessage created when receiving data has ONLY ONE data buffer
    if (buffers.size > 1) {
      throw new Exception("Attempting to get chunk from message with multiple data buffers")
    }
    val buffer = buffers(0)
    val security = if (isSecurityNeg) 1 else 0
    if (buffer.remaining > 0) {
      if (buffer.remaining < chunkSize) {
        throw new Exception("Not enough space in data buffer for receiving chunk")
      }
      val newBuffer = buffer.slice().limit(chunkSize).asInstanceOf[ByteBuffer]
      buffer.position(buffer.position + newBuffer.remaining)
      val newChunk = new MessageChunk(new MessageChunkHeader(
        typ, id, size, newBuffer.remaining, ackId, hasError, security, senderAddress), newBuffer)
      return Some(newChunk)
    }
    None
  }

  def flip() {
    buffers.foreach(_.flip)
  }

  def hasAckId(): Boolean = ackId != 0

  def isCompletelyReceived: Boolean = !buffers(0).hasRemaining

  override def toString: String = {
    if (hasAckId) {
      "BufferAckMessage(aid = " + ackId + ", id = " + id + ", size = " + size + ")"
    } else {
      "BufferMessage(id = " + id + ", size = " + size + ")"
    }
  }
}
Example 9
Source File: BlockRDD.scala, from iolap (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}

import scala.Some

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](
    @transient sc: SparkContext,
    @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
  @volatile private var _isValid = true

  def isValid: Boolean = _isValid

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.length).map(i => {
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }).toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val blockManager = SparkEnv.get.blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get(blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception("Could not compute split, block " + blockId + " not found")
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    _locations(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    _locations
  }
}
Example 10
Source File: BufferMessage.scala, from spark1.52 (Apache License 2.0)
package org.apache.spark.network.nio

import java.nio.ByteBuffer

import scala.collection.mutable.ArrayBuffer

import org.apache.spark.storage.BlockManager

private[nio] class BufferMessage(id_ : Int, val buffers: ArrayBuffer[ByteBuffer], var ackId: Int)
  extends Message(Message.BUFFER_MESSAGE, id_) {

  val initialSize = currentSize()
  var gotChunkForSendingOnce = false

  def size: Int = initialSize

  def currentSize(): Int = {
    if (buffers == null || buffers.isEmpty) {
      0
    } else {
      buffers.map(_.remaining).reduceLeft(_ + _)
    }
  }

  def getChunkForSending(maxChunkSize: Int): Option[MessageChunk] = {
    if (maxChunkSize <= 0) {
      throw new Exception("Max chunk size is " + maxChunkSize)
    }

    val security = if (isSecurityNeg) 1 else 0
    if (size == 0 && !gotChunkForSendingOnce) {
      val newChunk = new MessageChunk(
        new MessageChunkHeader(typ, id, 0, 0, ackId, hasError, security, senderAddress), null)
      gotChunkForSendingOnce = true
      return Some(newChunk)
    }

    while (!buffers.isEmpty) {
      val buffer = buffers(0)
      if (buffer.remaining == 0) {
        BlockManager.dispose(buffer)
        buffers -= buffer
      } else {
        val newBuffer = if (buffer.remaining <= maxChunkSize) {
          buffer.duplicate()
        } else {
          buffer.slice().limit(maxChunkSize).asInstanceOf[ByteBuffer]
        }
        buffer.position(buffer.position + newBuffer.remaining)
        val newChunk = new MessageChunk(new MessageChunkHeader(
          typ, id, size, newBuffer.remaining, ackId, hasError, security, senderAddress), newBuffer)
        gotChunkForSendingOnce = true
        return Some(newChunk)
      }
    }
    None
  }

  def getChunkForReceiving(chunkSize: Int): Option[MessageChunk] = {
    // STRONG ASSUMPTION: BufferMessage created when receiving data has ONLY ONE data buffer
    if (buffers.size > 1) {
      throw new Exception("Attempting to get chunk from message with multiple data buffers")
    }
    val buffer = buffers(0)
    val security = if (isSecurityNeg) 1 else 0
    if (buffer.remaining > 0) {
      if (buffer.remaining < chunkSize) {
        throw new Exception("Not enough space in data buffer for receiving chunk")
      }
      val newBuffer = buffer.slice().limit(chunkSize).asInstanceOf[ByteBuffer]
      buffer.position(buffer.position + newBuffer.remaining)
      val newChunk = new MessageChunk(new MessageChunkHeader(
        typ, id, size, newBuffer.remaining, ackId, hasError, security, senderAddress), newBuffer)
      return Some(newChunk)
    }
    None
  }

  def flip() {
    buffers.foreach(_.flip)
  }

  def hasAckId(): Boolean = ackId != 0

  def isCompletelyReceived: Boolean = !buffers(0).hasRemaining

  override def toString: String = {
    if (hasAckId) {
      "BufferAckMessage(aid = " + ackId + ", id = " + id + ", size = " + size + ")"
    } else {
      "BufferMessage(id = " + id + ", size = " + size + ")"
    }
  }
}
Example 11
Source File: BlockRDD.scala, from spark1.52 (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}

import scala.Some

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](
    @transient sc: SparkContext,
    @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
  @volatile private var _isValid = true

  def isValid: Boolean = _isValid

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.length).map(i => {
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }).toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val blockManager = SparkEnv.get.blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get(blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception("Could not compute split, block " + blockId + " not found")
    }
  }

  // Returns, for each partition, the set of hosts that typically hold that partition's input data.
  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    _locations(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    _locations
  }
}
Example 12
Source File: BlockRDD.scala, from Spark-2.3.1 (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
  @volatile private var _isValid = true

  def isValid: Boolean = _isValid

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.length).map { i =>
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }.toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val blockManager = SparkEnv.get.blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get[T](blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception(s"Could not compute split, block $blockId of RDD $id not found")
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    _locations(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    _locations
  }
}
Example 13
Source File: BlockRDD.scala, from BigDatalog (Apache License 2.0)
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.storage.{BlockId, BlockManager}

import scala.Some

private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition {
  val index = idx
}

private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId])
  extends RDD[T](sc, Nil) {

  @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get)
  @volatile private var _isValid = true

  def isValid: Boolean = _isValid

  override def getPartitions: Array[Partition] = {
    assertValid()
    (0 until blockIds.length).map(i => {
      new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition]
    }).toArray
  }

  override def compute(split: Partition, context: TaskContext): Iterator[T] = {
    assertValid()
    val blockManager = SparkEnv.get.blockManager
    val blockId = split.asInstanceOf[BlockRDDPartition].blockId
    blockManager.get(blockId) match {
      case Some(block) => block.data.asInstanceOf[Iterator[T]]
      case None =>
        throw new Exception("Could not compute split, block " + blockId + " not found")
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    assertValid()
    _locations(split.asInstanceOf[BlockRDDPartition].blockId)
  }

  private[spark] def assertValid() {
    if (!isValid) {
      throw new SparkException(
        "Attempted to use %s after its blocks have been removed!".format(toString))
    }
  }

  protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = {
    _locations
  }
}