org.apache.spark.storage.BlockId Scala Examples
The following examples show how to use org.apache.spark.storage.BlockId.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: BlockRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} import scala.Some private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](@transient sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.length).map(i => { new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }).toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val blockManager = SparkEnv.get.blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get(blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception("Could not compute split, block " + blockId + " not found") } } //返回每个 partiton 都对应一组 hosts,这组 hosts 上往往存放着该 partition 的输入数据 override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() _locations(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { _locations } }
Example 2
Source File: TaskResult.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockId import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get(user).serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 3
Source File: BlockRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} import org.apache.spark.util.Utils private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](@transient sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get(sc._sparkUser)) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.length).map { i => new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val user = Utils.getCurrentUserName() val blockManager = SparkEnv.get(user).blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get[T](blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception("Could not compute split, block " + blockId + " not found") } } override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() _locations(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { _locations } }
Example 4
Source File: TestMemoryManager.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import org.apache.spark.SparkConf import org.apache.spark.storage.BlockId class TestMemoryManager(conf: SparkConf) extends MemoryManager(conf, numCores = 1, Long.MaxValue, Long.MaxValue) { override private[memory] def acquireExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Long = { if (oomOnce) { oomOnce = false 0 } else if (available >= numBytes) { available -= numBytes numBytes } else { val grant = available available = 0 grant } } override def acquireStorageMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def acquireUnrollMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def releaseStorageMemory(numBytes: Long, memoryMode: MemoryMode): Unit = {} override private[memory] def releaseExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Unit = { available += numBytes } override def maxOnHeapStorageMemory: Long = Long.MaxValue override def maxOffHeapStorageMemory: Long = 0L private var oomOnce = false private var available = Long.MaxValue def markExecutionAsOutOfMemoryOnce(): Unit = { oomOnce = true } def limit(avail: Long): Unit = { available = avail } }
Example 5
Source File: BlockTransferService.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Promise, Await, Future} import scala.concurrent.duration.Duration import org.apache.spark.Logging import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener} import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel} private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel): Unit = { Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf) } }
Example 6
Source File: NettyBlockRpcServer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 7
Source File: BlockStoreShuffleFetcher.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import scala.util.{Failure, Success, Try} import org.apache.spark._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId} import org.apache.spark.util.CompletionIterator private[hash] object BlockStoreShuffleFetcher extends Logging { def fetch[T]( shuffleId: Int, reduceId: Int, context: TaskContext, serializer: Serializer) : Iterator[T] = { logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId)) val blockManager = SparkEnv.get.blockManager val startTime = System.currentTimeMillis val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId) logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format( shuffleId, reduceId, System.currentTimeMillis - startTime)) val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]] for (((address, size), index) <- statuses.zipWithIndex) { splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size)) } val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map { case (address, splits) => (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2))) } def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = { val blockId = blockPair._1 val blockOption = blockPair._2 blockOption match { case Success(block) => { block.asInstanceOf[Iterator[T]] } case Failure(e) => { blockId match { case ShuffleBlockId(shufId, mapId, _) => val address = statuses(mapId.toInt)._1 throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e) case _ => throw new SparkException( "Failed to get block " + blockId + ", which is not a shuffle block", e) } } } } val blockFetcherItr = new ShuffleBlockFetcherIterator( context, SparkEnv.get.blockManager.shuffleClient, blockManager, blocksByAddress, serializer, // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024) val itr = blockFetcherItr.flatMap(unpackBlock) val completionIter = CompletionIterator[T, Iterator[T]](itr, { context.taskMetrics.updateShuffleReadMetrics() }) new InterruptibleIterator[T](context, completionIter) { val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency() override def next(): T = { readMetrics.incRecordsRead(1) delegate.next() } } } }
Example 8
Source File: TaskResult.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.Map import org.apache.spark.SparkEnv import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockId import org.apache.spark.util.Utils // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value. val resultSer = SparkEnv.get.serializer.newInstance() valueObject = resultSer.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 9
Source File: BlockRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} import scala.Some private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](@transient sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.length).map(i => { new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }).toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val blockManager = SparkEnv.get.blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get(blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception("Could not compute split, block " + blockId + " not found") } } override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() _locations(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { _locations } }
Example 10
Source File: KinesisInputDStream.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo import org.apache.spark.streaming.{Duration, StreamingContext, Time} private[kinesis] class KinesisInputDStream( @transient _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[Array[Byte]](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[Array[Byte]] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[Array[Byte]] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, awsCredentialsOption) } }
Example 11
Source File: NettyBlockRpcServer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { //消息解码 val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { //提供下载Block文件的功能, case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = //数据blockIds,存放BlockId,获得块数据 openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) //提供上传Block文件的RPC服务 case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. //使用我们的JavaSerializer将StorageLevel序列化为字节 //存储级别 val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) //存储局部块,使用给定的存储级别 blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 12
Source File: TaskResult.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.Map import scala.collection.mutable import org.apache.spark.SparkEnv import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockId import org.apache.spark.util.Utils // Task result. Also contains updates to accumulator variables. //任务结果,还包含累加器变量的更新, private[spark] sealed trait TaskResult[T] def value(): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value. //这不应该在持有锁时运行,因为它可能花费数十秒钟值 val resultSer = SparkEnv.get.serializer.newInstance() valueObject = resultSer.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 13
Source File: NettyBlockRpcServer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 14
Source File: BlockTransferService.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient, TempFileManager} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, classTag: ClassTag[_]): Unit = { val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag) ThreadUtils.awaitResult(future, Duration.Inf) } }
Example 15
Source File: NettyBlockRpcServer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.NioManagedBuffer import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocksNum = openBlocks.blockIds.length val blocks = for (i <- (0 until blocksNum).view) yield blockManager.getBlockData(BlockId.apply(openBlocks.blockIds(i))) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with $blocksNum buffers") responseContext.onSuccess(new StreamHandle(streamId, blocksNum).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 16
Source File: TaskResult.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockId import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 17
Source File: BlockRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.length).map { i => new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val blockManager = SparkEnv.get.blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get[T](blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception(s"Could not compute split, block $blockId of RDD $id not found") } } override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() _locations(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { _locations } }
Example 18
Source File: TestMemoryManager.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import org.apache.spark.SparkConf import org.apache.spark.storage.BlockId class TestMemoryManager(conf: SparkConf) extends MemoryManager(conf, numCores = 1, Long.MaxValue, Long.MaxValue) { override private[memory] def acquireExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Long = { if (consequentOOM > 0) { consequentOOM -= 1 0 } else if (available >= numBytes) { available -= numBytes numBytes } else { val grant = available available = 0 grant } } override def acquireStorageMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def acquireUnrollMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def releaseStorageMemory(numBytes: Long, memoryMode: MemoryMode): Unit = {} override private[memory] def releaseExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Unit = { available += numBytes } override def maxOnHeapStorageMemory: Long = Long.MaxValue override def maxOffHeapStorageMemory: Long = 0L private var consequentOOM = 0 private var available = Long.MaxValue def markExecutionAsOutOfMemoryOnce(): Unit = { markconsequentOOM(1) } def markconsequentOOM(n : Int) : Unit = { consequentOOM += n } def limit(avail: Long): Unit = { available = avail } }
Example 19
Source File: KinesisInputDStream.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo import org.apache.spark.streaming.{Duration, StreamingContext, Time} private[kinesis] class KinesisInputDStream[T: ClassTag]( @transient _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 20
Source File: BlockTransferService.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Promise, Await, Future} import scala.concurrent.duration.Duration import org.apache.spark.Logging import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener} import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel} private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel): Unit = { Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf) } }
Example 21
Source File: NettyBlockRpcServer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 22
Source File: TaskResult.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.Map import scala.collection.mutable import org.apache.spark.SparkEnv import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockId import org.apache.spark.util.Utils // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value. val resultSer = SparkEnv.get.serializer.newInstance() valueObject = resultSer.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 23
Source File: BlockRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} import scala.Some private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.length).map(i => { new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }).toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val blockManager = SparkEnv.get.blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get(blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception("Could not compute split, block " + blockId + " not found") } } override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() _locations(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { _locations } }
Example 24
Source File: TestMemoryManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import scala.collection.mutable import org.apache.spark.SparkConf import org.apache.spark.storage.{BlockStatus, BlockId} class TestMemoryManager(conf: SparkConf) extends MemoryManager(conf, numCores = 1, Long.MaxValue, Long.MaxValue) { override private[memory] def acquireExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Long = { if (oomOnce) { oomOnce = false 0 } else if (available >= numBytes) { available -= numBytes numBytes } else { val grant = available available = 0 grant } } override def acquireStorageMemory( blockId: BlockId, numBytes: Long, evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = true override def acquireUnrollMemory( blockId: BlockId, numBytes: Long, evictedBlocks: mutable.Buffer[(BlockId, BlockStatus)]): Boolean = true override def releaseStorageMemory(numBytes: Long): Unit = {} override private[memory] def releaseExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Unit = { available += numBytes } override def maxStorageMemory: Long = Long.MaxValue private var oomOnce = false private var available = Long.MaxValue def markExecutionAsOutOfMemoryOnce(): Unit = { oomOnce = true } def limit(avail: Long): Unit = { available = avail } }
Example 25
Source File: NettyBlockRpcServer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 26
Source File: StorageMemoryPool.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import javax.annotation.concurrent.GuardedBy import org.apache.spark.internal.Logging import org.apache.spark.storage.BlockId import org.apache.spark.storage.memory.MemoryStore def freeSpaceToShrinkPool(spaceToFree: Long): Long = lock.synchronized { val spaceFreedByReleasingUnusedMemory = math.min(spaceToFree, memoryFree) val remainingSpaceToFree = spaceToFree - spaceFreedByReleasingUnusedMemory if (remainingSpaceToFree > 0) { // If reclaiming free memory did not adequately shrink the pool, begin evicting blocks: val spaceFreedByEviction = memoryStore.evictBlocksToFreeSpace(None, remainingSpaceToFree, memoryMode) // When a block is released, BlockManager.dropFromMemory() calls releaseMemory(), so we do // not need to decrement _memoryUsed here. However, we do need to decrement the pool size. spaceFreedByReleasingUnusedMemory + spaceFreedByEviction } else { spaceFreedByReleasingUnusedMemory } } }
Example 27
Source File: BlockTransferService.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient} import org.apache.spark.scheduler.MapStatus import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, classTag: ClassTag[_]): Unit = { val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag) ThreadUtils.awaitResult(future, Duration.Inf) } }
Example 28
Source File: NettyBlockRpcServer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, MapOutputReady, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.scheduler.MapStatus import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) case mapOutputReady: MapOutputReady => val mapStatus: MapStatus = serializer.newInstance().deserialize(ByteBuffer.wrap(mapOutputReady.serializedMapStatus)) blockManager.mapOutputReady( mapOutputReady.shuffleId, mapOutputReady.mapId, mapOutputReady.numReduces, mapStatus) } } override def getStreamManager(): StreamManager = streamManager }
Example 29
Source File: TaskResult.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockId import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 30
Source File: BlockRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.length).map { i => new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val blockManager = SparkEnv.get.blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get[T](blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception("Could not compute split, block " + blockId + " not found") } } override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() _locations(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { _locations } }
Example 31
Source File: TestMemoryManager.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import org.apache.spark.SparkConf import org.apache.spark.storage.BlockId class TestMemoryManager(conf: SparkConf) extends MemoryManager(conf, numCores = 1, Long.MaxValue, Long.MaxValue) { override private[memory] def acquireExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Long = { if (oomOnce) { oomOnce = false 0 } else if (available >= numBytes) { available -= numBytes numBytes } else { val grant = available available = 0 grant } } override def acquireStorageMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def acquireUnrollMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def releaseStorageMemory(numBytes: Long, memoryMode: MemoryMode): Unit = {} override private[memory] def releaseExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Unit = { available += numBytes } override def maxOnHeapStorageMemory: Long = Long.MaxValue override def maxOffHeapStorageMemory: Long = 0L private var oomOnce = false private var available = Long.MaxValue def markExecutionAsOutOfMemoryOnce(): Unit = { oomOnce = true } def limit(avail: Long): Unit = { available = avail } }
Example 32
Source File: SplashShuffleReader.scala From splash with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle import org.apache.spark.internal.Logging import org.apache.spark.storage.BlockId import org.apache.spark.{InterruptibleIterator, MapOutputTracker, SparkEnv, TaskContext} private[spark] class SplashShuffleReader[K, V]( resolver: SplashShuffleBlockResolver, handle: BaseShuffleHandle[K, _, V], startPartition: Int, endPartition: Int, context: TaskContext, mapOutputTracker: MapOutputTracker = SparkEnv.get.mapOutputTracker) extends ShuffleReader[K, V] with Logging { private val dep = handle.dependency private type Pair = (Any, Any) private type KCPair = (K, V) private type KCIterator = Iterator[KCPair] override def read(): KCIterator = { val shuffleBlocks = mapOutputTracker.getMapSizesByExecutorId( handle.shuffleId, startPartition, endPartition) .flatMap(_._2) readShuffleBlocks(shuffleBlocks) } def readShuffleBlocks(shuffleBlocks: Seq[(BlockId, Long)]): KCIterator = readShuffleBlocks(shuffleBlocks.iterator) def readShuffleBlocks(shuffleBlocks: Iterator[(BlockId, Long)]): KCIterator = { val taskMetrics = context.taskMetrics() val serializer = SplashSerializer(dep) val nonEmptyBlocks = shuffleBlocks.filter(_._2 > 0).map(_._1) val fetcherIterator = SplashShuffleFetcherIterator(resolver, nonEmptyBlocks) def getAggregatedIterator(iterator: Iterator[Pair]): KCIterator = { dep.aggregator match { case Some(agg) => val aggregator = new SplashAggregator(agg) if (dep.mapSideCombine) { // We are reading values that are already combined val combinedKeyValuesIterator = iterator.asInstanceOf[Iterator[(K, V)]] aggregator.combineCombinersByKey(combinedKeyValuesIterator, context) } else { // We don't know the value type, but also don't care -- the dependency *should* // have made sure its compatible w/ this aggregator, which will convert the value // type to the combined type C val keyValuesIterator = iterator.asInstanceOf[Iterator[(K, Nothing)]] aggregator.combineValuesByKey(keyValuesIterator, context) } case None => require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!") iterator.asInstanceOf[KCIterator] } } def getSortedIterator(iterator: KCIterator): KCIterator = { // Sort the output if there is a sort ordering defined. dep.keyOrdering match { case Some(keyOrd: Ordering[K]) => // Create an ExternalSorter to sort the data. val sorter = new SplashSorter[K, V, V]( context, ordering = Some(keyOrd), serializer = serializer) sorter.insertAll(iterator) sorter.updateTaskMetrics() sorter.completionIterator case None => iterator } } val metricIter = fetcherIterator.flatMap( _.asMetricIterator(serializer, taskMetrics)) // An interruptible iterator must be used here in order to support task cancellation getSortedIterator( getAggregatedIterator( new InterruptibleIterator[Pair](context, metricIter))) } }
Example 33
Source File: MyNettyBlockRpcServer.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.language.existentials import org.apache.spark.SparkEnv import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.client.{RpcResponseCallback, StreamCallbackWithID, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.remote.{HadoopFileSegmentManagedBuffer, MessageForHadoopManagedBuffers, RemoteShuffleManager} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.storage.{BlockId, ShuffleBlockId} class MyNettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocksNum = openBlocks.blockIds.length val isShuffleRequest = (blocksNum > 0) && BlockId.apply(openBlocks.blockIds(0)).isInstanceOf[ShuffleBlockId] && (SparkEnv.get.conf.get("spark.shuffle.manager", classOf[SortShuffleManager].getName) == classOf[RemoteShuffleManager].getName) if (isShuffleRequest) { val blockIdAndManagedBufferPair = openBlocks.blockIds.map(block => (block, blockManager.getHostLocalShuffleData( BlockId.apply(block), Array.empty).asInstanceOf[HadoopFileSegmentManagedBuffer])) responseContext.onSuccess(new MessageForHadoopManagedBuffers( blockIdAndManagedBufferPair).toByteBuffer.nioBuffer()) } else { // This customized Netty RPC server is only served for RemoteShuffle requests, // Other RPC messages or data chunks transferring should go through // NettyBlockTransferService' NettyBlockRpcServer throw new UnsupportedOperationException("MyNettyBlockRpcServer only serves remote" + " shuffle requests for OpenBlocks") } case uploadBlock: UploadBlock => throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't serve UploadBlock") } } override def receiveStream( client: TransportClient, messageHeader: ByteBuffer, responseContext: RpcResponseCallback): StreamCallbackWithID = { throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't support receiving" + " stream") } override def getStreamManager(): StreamManager = streamManager }
Example 34
Source File: RemoteShuffleUtils.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.remote import java.util.UUID import org.apache.hadoop.fs.Path import org.apache.spark.SparkEnv import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.serializer.{SerializerInstance, SerializerManager} import org.apache.spark.shuffle.ShuffleWriteMetricsReporter import org.apache.spark.storage.{BlockId, TempLocalBlockId, TempShuffleBlockId} object RemoteShuffleUtils { val env = SparkEnv.get def getRemoteWriter( blockId: BlockId, file: Path, serializerManager: SerializerManager, serializerInstance: SerializerInstance, bufferSize: Int, writeMetrics: ShuffleWriteMetricsReporter): RemoteBlockObjectWriter = { val syncWrites = false // env.blockManager.conf.getBoolean("spark.shuffle.sync", false) new RemoteBlockObjectWriter(file, serializerManager, serializerInstance, bufferSize, syncWrites, writeMetrics, blockId) } }
Example 35
Source File: KinesisInputDStream.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 36
Source File: BlockTransferService.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, classTag: ClassTag[_]): Unit = { val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag) ThreadUtils.awaitResult(future, Duration.Inf) } }
Example 37
Source File: KinesisInputDStream.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 38
Source File: TaskResult.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.ArrayBuffer import org.apache.spark.SparkEnv import org.apache.spark.serializer.SerializerInstance import org.apache.spark.storage.BlockId import org.apache.spark.util.{AccumulatorV2, Utils} // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] def value(resultSer: SerializerInstance = null): T = { if (valueObjectDeserialized) { valueObject } else { // This should not run when holding a lock because it may cost dozens of seconds for a large // value val ser = if (resultSer == null) SparkEnv.get.serializer.newInstance() else resultSer valueObject = ser.deserialize(valueBytes) valueObjectDeserialized = true valueObject } } }
Example 39
Source File: BlockRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val _locations = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.length).map { i => new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }.toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val blockManager = SparkEnv.get.blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get[T](blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception("Could not compute split, block " + blockId + " not found") } } override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() _locations(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { _locations } }
Example 40
Source File: TestMemoryManager.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.memory import org.apache.spark.SparkConf import org.apache.spark.storage.BlockId class TestMemoryManager(conf: SparkConf) extends MemoryManager(conf, numCores = 1, Long.MaxValue, Long.MaxValue) { override private[memory] def acquireExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Long = { if (oomOnce) { oomOnce = false 0 } else if (available >= numBytes) { available -= numBytes numBytes } else { val grant = available available = 0 grant } } override def acquireStorageMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def acquireUnrollMemory( blockId: BlockId, numBytes: Long, memoryMode: MemoryMode): Boolean = true override def releaseStorageMemory(numBytes: Long, memoryMode: MemoryMode): Unit = {} override private[memory] def releaseExecutionMemory( numBytes: Long, taskAttemptId: Long, memoryMode: MemoryMode): Unit = { available += numBytes } override def maxOnHeapStorageMemory: Long = Long.MaxValue override def maxOffHeapStorageMemory: Long = 0L private var oomOnce = false private var available = Long.MaxValue def markExecutionAsOutOfMemoryOnce(): Unit = { oomOnce = true } def limit(avail: Long): Unit = { available = avail } }
Example 41
Source File: BlockTransferService.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Promise, Await, Future} import scala.concurrent.duration.Duration import org.apache.spark.Logging import org.apache.spark.network.buffer.{NioManagedBuffer, ManagedBuffer} import org.apache.spark.network.shuffle.{ShuffleClient, BlockFetchingListener} import org.apache.spark.storage.{BlockManagerId, BlockId, StorageLevel} private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel): Unit = { Await.result(uploadBlock(hostname, port, execId, blockId, blockData, level), Duration.Inf) } }
Example 42
Source File: NettyBlockRpcServer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 43
Source File: BlockStoreShuffleFetcher.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import scala.util.{Failure, Success, Try} import org.apache.spark._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId} import org.apache.spark.util.CompletionIterator private[hash] object BlockStoreShuffleFetcher extends Logging { def fetch[T]( shuffleId: Int, reduceId: Int, context: TaskContext, serializer: Serializer) : Iterator[T] = { logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId)) val blockManager = SparkEnv.get.blockManager val startTime = System.currentTimeMillis val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId) logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format( shuffleId, reduceId, System.currentTimeMillis - startTime)) val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]] for (((address, size), index) <- statuses.zipWithIndex) { splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size)) } val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map { case (address, splits) => (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2))) } def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = { val blockId = blockPair._1 val blockOption = blockPair._2 blockOption match { case Success(block) => { block.asInstanceOf[Iterator[T]] } case Failure(e) => { blockId match { case ShuffleBlockId(shufId, mapId, _) => val address = statuses(mapId.toInt)._1 throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e) case _ => throw new SparkException( "Failed to get block " + blockId + ", which is not a shuffle block", e) } } } } val blockFetcherItr = new ShuffleBlockFetcherIterator( context, SparkEnv.get.blockManager.shuffleClient, blockManager, blocksByAddress, serializer, SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024) val itr = blockFetcherItr.flatMap(unpackBlock) val completionIter = CompletionIterator[T, Iterator[T]](itr, { context.taskMetrics.updateShuffleReadMetrics() }) new InterruptibleIterator[T](context, completionIter) { val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency() override def next(): T = { readMetrics.incRecordsRead(1) delegate.next() } } } }
Example 44
Source File: TaskResult.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io._ import java.nio.ByteBuffer import scala.collection.mutable.Map import org.apache.spark.SparkEnv import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.BlockId import org.apache.spark.util.Utils // Task result. Also contains updates to accumulator variables. private[spark] sealed trait TaskResult[T] private[spark] class DirectTaskResult[T](var valueBytes: ByteBuffer, var accumUpdates: Map[Long, Any], var metrics: TaskMetrics) extends TaskResult[T] with Externalizable { def this() = this(null.asInstanceOf[ByteBuffer], null, null) override def writeExternal(out: ObjectOutput): Unit = Utils.tryOrIOException { out.writeInt(valueBytes.remaining); Utils.writeByteBuffer(valueBytes, out) out.writeInt(accumUpdates.size) for ((key, value) <- accumUpdates) { out.writeLong(key) out.writeObject(value) } out.writeObject(metrics) } override def readExternal(in: ObjectInput): Unit = Utils.tryOrIOException { val blen = in.readInt() val byteVal = new Array[Byte](blen) in.readFully(byteVal) valueBytes = ByteBuffer.wrap(byteVal) val numUpdates = in.readInt if (numUpdates == 0) { accumUpdates = null } else { accumUpdates = Map() for (i <- 0 until numUpdates) { accumUpdates(in.readLong()) = in.readObject() } } metrics = in.readObject().asInstanceOf[TaskMetrics] } def value(): T = { val resultSer = SparkEnv.get.serializer.newInstance() resultSer.deserialize(valueBytes) } }
Example 45
Source File: BlockRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.storage.{BlockId, BlockManager} import scala.Some private[spark] class BlockRDDPartition(val blockId: BlockId, idx: Int) extends Partition { val index = idx } private[spark] class BlockRDD[T: ClassTag](@transient sc: SparkContext, @transient val blockIds: Array[BlockId]) extends RDD[T](sc, Nil) { @transient lazy val locations_ = BlockManager.blockIdsToHosts(blockIds, SparkEnv.get) @volatile private var _isValid = true override def getPartitions: Array[Partition] = { assertValid() (0 until blockIds.size).map(i => { new BlockRDDPartition(blockIds(i), i).asInstanceOf[Partition] }).toArray } override def compute(split: Partition, context: TaskContext): Iterator[T] = { assertValid() val blockManager = SparkEnv.get.blockManager val blockId = split.asInstanceOf[BlockRDDPartition].blockId blockManager.get(blockId) match { case Some(block) => block.data.asInstanceOf[Iterator[T]] case None => throw new Exception("Could not compute split, block " + blockId + " not found") } } override def getPreferredLocations(split: Partition): Seq[String] = { assertValid() locations_(split.asInstanceOf[BlockRDDPartition].blockId) } private[spark] def assertValid() { if (!_isValid) { throw new SparkException( "Attempted to use %s after its blocks have been removed!".format(toString)) } } protected def getBlockIdLocations(): Map[BlockId, Seq[String]] = { locations_ } }
Example 46
Source File: NettyBlockTransferSecuritySuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio._ import java.util.concurrent.TimeUnit import scala.concurrent.duration._ import scala.concurrent.{Await, Promise} import scala.util.{Failure, Success, Try} import org.apache.commons.io.IOUtils import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.BlockFetchingListener import org.apache.spark.network.{BlockDataManager, BlockTransferService} import org.apache.spark.storage.{BlockId, ShuffleBlockId} import org.apache.spark.{SecurityManager, SparkConf} import org.mockito.Mockito._ import org.scalatest.mock.MockitoSugar import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, ShouldMatchers} class NettyBlockTransferSecuritySuite extends FunSuite with MockitoSugar with ShouldMatchers { test("security default off") { val conf = new SparkConf() .set("spark.app.id", "app-id") testConnection(conf, conf) match { case Success(_) => // expected case Failure(t) => fail(t) } } test("security on same password") { val conf = new SparkConf() .set("spark.authenticate", "true") .set("spark.authenticate.secret", "good") .set("spark.app.id", "app-id") testConnection(conf, conf) match { case Success(_) => // expected case Failure(t) => fail(t) } } test("security on mismatch password") { val conf0 = new SparkConf() .set("spark.authenticate", "true") .set("spark.authenticate.secret", "good") .set("spark.app.id", "app-id") val conf1 = conf0.clone.set("spark.authenticate.secret", "bad") testConnection(conf0, conf1) match { case Success(_) => fail("Should have failed") case Failure(t) => t.getMessage should include ("Mismatched response") } } test("security mismatch auth off on server") { val conf0 = new SparkConf() .set("spark.authenticate", "true") .set("spark.authenticate.secret", "good") .set("spark.app.id", "app-id") val conf1 = conf0.clone.set("spark.authenticate", "false") testConnection(conf0, conf1) match { case Success(_) => fail("Should have failed") case Failure(t) => // any funny error may occur, sever will interpret SASL token as RPC } } test("security mismatch auth off on client") { val conf0 = new SparkConf() .set("spark.authenticate", "false") .set("spark.authenticate.secret", "good") .set("spark.app.id", "app-id") val conf1 = conf0.clone.set("spark.authenticate", "true") testConnection(conf0, conf1) match { case Success(_) => fail("Should have failed") case Failure(t) => t.getMessage should include ("Expected SaslMessage") } } private def fetchBlock( self: BlockTransferService, from: BlockTransferService, execId: String, blockId: BlockId): Try[ManagedBuffer] = { val promise = Promise[ManagedBuffer]() self.fetchBlocks(from.hostName, from.port, execId, Array(blockId.toString), new BlockFetchingListener { override def onBlockFetchFailure(blockId: String, exception: Throwable): Unit = { promise.failure(exception) } override def onBlockFetchSuccess(blockId: String, data: ManagedBuffer): Unit = { promise.success(data.retain()) } }) Await.ready(promise.future, FiniteDuration(1000, TimeUnit.MILLISECONDS)) promise.future.value.get } }
Example 47
Source File: KinesisInputDStream.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.kinesis import scala.reflect.ClassTag import com.amazonaws.services.kinesis.clientlibrary.lib.worker.InitialPositionInStream import com.amazonaws.services.kinesis.model.Record import org.apache.spark.rdd.RDD import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.streaming.{Duration, StreamingContext, Time} import org.apache.spark.streaming.dstream.ReceiverInputDStream import org.apache.spark.streaming.receiver.Receiver import org.apache.spark.streaming.scheduler.ReceivedBlockInfo private[kinesis] class KinesisInputDStream[T: ClassTag]( _ssc: StreamingContext, streamName: String, endpointUrl: String, regionName: String, initialPositionInStream: InitialPositionInStream, checkpointAppName: String, checkpointInterval: Duration, storageLevel: StorageLevel, messageHandler: Record => T, awsCredentialsOption: Option[SerializableAWSCredentials] ) extends ReceiverInputDStream[T](_ssc) { private[streaming] override def createBlockRDD(time: Time, blockInfos: Seq[ReceivedBlockInfo]): RDD[T] = { // This returns true even for when blockInfos is empty val allBlocksHaveRanges = blockInfos.map { _.metadataOption }.forall(_.nonEmpty) if (allBlocksHaveRanges) { // Create a KinesisBackedBlockRDD, even when there are no blocks val blockIds = blockInfos.map { _.blockId.asInstanceOf[BlockId] }.toArray val seqNumRanges = blockInfos.map { _.metadataOption.get.asInstanceOf[SequenceNumberRanges] }.toArray val isBlockIdValid = blockInfos.map { _.isBlockIdValid() }.toArray logDebug(s"Creating KinesisBackedBlockRDD for $time with ${seqNumRanges.length} " + s"seq number ranges: ${seqNumRanges.mkString(", ")} ") new KinesisBackedBlockRDD( context.sc, regionName, endpointUrl, blockIds, seqNumRanges, isBlockIdValid = isBlockIdValid, retryTimeoutMs = ssc.graph.batchDuration.milliseconds.toInt, messageHandler = messageHandler, awsCredentialsOption = awsCredentialsOption) } else { logWarning("Kinesis sequence number information was not present with some block metadata," + " it may not be possible to recover from failures") super.createBlockRDD(time, blockInfos) } } override def getReceiver(): Receiver[T] = { new KinesisReceiver(streamName, endpointUrl, regionName, initialPositionInStream, checkpointAppName, checkpointInterval, storageLevel, messageHandler, awsCredentialsOption) } }
Example 48
Source File: BlockTransferService.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network import java.io.Closeable import java.nio.ByteBuffer import scala.concurrent.{Future, Promise} import scala.concurrent.duration.Duration import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.shuffle.{BlockFetchingListener, ShuffleClient} import org.apache.spark.storage.{BlockId, StorageLevel} import org.apache.spark.util.ThreadUtils private[spark] abstract class BlockTransferService extends ShuffleClient with Closeable with Logging { def uploadBlockSync( hostname: String, port: Int, execId: String, blockId: BlockId, blockData: ManagedBuffer, level: StorageLevel, classTag: ClassTag[_]): Unit = { val future = uploadBlock(hostname, port, execId, blockId, blockData, level, classTag) ThreadUtils.awaitResult(future, Duration.Inf) } }