org.apache.spark.serializer.Serializer Scala Examples
The following examples show how to use org.apache.spark.serializer.Serializer.
The originating project and source file are noted above each example.
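All of the examples below ultimately go through the same small API surface: a Serializer is a factory, newInstance() hands back a SerializerInstance, and that instance does the actual serialize/deserialize work on ByteBuffers. The following minimal sketch shows that round trip in isolation; the object name SerializerRoundTrip is illustrative, and JavaSerializer is used only as one concrete implementation (any Serializer behaves the same way).

import java.nio.ByteBuffer

import org.apache.spark.SparkConf
import org.apache.spark.serializer.{JavaSerializer, Serializer}

object SerializerRoundTrip {
  def main(args: Array[String]): Unit = {
    // Any Serializer implementation works here; JavaSerializer is Spark's default.
    val serializer: Serializer = new JavaSerializer(new SparkConf())

    // Serializer is only a factory; per-thread work goes through a SerializerInstance.
    val instance = serializer.newInstance()

    // Serialize an object to a ByteBuffer ...
    val buffer: ByteBuffer = instance.serialize("hello spark")

    // ... and deserialize it back.
    val restored: String = instance.deserialize[String](buffer)
    println(restored) // prints: hello spark
  }
}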
Example 1
Source File: CustomRecoveryModeFactory.scala From spark1.52 with Apache License 2.0 | 5 votes |
// This file is placed in a different package to make sure all of these components work well
// when they are outside of org.apache.spark.
package other.supplier

import java.nio.ByteBuffer

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.SparkConf
import org.apache.spark.deploy.master._
import org.apache.spark.serializer.Serializer

class CustomRecoveryModeFactory(
    conf: SparkConf,
    serializer: Serializer
) extends StandaloneRecoveryModeFactory(conf, serializer) {

  CustomRecoveryModeFactory.instantiationAttempts += 1

  // NOTE: this listing is abridged; the `data` map and the read counter belong to a
  // CustomPersistenceEngine class that is not shown here (see the sketch after this example).
  override def read[T: ClassTag](prefix: String): Seq[T] = {
    CustomPersistenceEngine.readAttempts += 1
    // Iterate over the map, keeping entries whose name matches the prefix.
    val results = for ((name, bytes) <- data; if name.startsWith(prefix))
      yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes))

    results.toSeq
  }
}

object CustomPersistenceEngine {
  @volatile var persistAttempts = 0
  @volatile var unpersistAttempts = 0
  @volatile var readAttempts = 0

  // The most recently created engine instance.
  @volatile var lastInstance: Option[CustomPersistenceEngine] = None
}

class CustomLeaderElectionAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent {
  masterInstance.electedLeader()
}
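The listing above is trimmed: the factory bumps CustomRecoveryModeFactory.instantiationAttempts and read() iterates over a data map, yet neither the companion object nor the CustomPersistenceEngine class that owns data is shown. The sketch below reconstructs roughly what those elided pieces could look like, inferred from the counters and fields the snippet references; treat the exact shape (the mutable.HashMap, field names, extending PersistenceEngine directly) as assumptions rather than the verbatim upstream test code.

import java.nio.ByteBuffer

import scala.collection.mutable
import scala.reflect.ClassTag

import org.apache.spark.deploy.master.PersistenceEngine
import org.apache.spark.serializer.Serializer

object CustomRecoveryModeFactory {
  // Incremented by the factory constructor in the listing above.
  @volatile var instantiationAttempts = 0
}

// Hypothetical reconstruction: owns the `data` map that read() iterates over; persist() is the
// write-side counterpart, using the same byte-copy pattern as ZooKeeperPersistenceEngine below.
class CustomPersistenceEngine(serializer: Serializer) extends PersistenceEngine {
  val data = mutable.HashMap[String, Array[Byte]]()

  CustomPersistenceEngine.lastInstance = Some(this)

  override def persist(name: String, obj: Object): Unit = {
    CustomPersistenceEngine.persistAttempts += 1
    val serialized = serializer.newInstance().serialize(obj)
    val bytes = new Array[Byte](serialized.remaining())
    serialized.get(bytes)
    data += name -> bytes
  }

  override def unpersist(name: String): Unit = {
    CustomPersistenceEngine.unpersistAttempts += 1
    data -= name
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    CustomPersistenceEngine.readAttempts += 1
    val results = for ((name, bytes) <- data; if name.startsWith(prefix))
      yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes))
    results.toSeq
  }
}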
Example 2
Source File: NettyBlockRpcServer.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConversions._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, messageBytes: Array[Byte], responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(blocks.iterator) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(new Array[Byte](0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 3
Source File: HashShuffleReader.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import org.apache.spark.{InterruptibleIterator, TaskContext} import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleReader} import org.apache.spark.util.collection.ExternalSorter private[spark] class HashShuffleReader[K, C]( handle: BaseShuffleHandle[K, _, C], startPartition: Int, endPartition: Int, context: TaskContext) extends ShuffleReader[K, C] { require(endPartition == startPartition + 1, "Hash shuffle currently only supports fetching one partition") private val dep = handle.dependency override def read(): Iterator[Product2[K, C]] = { val ser = Serializer.getSerializer(dep.serializer) val iter = BlockStoreShuffleFetcher.fetch(handle.shuffleId, startPartition, context, ser) val aggregatedIter: Iterator[Product2[K, C]] = if (dep.aggregator.isDefined) { if (dep.mapSideCombine) { new InterruptibleIterator(context, dep.aggregator.get.combineCombinersByKey(iter, context)) } else { new InterruptibleIterator(context, dep.aggregator.get.combineValuesByKey(iter, context)) } } else { require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!") // Convert the Product2s to pairs since this is what downstream RDDs currently expect iter.asInstanceOf[Iterator[Product2[K, C]]].map(pair => (pair._1, pair._2)) } // Sort the output if there is a sort ordering defined. dep.keyOrdering match { case Some(keyOrd: Ordering[K]) => // Create an ExternalSorter to sort the data. Note that if spark.shuffle.spill is disabled, // the ExternalSorter won't spill to disk. val sorter = new ExternalSorter[K, C, C](ordering = Some(keyOrd), serializer = Some(ser)) sorter.insertAll(aggregatedIter) context.taskMetrics.incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics.incDiskBytesSpilled(sorter.diskBytesSpilled) sorter.iterator case None => aggregatedIter } } }
Example 4
Source File: BlockStoreShuffleFetcher.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import scala.util.{Failure, Success, Try} import org.apache.spark._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId} import org.apache.spark.util.CompletionIterator private[hash] object BlockStoreShuffleFetcher extends Logging { def fetch[T]( shuffleId: Int, reduceId: Int, context: TaskContext, serializer: Serializer) : Iterator[T] = { logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId)) val blockManager = SparkEnv.get.blockManager val startTime = System.currentTimeMillis val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId) logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format( shuffleId, reduceId, System.currentTimeMillis - startTime)) val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]] for (((address, size), index) <- statuses.zipWithIndex) { splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size)) } val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map { case (address, splits) => (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2))) } def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = { val blockId = blockPair._1 val blockOption = blockPair._2 blockOption match { case Success(block) => { block.asInstanceOf[Iterator[T]] } case Failure(e) => { blockId match { case ShuffleBlockId(shufId, mapId, _) => val address = statuses(mapId.toInt)._1 throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e) case _ => throw new SparkException( "Failed to get block " + blockId + ", which is not a shuffle block", e) } } } } val blockFetcherItr = new ShuffleBlockFetcherIterator( context, SparkEnv.get.blockManager.shuffleClient, blockManager, blocksByAddress, serializer, // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024) val itr = blockFetcherItr.flatMap(unpackBlock) val completionIter = CompletionIterator[T, Iterator[T]](itr, { context.taskMetrics.updateShuffleReadMetrics() }) new InterruptibleIterator[T](context, completionIter) { val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency() override def next(): T = { readMetrics.incRecordsRead(1) delegate.next() } } } }
Example 5
Source File: HashShuffleWriter.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import org.apache.spark._ import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.scheduler.MapStatus import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle._ import org.apache.spark.storage.BlockObjectWriter private[spark] class HashShuffleWriter[K, V]( shuffleBlockResolver: FileShuffleBlockResolver, handle: BaseShuffleHandle[K, V, _], mapId: Int, context: TaskContext) extends ShuffleWriter[K, V] with Logging { private val dep = handle.dependency private val numOutputSplits = dep.partitioner.numPartitions private val metrics = context.taskMetrics // Are we in the process of stopping? Because map tasks can call stop() with success = true // and then call stop() with success = false if they get an exception, we want to make sure // we don't try deleting files, etc twice. private var stopping = false private val writeMetrics = new ShuffleWriteMetrics() metrics.shuffleWriteMetrics = Some(writeMetrics) private val blockManager = SparkEnv.get.blockManager private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null)) private val shuffle = shuffleBlockResolver.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser, writeMetrics) override def stop(initiallySuccess: Boolean): Option[MapStatus] = { var success = initiallySuccess try { if (stopping) { return None } stopping = true if (success) { try { Some(commitWritesAndBuildStatus()) } catch { case e: Exception => success = false revertWrites() throw e } } else { revertWrites() None } } finally { // Release the writers back to the shuffle block manager. if (shuffle != null && shuffle.writers != null) { try { shuffle.releaseWriters(success) } catch { case e: Exception => logError("Failed to release shuffle writers", e) } } } } private def commitWritesAndBuildStatus(): MapStatus = { // Commit the writes. Get the size of each bucket block (total block size). val sizes: Array[Long] = shuffle.writers.map { writer: BlockObjectWriter => writer.commitAndClose() writer.fileSegment().length } MapStatus(blockManager.shuffleServerId, sizes) } private def revertWrites(): Unit = { if (shuffle != null && shuffle.writers != null) { for (writer <- shuffle.writers) { writer.revertPartialWritesAndClose() } } } }
Example 6
Source File: Dependency.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.ShuffleHandle @DeveloperApi class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int) extends NarrowDependency[T](rdd) { override def getParents(partitionId: Int): List[Int] = { if (partitionId >= outStart && partitionId < outStart + length) { List(partitionId - outStart + inStart) } else { Nil } } }
Example 7
Source File: ShuffledRDD.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index: Int = idx override def hashCode(): Int = idx } def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = { this.mapSideCombine = mapSideCombine this } override def getDependencies: Seq[Dependency[_]] = { List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[(K, C)]] } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 8
Source File: InstantiateClass.scala From spark1.52 with Apache License 2.0 | 5 votes |
package sparkDemo

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.SparkEnv.{logDebug, logInfo}
import org.apache.spark.rpc.{RpcEndpoint, RpcEndpointRef}
import org.apache.spark.serializer.Serializer
import org.apache.spark.util.RpcUtils

object InstantiateClass extends App with Logging {
  val conf: SparkConf = new SparkConf()

  def instantiateClass[T](className: String): T = {
    logInfo(s"className: ${className}")
    val cls = classForName(className)
    logInfo(s"cls: ${cls}")
    // Look for a constructor taking a SparkConf and a boolean isDriver, then one taking just
    // SparkConf, then one taking no arguments.
    try {
      // classOf[T] is the Scala equivalent of Java's class literal T.class.
      logInfo(s"classOf[SparkConf]: ${classOf[SparkConf]}")
      // The demo constructs the instance once just for logging, then again for the return value.
      val test = cls.getConstructor(classOf[SparkConf], java.lang.Boolean.TYPE)
        .newInstance(conf, new java.lang.Boolean(true))
        // asInstanceOf casts the new instance to T.
        .asInstanceOf[T]
      logInfo(s"asInstanceOf[T]: ${test.toString}")
      cls.getConstructor(classOf[SparkConf], java.lang.Boolean.TYPE)
        .newInstance(conf, new java.lang.Boolean(true))
        // asInstanceOf casts the new instance to T.
        .asInstanceOf[T]
    } catch {
      case _: NoSuchMethodException =>
        try {
          logInfo(s"asInstanceOf[T]: ${cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T]}")
          cls.getConstructor(classOf[SparkConf]).newInstance(conf).asInstanceOf[T]
        } catch {
          case _: NoSuchMethodException =>
            logInfo(s"asInstanceOf[T]: ${cls.getConstructor().newInstance()}")
            logInfo(s"asInstanceOf[T]: ${cls.getConstructor().newInstance().asInstanceOf[T]}")
            cls.getConstructor().newInstance().asInstanceOf[T]
        }
    }
  }

  def classForName(className: String): Class[_] = {
    val classLoader = getContextOrSparkClassLoader
    logInfo(s"classLoader: ${classLoader}")
    Class.forName(className, true, getContextOrSparkClassLoader)
    // scalastyle:on classforname
  }

  def getContextOrSparkClassLoader: ClassLoader = {
    // getContextClassLoader returns the context class loader of the current thread.
    val contextClassLoader = Thread.currentThread().getContextClassLoader
    logInfo(s"contextClassLoader: ${contextClassLoader}")
    // Fall back to the loader that loaded Spark if the thread has no context class loader.
    Option(contextClassLoader).getOrElse(getSparkClassLoader)
  }

  def getSparkClassLoader: ClassLoader = {
    logInfo(s"getClass.getClassLoader: ${getClass.getClassLoader}")
    getClass.getClassLoader
  }

  def instantiateClassFromConf[T](propertyName: String, defaultClassName: String): T = {
    instantiateClass[T](conf.get(propertyName, defaultClassName))
  }

  val serializer = instantiateClassFromConf[Serializer](
    "spark.serializer", "org.apache.spark.serializer.JavaSerializer")
  logInfo(s"Using serializer: ${serializer.getClass}")
  println("=====" + serializer.getClass)
}
Example 9
Source File: ShuffledRowRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import org.apache.spark._ import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.DataType private class ShuffledRowRDDPartition(val idx: Int) extends Partition { override val index: Int = idx override def hashCode(): Int = idx } class ShuffledRowRDD( @transient var prev: RDD[Product2[Int, InternalRow]], serializer: Serializer, numPartitions: Int) extends RDD[InternalRow](prev.context, Nil) { private val part: Partitioner = new PartitionIdPassthrough(numPartitions) override def getDependencies: Seq[Dependency[_]] = { List(new ShuffleDependency[Int, InternalRow, InternalRow](prev, part, Some(serializer))) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRowRDDPartition(i)) } override def compute(split: Partition, context: TaskContext): Iterator[InternalRow] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[Int, InternalRow, InternalRow]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[Product2[Int, InternalRow]]] .map(_._2) } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 10
Source File: NettyBlockRpcServer.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConversions._

import org.apache.spark.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}

class NettyBlockRpcServer(
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      messageBytes: Array[Byte],
      responseContext: RpcResponseCallback): Unit = {
    // Decode the incoming message.
    val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes)
    logTrace(s"Received request: $message")

    message match {
      // Serves block-download requests.
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          // blockIds holds the BlockIds; fetch the data for each block.
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(blocks.iterator)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray)

      // Serves block-upload RPCs.
      case uploadBlock: UploadBlock =>
        // StorageLevel is serialized as bytes using our JavaSerializer.
        val level: StorageLevel =
          serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata))
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        // Store the block locally with the given storage level.
        blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level)
        responseContext.onSuccess(new Array[Byte](0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
}
Example 11
Source File: FileSystemPersistenceEngine.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { new File(dir + File.separator + name).delete() } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
Example 12
Source File: RecoveryModeFactory.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  // Directory in which Spark persists recovery state.
  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
}
Example 13
Source File: ZooKeeperPersistenceEngine.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.nio.ByteBuffer

import scala.collection.JavaConversions._
import scala.reflect.ClassTag

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode

import org.apache.spark.{Logging, SparkConf}
import org.apache.spark.deploy.SparkCuratorUtil
import org.apache.spark.serializer.Serializer

private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer)
  extends PersistenceEngine with Logging {

  // ZooKeeper directory that holds recovery state; defaults to /spark.
  private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
  private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(WORKING_DIR + "/" + name, obj)
  }

  override def unpersist(name: String): Unit = {
    zk.delete().forPath(WORKING_DIR + "/" + name)
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    val files = zk.getChildren.forPath(WORKING_DIR).filter(_.startsWith(prefix))
    files.map(deserializeFromFile[T]).flatten
  }

  override def close() {
    zk.close()
  }

  private def serializeIntoFile(path: String, value: AnyRef) {
    val serialized = serializer.newInstance().serialize(value)
    val bytes = new Array[Byte](serialized.remaining())
    serialized.get(bytes)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes)
  }

  private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
    val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
    try {
      Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData)))
    } catch {
      case e: Exception =>
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(WORKING_DIR + "/" + filename)
        None
    }
  }
}
Example 14
Source File: ShuffledRDD.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx

  override def hashCode(): Int = idx
}

// NOTE: this listing is abridged; the ShuffledRDD class declaration and most of its members
// are omitted, leaving only the compute and clearDependencies overrides.

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    // shuffleManager.getReader returns the concrete shuffle reader implementation: the shuffle
    // manager's getReader first creates a HashShuffleReader, and HashShuffleReader.read() then
    // fetches the intermediate results produced by the upstream (map-side) tasks.
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
Example 15
Source File: CustomRecoveryModeFactory.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// This file is placed in different package to make sure all of these components work well // when they are outside of org.apache.spark. package other.supplier import java.nio.ByteBuffer import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.deploy.master._ import org.apache.spark.serializer.Serializer class CustomRecoveryModeFactory( conf: SparkConf, serializer: Serializer ) extends StandaloneRecoveryModeFactory(conf, serializer) { CustomRecoveryModeFactory.instantiationAttempts += 1 override def read[T: ClassTag](prefix: String): Seq[T] = { CustomPersistenceEngine.readAttempts += 1 val results = for ((name, bytes) <- data; if name.startsWith(prefix)) yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) results.toSeq } } object CustomPersistenceEngine { @volatile var persistAttempts = 0 @volatile var unpersistAttempts = 0 @volatile var readAttempts = 0 @volatile var lastInstance: Option[CustomPersistenceEngine] = None } class CustomLeaderElectionAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent { masterInstance.electedLeader() }
Example 16
Source File: NettyBlockRpcServer.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.language.existentials import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.NioManagedBuffer import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocksNum = openBlocks.blockIds.length val blocks = for (i <- (0 until blocksNum).view) yield blockManager.getBlockData(BlockId.apply(openBlocks.blockIds(i))) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with $blocksNum buffers") responseContext.onSuccess(new StreamHandle(streamId, blocksNum).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer. val (level: StorageLevel, classTag: ClassTag[_]) = { serializer .newInstance() .deserialize(ByteBuffer.wrap(uploadBlock.metadata)) .asInstanceOf[(StorageLevel, ClassTag[_])] } val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) val blockId = BlockId(uploadBlock.blockId) blockManager.putBlockData(blockId, data, level, classTag) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 17
Source File: FileSystemPersistenceEngine.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { val f = new File(dir + File.separator + name) if (!f.delete()) { logWarning(s"Error deleting ${f.getPath()}") } } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
Example 18
Source File: RecoveryModeFactory.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
Example 19
Source File: ZooKeeperPersistenceEngine.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer) extends PersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status" private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf) SparkCuratorUtil.mkdir(zk, WORKING_DIR) override def persist(name: String, obj: Object): Unit = { serializeIntoFile(WORKING_DIR + "/" + name, obj) } override def unpersist(name: String): Unit = { zk.delete().forPath(WORKING_DIR + "/" + name) } override def read[T: ClassTag](prefix: String): Seq[T] = { zk.getChildren.forPath(WORKING_DIR).asScala .filter(_.startsWith(prefix)).flatMap(deserializeFromFile[T]) } override def close() { zk.close() } private def serializeIntoFile(path: String, value: AnyRef) { val serialized = serializer.newInstance().serialize(value) val bytes = new Array[Byte](serialized.remaining()) serialized.get(bytes) zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes) } private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = { val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename) try { Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData))) } catch { case e: Exception => logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(WORKING_DIR + "/" + filename) None } } }
Example 20
Source File: ShuffledRDD.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index: Int = idx } def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = { this.mapSideCombine = mapSideCombine this } override def getDependencies: Seq[Dependency[_]] = { val serializer = userSpecifiedSerializer.getOrElse { val serializerManager = SparkEnv.get.serializerManager if (mapSideCombine) { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]]) } else { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]]) } } List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } override protected def getPreferredLocations(partition: Partition): Seq[String] = { val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] tracker.getPreferredLocationsForShuffle(dep, partition.index) } override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[(K, C)]] } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 21
Source File: CustomRecoveryModeFactory.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// This file is placed in different package to make sure all of these components work well // when they are outside of org.apache.spark. package other.supplier import java.nio.ByteBuffer import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.deploy.master._ import org.apache.spark.serializer.Serializer class CustomRecoveryModeFactory( conf: SparkConf, serializer: Serializer ) extends StandaloneRecoveryModeFactory(conf, serializer) { CustomRecoveryModeFactory.instantiationAttempts += 1 override def read[T: ClassTag](prefix: String): Seq[T] = { CustomPersistenceEngine.readAttempts += 1 val results = for ((name, bytes) <- data; if name.startsWith(prefix)) yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) results.toSeq } } object CustomPersistenceEngine { @volatile var persistAttempts = 0 @volatile var unpersistAttempts = 0 @volatile var readAttempts = 0 @volatile var lastInstance: Option[CustomPersistenceEngine] = None } class CustomLeaderElectionAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent { masterInstance.electedLeader() }
Example 22
Source File: NettyBlockRpcServer.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty import java.nio.ByteBuffer import scala.collection.JavaConverters._ import org.apache.spark.Logging import org.apache.spark.network.BlockDataManager import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer} import org.apache.spark.network.client.{RpcResponseCallback, TransportClient} import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager} import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock} import org.apache.spark.serializer.Serializer import org.apache.spark.storage.{BlockId, StorageLevel} class NettyBlockRpcServer( appId: String, serializer: Serializer, blockManager: BlockDataManager) extends RpcHandler with Logging { private val streamManager = new OneForOneStreamManager() override def receive( client: TransportClient, rpcMessage: ByteBuffer, responseContext: RpcResponseCallback): Unit = { val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage) logTrace(s"Received request: $message") message match { case openBlocks: OpenBlocks => val blocks: Seq[ManagedBuffer] = openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData) val streamId = streamManager.registerStream(appId, blocks.iterator.asJava) logTrace(s"Registered streamId $streamId with ${blocks.size} buffers") responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer) case uploadBlock: UploadBlock => // StorageLevel is serialized as bytes using our JavaSerializer. val level: StorageLevel = serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata)) val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData)) blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level) responseContext.onSuccess(ByteBuffer.allocate(0)) } } override def getStreamManager(): StreamManager = streamManager }
Example 23
Source File: FileSystemPersistenceEngine.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { val f = new File(dir + File.separator + name) if (!f.delete()) { logWarning(s"Error deleting ${f.getPath()}") } } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
Example 24
Source File: RecoveryModeFactory.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.{Logging, SparkConf} import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
Example 25
Source File: ZooKeeperPersistenceEngine.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.spark.{Logging, SparkConf} import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.serializer.Serializer private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer) extends PersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status" private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf) SparkCuratorUtil.mkdir(zk, WORKING_DIR) override def persist(name: String, obj: Object): Unit = { serializeIntoFile(WORKING_DIR + "/" + name, obj) } override def unpersist(name: String): Unit = { zk.delete().forPath(WORKING_DIR + "/" + name) } override def read[T: ClassTag](prefix: String): Seq[T] = { zk.getChildren.forPath(WORKING_DIR).asScala .filter(_.startsWith(prefix)).map(deserializeFromFile[T]).flatten } override def close() { zk.close() } private def serializeIntoFile(path: String, value: AnyRef) { val serialized = serializer.newInstance().serialize(value) val bytes = new Array[Byte](serialized.remaining()) serialized.get(bytes) zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes) } private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = { val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename) try { Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData))) } catch { case e: Exception => { logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(WORKING_DIR + "/" + filename) None } } } }
Example 26
Source File: ShuffledRDD.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index: Int = idx override def hashCode(): Int = idx } def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = { this.mapSideCombine = mapSideCombine this } override def getDependencies: Seq[Dependency[_]] = { List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } override protected def getPreferredLocations(partition: Partition): Seq[String] = { val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] tracker.getPreferredLocationsForShuffle(dep, partition.index) } override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[(K, C)]] } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 27
Source File: CustomRecoveryModeFactory.scala From BigDatalog with Apache License 2.0 | 5 votes |
// This file is placed in different package to make sure all of these components work well // when they are outside of org.apache.spark. package other.supplier import java.nio.ByteBuffer import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.deploy.master._ import org.apache.spark.serializer.Serializer class CustomRecoveryModeFactory( conf: SparkConf, serializer: Serializer ) extends StandaloneRecoveryModeFactory(conf, serializer) { CustomRecoveryModeFactory.instantiationAttempts += 1 override def read[T: ClassTag](prefix: String): Seq[T] = { CustomPersistenceEngine.readAttempts += 1 val results = for ((name, bytes) <- data; if name.startsWith(prefix)) yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) results.toSeq } } object CustomPersistenceEngine { @volatile var persistAttempts = 0 @volatile var unpersistAttempts = 0 @volatile var readAttempts = 0 @volatile var lastInstance: Option[CustomPersistenceEngine] = None } class CustomLeaderElectionAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent { masterInstance.electedLeader() }
Example 28
Source File: CrailSparkShuffleSorter.scala From crail-spark-io with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle import org.apache.spark.TaskContext import org.apache.spark.common.Logging import org.apache.spark.serializer.{CrailDeserializationStream, Serializer} import org.apache.spark.util.CompletionIterator import org.apache.spark.util.collection.ExternalSorter class CrailSparkShuffleSorter extends CrailShuffleSorter with Logging { logInfo("crail shuffle spark sorter") override def sort[K, C](context: TaskContext, keyOrd: Ordering[K], ser : Serializer, inputSerializer: CrailDeserializationStream): Iterator[Product2[K, C]] = { val sorter = new ExternalSorter[K, C, C](context, ordering = Some(keyOrd), serializer = ser) val input = inputSerializer.asKeyValueIterator.asInstanceOf[Iterator[Product2[K, C]]] sorter.insertAll(input) context.taskMetrics().incMemoryBytesSpilled(sorter.memoryBytesSpilled) context.taskMetrics().incDiskBytesSpilled(sorter.diskBytesSpilled) context.taskMetrics().incPeakExecutionMemory(sorter.peakMemoryUsedBytes) CompletionIterator[Product2[K, C], Iterator[Product2[K, C]]](sorter.iterator, sorter.stop()) } }
Example 29
Source File: CustomRecoveryModeFactory.scala From sparkoscope with Apache License 2.0 | 5 votes |
// This file is placed in different package to make sure all of these components work well // when they are outside of org.apache.spark. package other.supplier import java.nio.ByteBuffer import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.deploy.master._ import org.apache.spark.serializer.Serializer class CustomRecoveryModeFactory( conf: SparkConf, serializer: Serializer ) extends StandaloneRecoveryModeFactory(conf, serializer) { CustomRecoveryModeFactory.instantiationAttempts += 1 override def read[T: ClassTag](prefix: String): Seq[T] = { CustomPersistenceEngine.readAttempts += 1 val results = for ((name, bytes) <- data; if name.startsWith(prefix)) yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) results.toSeq } } object CustomPersistenceEngine { @volatile var persistAttempts = 0 @volatile var unpersistAttempts = 0 @volatile var readAttempts = 0 @volatile var lastInstance: Option[CustomPersistenceEngine] = None } class CustomLeaderElectionAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent { masterInstance.electedLeader() }
Example 30
Source File: FileSystemPersistenceEngine.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.io._ import scala.reflect.ClassTag import org.apache.spark.internal.Logging import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer} import org.apache.spark.util.Utils private[master] class FileSystemPersistenceEngine( val dir: String, val serializer: Serializer) extends PersistenceEngine with Logging { new File(dir).mkdir() override def persist(name: String, obj: Object): Unit = { serializeIntoFile(new File(dir + File.separator + name), obj) } override def unpersist(name: String): Unit = { val f = new File(dir + File.separator + name) if (!f.delete()) { logWarning(s"Error deleting ${f.getPath()}") } } override def read[T: ClassTag](prefix: String): Seq[T] = { val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix)) files.map(deserializeFromFile[T]) } private def serializeIntoFile(file: File, value: AnyRef) { val created = file.createNewFile() if (!created) { throw new IllegalStateException("Could not create file: " + file) } val fileOut = new FileOutputStream(file) var out: SerializationStream = null Utils.tryWithSafeFinally { out = serializer.newInstance().serializeStream(fileOut) out.writeObject(value) } { fileOut.close() if (out != null) { out.close() } } } private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = { val fileIn = new FileInputStream(file) var in: DeserializationStream = null try { in = serializer.newInstance().deserializeStream(fileIn) in.readObject[T]() } finally { fileIn.close() if (in != null) { in.close() } } } }
Example 31
Source File: RecoveryModeFactory.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import org.apache.spark.SparkConf import org.apache.spark.annotation.DeveloperApi import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) with Logging { val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "") def createPersistenceEngine(): PersistenceEngine = { logInfo("Persisting recovery state to directory: " + RECOVERY_DIR) new FileSystemPersistenceEngine(RECOVERY_DIR, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new MonarchyLeaderAgent(master) } } private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer) extends StandaloneRecoveryModeFactory(conf, serializer) { def createPersistenceEngine(): PersistenceEngine = { new ZooKeeperPersistenceEngine(conf, serializer) } def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = { new ZooKeeperLeaderElectionAgent(master, conf) } }
Example 32
Source File: ZooKeeperPersistenceEngine.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master import java.nio.ByteBuffer import scala.collection.JavaConverters._ import scala.reflect.ClassTag import org.apache.curator.framework.CuratorFramework import org.apache.zookeeper.CreateMode import org.apache.spark.SparkConf import org.apache.spark.deploy.SparkCuratorUtil import org.apache.spark.internal.Logging import org.apache.spark.serializer.Serializer private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer) extends PersistenceEngine with Logging { private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status" private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf) SparkCuratorUtil.mkdir(zk, WORKING_DIR) override def persist(name: String, obj: Object): Unit = { serializeIntoFile(WORKING_DIR + "/" + name, obj) } override def unpersist(name: String): Unit = { zk.delete().forPath(WORKING_DIR + "/" + name) } override def read[T: ClassTag](prefix: String): Seq[T] = { zk.getChildren.forPath(WORKING_DIR).asScala .filter(_.startsWith(prefix)).flatMap(deserializeFromFile[T]) } override def close() { zk.close() } private def serializeIntoFile(path: String, value: AnyRef) { val serialized = serializer.newInstance().serialize(value) val bytes = new Array[Byte](serialized.remaining()) serialized.get(bytes) zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes) } private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = { val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename) try { Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData))) } catch { case e: Exception => logWarning("Exception while reading persisted file, deleting", e) zk.delete().forPath(WORKING_DIR + "/" + filename) None } } }
Example 33
Source File: Dependency.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark import scala.reflect.ClassTag import org.apache.spark.annotation.DeveloperApi import org.apache.spark.rdd.RDD import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.ShuffleHandle @DeveloperApi class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int) extends NarrowDependency[T](rdd) { override def getParents(partitionId: Int): List[Int] = { if (partitionId >= outStart && partitionId < outStart + length) { List(partitionId - outStart + inStart) } else { Nil } } }
Example 34
Source File: ShuffledRDD.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import scala.reflect.ClassTag import org.apache.spark._ import org.apache.spark.annotation.DeveloperApi import org.apache.spark.serializer.Serializer private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition { override val index: Int = idx override def hashCode(): Int = index override def equals(other: Any): Boolean = super.equals(other) } def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = { this.mapSideCombine = mapSideCombine this } override def getDependencies: Seq[Dependency[_]] = { val serializer = userSpecifiedSerializer.getOrElse { val serializerManager = SparkEnv.get.serializerManager if (mapSideCombine) { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]]) } else { serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]]) } } List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine)) } override val partitioner = Some(part) override def getPartitions: Array[Partition] = { Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i)) } override protected def getPreferredLocations(partition: Partition): Seq[String] = { val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster] val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] tracker.getPreferredLocationsForShuffle(dep, partition.index) } override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = { val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]] SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context) .read() .asInstanceOf[Iterator[(K, C)]] } override def clearDependencies() { super.clearDependencies() prev = null } }
Example 35
Source File: CustomRecoveryModeFactory.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// This file is placed in different package to make sure all of these components work well // when they are outside of org.apache.spark. package other.supplier import java.nio.ByteBuffer import scala.collection.mutable import scala.reflect.ClassTag import org.apache.spark.SparkConf import org.apache.spark.deploy.master._ import org.apache.spark.serializer.Serializer class CustomRecoveryModeFactory( conf: SparkConf, serializer: Serializer ) extends StandaloneRecoveryModeFactory(conf, serializer) { CustomRecoveryModeFactory.instantiationAttempts += 1 override def read[T: ClassTag](prefix: String): Seq[T] = { CustomPersistenceEngine.readAttempts += 1 val results = for ((name, bytes) <- data; if name.startsWith(prefix)) yield serializer.newInstance().deserialize[T](ByteBuffer.wrap(bytes)) results.toSeq } } object CustomPersistenceEngine { @volatile var persistAttempts = 0 @volatile var unpersistAttempts = 0 @volatile var readAttempts = 0 @volatile var lastInstance: Option[CustomPersistenceEngine] = None } class CustomLeaderElectionAgent(val masterInstance: LeaderElectable) extends LeaderElectionAgent { masterInstance.electedLeader() }
Example 36
Source File: MapJoinPartitionsRDDV2.scala From spark-vlbfgs with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd import java.io.{IOException, ObjectOutputStream} import org.apache.spark.serializer.Serializer import org.apache.spark.{TaskContext, _} import org.apache.spark.util.Utils import scala.reflect.ClassTag class MapJoinPartitionsPartitionV2( idx: Int, @transient private val rdd1: RDD[_], @transient private val rdd2: RDD[_], s2IdxArr: Array[Int]) extends Partition { var s1 = rdd1.partitions(idx) var s2Arr = s2IdxArr.map(s2Idx => rdd2.partitions(s2Idx)) override val index: Int = idx @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { s1 = rdd1.partitions(idx) s2Arr = s2IdxArr.map(s2Idx => rdd2.partitions(s2Idx)) oos.defaultWriteObject() } } class MapJoinPartitionsRDDV2[A: ClassTag, B: ClassTag, V: ClassTag]( sc: SparkContext, var idxF: (Int) => Array[Int], var f: (Int, Iterator[A], Array[(Int, Iterator[B])]) => Iterator[V], var rdd1: RDD[A], var rdd2: RDD[B], preservesPartitioning: Boolean = false) extends RDD[V](sc, Nil) { var rdd2WithPid = rdd2.mapPartitionsWithIndex((pid, iter) => iter.map(x => (pid, x))) private val serializer: Serializer = SparkEnv.get.serializer override def getPartitions: Array[Partition] = { val array = new Array[Partition](rdd1.partitions.length) for (s1 <- rdd1.partitions) { val idx = s1.index array(idx) = new MapJoinPartitionsPartitionV2(idx, rdd1, rdd2, idxF(idx)) } array } override def getDependencies: Seq[Dependency[_]] = List( new OneToOneDependency(rdd1), new ShuffleDependency[Int, B, B]( rdd2WithPid.asInstanceOf[RDD[_ <: Product2[Int, B]]], new IdentityPartitioner(rdd2WithPid.getNumPartitions), serializer) ) override def getPreferredLocations(s: Partition): Seq[String] = { val fp = firstParent[A] // println(s"pref loc: ${fp.preferredLocations(fp.partitions(s.index))}") fp.preferredLocations(fp.partitions(s.index)) } override def compute(split: Partition, context: TaskContext): Iterator[V] = { val currSplit = split.asInstanceOf[MapJoinPartitionsPartitionV2] val rdd2Dep = dependencies(1).asInstanceOf[ShuffleDependency[Int, Any, Any]] val rdd2PartIter = currSplit.s2Arr.map(s2 => (s2.index, SparkEnv.get.shuffleManager .getReader[Int, B](rdd2Dep.shuffleHandle, s2.index, s2.index + 1, context) .read().map(x => x._2) )) val rdd1Iter = rdd1.iterator(currSplit.s1, context) f(currSplit.s1.index, rdd1Iter, rdd2PartIter) } override def clearDependencies() { super.clearDependencies() rdd1 = null rdd2 = null rdd2WithPid = null idxF = null f = null } } private[spark] class IdentityPartitioner(val numParts: Int) extends Partitioner { require(numPartitions > 0) override def getPartition(key: Any): Int = key.asInstanceOf[Int] override def numPartitions: Int = numParts }
Example 37
Source File: MyNettyBlockRpcServer.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.client.{RpcResponseCallback, StreamCallbackWithID, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.remote.{HadoopFileSegmentManagedBuffer, MessageForHadoopManagedBuffers, RemoteShuffleManager}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.storage.{BlockId, ShuffleBlockId}

class MyNettyBlockRpcServer(
    appId: String,
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      rpcMessage: ByteBuffer,
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocksNum = openBlocks.blockIds.length
        val isShuffleRequest = (blocksNum > 0) &&
          BlockId.apply(openBlocks.blockIds(0)).isInstanceOf[ShuffleBlockId] &&
          (SparkEnv.get.conf.get("spark.shuffle.manager", classOf[SortShuffleManager].getName)
            == classOf[RemoteShuffleManager].getName)
        if (isShuffleRequest) {
          val blockIdAndManagedBufferPair =
            openBlocks.blockIds.map(block => (block, blockManager.getHostLocalShuffleData(
              BlockId.apply(block), Array.empty).asInstanceOf[HadoopFileSegmentManagedBuffer]))
          responseContext.onSuccess(new MessageForHadoopManagedBuffers(
            blockIdAndManagedBufferPair).toByteBuffer.nioBuffer())
        } else {
          // This customized Netty RPC server is only served for RemoteShuffle requests,
          // Other RPC messages or data chunks transferring should go through
          // NettyBlockTransferService' NettyBlockRpcServer
          throw new UnsupportedOperationException("MyNettyBlockRpcServer only serves remote" +
            " shuffle requests for OpenBlocks")
        }

      case uploadBlock: UploadBlock =>
        throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't serve UploadBlock")
    }
  }

  override def receiveStream(
      client: TransportClient,
      messageHeader: ByteBuffer,
      responseContext: RpcResponseCallback): StreamCallbackWithID = {
    throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't support receiving" +
      " stream")
  }

  override def getStreamManager(): StreamManager = streamManager
}
Example 38
Source File: NettyBlockRpcServer.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConverters._
import scala.language.existentials
import scala.reflect.ClassTag

import org.apache.spark.internal.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}

class NettyBlockRpcServer(
    appId: String,
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      rpcMessage: ByteBuffer,
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(appId, blocks.iterator.asJava)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer)

      case uploadBlock: UploadBlock =>
        // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer.
        val (level: StorageLevel, classTag: ClassTag[_]) = {
          serializer
            .newInstance()
            .deserialize(ByteBuffer.wrap(uploadBlock.metadata))
            .asInstanceOf[(StorageLevel, ClassTag[_])]
        }
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        val blockId = BlockId(uploadBlock.blockId)
        blockManager.putBlockData(blockId, data, level, classTag)
        responseContext.onSuccess(ByteBuffer.allocate(0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
}
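The UploadBlock branch expects its metadata bytes to decode into a (StorageLevel, ClassTag) pair, so whatever produces the message has to serialize the same tuple with the same Serializer. A small sketch of that sender-side step, using JavaSerializer purely to illustrate the symmetry (not the actual client code):

import java.nio.ByteBuffer

import scala.reflect.{classTag, ClassTag}

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.storage.StorageLevel

object UploadMetadataSketch {
  def main(args: Array[String]): Unit = {
    val instance = new JavaSerializer(new SparkConf()).newInstance()

    // Sender side: encode the (StorageLevel, ClassTag) pair as bytes.
    val metadata: ByteBuffer =
      instance.serialize((StorageLevel.MEMORY_ONLY, classTag[Array[Byte]]))
    val bytes = new Array[Byte](metadata.remaining())
    metadata.get(bytes)

    // Receiver side: decode it the same way the RPC handler above does.
    val (level: StorageLevel, tag: ClassTag[_]) =
      instance.deserialize(ByteBuffer.wrap(bytes)).asInstanceOf[(StorageLevel, ClassTag[_])]
    println(s"level=$level, tag=$tag")
  }
}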
Example 39
Source File: FileSystemPersistenceEngine.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.io._

import scala.reflect.ClassTag

import org.apache.spark.internal.Logging
import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer}
import org.apache.spark.util.Utils

private[master] class FileSystemPersistenceEngine(
    val dir: String,
    val serializer: Serializer)
  extends PersistenceEngine with Logging {

  new File(dir).mkdir()

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(new File(dir + File.separator + name), obj)
  }

  override def unpersist(name: String): Unit = {
    val f = new File(dir + File.separator + name)
    if (!f.delete()) {
      logWarning(s"Error deleting ${f.getPath()}")
    }
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix))
    files.map(deserializeFromFile[T])
  }

  private def serializeIntoFile(file: File, value: AnyRef) {
    val created = file.createNewFile()
    if (!created) { throw new IllegalStateException("Could not create file: " + file) }
    val fileOut = new FileOutputStream(file)
    var out: SerializationStream = null
    Utils.tryWithSafeFinally {
      out = serializer.newInstance().serializeStream(fileOut)
      out.writeObject(value)
    } {
      fileOut.close()
      if (out != null) {
        out.close()
      }
    }
  }

  private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = {
    val fileIn = new FileInputStream(file)
    var in: DeserializationStream = null
    try {
      in = serializer.newInstance().deserializeStream(fileIn)
      in.readObject[T]()
    } finally {
      fileIn.close()
      if (in != null) {
        in.close()
      }
    }
  }
}
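For callers outside Spark internals, the same stream API can be exercised directly. A minimal sketch of writing and reading a file with serializeStream/deserializeStream, assuming JavaSerializer and a temporary file chosen only for illustration:

import java.io.{File, FileInputStream, FileOutputStream}

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

object StreamRoundTrip {
  def main(args: Array[String]): Unit = {
    val serializer = new JavaSerializer(new SparkConf())
    val file = File.createTempFile("persisted", ".bin")

    // Write one object to the file through a SerializationStream.
    val out = serializer.newInstance().serializeStream(new FileOutputStream(file))
    try out.writeObject(Map("answer" -> 42)) finally out.close()

    // Read it back through a DeserializationStream.
    val in = serializer.newInstance().deserializeStream(new FileInputStream(file))
    try println(in.readObject[Map[String, Int]]()) finally in.close()
  }
}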
Example 40
Source File: RecoveryModeFactory.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer

private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
}
Example 41
Source File: ZooKeeperPersistenceEngine.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.nio.ByteBuffer

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkCuratorUtil
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer

private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer)
  extends PersistenceEngine with Logging {

  private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
  private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(WORKING_DIR + "/" + name, obj)
  }

  override def unpersist(name: String): Unit = {
    zk.delete().forPath(WORKING_DIR + "/" + name)
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    zk.getChildren.forPath(WORKING_DIR).asScala
      .filter(_.startsWith(prefix)).flatMap(deserializeFromFile[T])
  }

  override def close() {
    zk.close()
  }

  private def serializeIntoFile(path: String, value: AnyRef) {
    val serialized = serializer.newInstance().serialize(value)
    val bytes = new Array[Byte](serialized.remaining())
    serialized.get(bytes)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes)
  }

  private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
    val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
    try {
      Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData)))
    } catch {
      case e: Exception =>
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(WORKING_DIR + "/" + filename)
        None
    }
  }
}
Example 42
Source File: ShuffledRDD.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx

  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)
}

  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get.serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get.mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
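In getDependencies above, the serializer handed to the ShuffleDependency comes either from a user-specified serializer or from the SerializerManager, which honours the spark.serializer setting. A minimal sketch of steering that choice from application code, assuming a local-mode job and Kryo (a standard Spark option, shown only as an illustration):

import org.apache.spark.{SparkConf, SparkContext}

object KryoShuffleExample {
  def main(args: Array[String]): Unit = {
    // Ask Spark to back its Serializer with Kryo; shuffle data for this job
    // will then be written and read through KryoSerializer instances.
    val conf = new SparkConf()
      .setAppName("kryo-shuffle")
      .setMaster("local[2]")
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

    val sc = new SparkContext(conf)
    val counts = sc.parallelize(Seq("a" -> 1, "b" -> 2, "a" -> 3))
      .reduceByKey(_ + _)   // triggers a ShuffledRDD under the hood
      .collect()

    counts.foreach(println)
    sc.stop()
  }
}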
Example 43
Source File: NettyBlockRpcServer.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConverters._
import scala.language.existentials
import scala.reflect.ClassTag

import org.apache.spark.internal.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, MapOutputReady, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}

class NettyBlockRpcServer(
    appId: String,
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      rpcMessage: ByteBuffer,
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(appId, blocks.iterator.asJava)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer)

      case uploadBlock: UploadBlock =>
        // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer.
        val (level: StorageLevel, classTag: ClassTag[_]) = {
          serializer
            .newInstance()
            .deserialize(ByteBuffer.wrap(uploadBlock.metadata))
            .asInstanceOf[(StorageLevel, ClassTag[_])]
        }
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        val blockId = BlockId(uploadBlock.blockId)
        blockManager.putBlockData(blockId, data, level, classTag)
        responseContext.onSuccess(ByteBuffer.allocate(0))

      case mapOutputReady: MapOutputReady =>
        val mapStatus: MapStatus =
          serializer.newInstance().deserialize(ByteBuffer.wrap(mapOutputReady.serializedMapStatus))
        blockManager.mapOutputReady(
          mapOutputReady.shuffleId, mapOutputReady.mapId, mapOutputReady.numReduces, mapStatus)
    }
  }

  override def getStreamManager(): StreamManager = streamManager
}
Example 44
Source File: NettyBlockRpcServer.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConversions._

import org.apache.spark.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}

class NettyBlockRpcServer(
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      messageBytes: Array[Byte],
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteArray(messageBytes)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(blocks.iterator)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteArray)

      case uploadBlock: UploadBlock =>
        // StorageLevel is serialized as bytes using our JavaSerializer.
        val level: StorageLevel =
          serializer.newInstance().deserialize(ByteBuffer.wrap(uploadBlock.metadata))
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        blockManager.putBlockData(BlockId(uploadBlock.blockId), data, level)
        responseContext.onSuccess(new Array[Byte](0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
}
Example 45
Source File: HashShuffleReader.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import org.apache.spark.{InterruptibleIterator, TaskContext}
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.{BaseShuffleHandle, ShuffleReader}
import org.apache.spark.util.collection.ExternalSorter

private[spark] class HashShuffleReader[K, C](
    handle: BaseShuffleHandle[K, _, C],
    startPartition: Int,
    endPartition: Int,
    context: TaskContext)
  extends ShuffleReader[K, C] {

  require(endPartition == startPartition + 1,
    "Hash shuffle currently only supports fetching one partition")

  private val dep = handle.dependency

  override def read(): Iterator[Product2[K, C]] = {
    val ser = Serializer.getSerializer(dep.serializer)
    val iter = BlockStoreShuffleFetcher.fetch(handle.shuffleId, startPartition, context, ser)

    val aggregatedIter: Iterator[Product2[K, C]] = if (dep.aggregator.isDefined) {
      if (dep.mapSideCombine) {
        new InterruptibleIterator(context, dep.aggregator.get.combineCombinersByKey(iter, context))
      } else {
        new InterruptibleIterator(context, dep.aggregator.get.combineValuesByKey(iter, context))
      }
    } else {
      require(!dep.mapSideCombine, "Map-side combine without Aggregator specified!")

      // Convert the Product2s to pairs since this is what downstream RDDs currently expect
      iter.asInstanceOf[Iterator[Product2[K, C]]].map(pair => (pair._1, pair._2))
    }

    // Sort the output if there is a sort ordering defined.
    dep.keyOrdering match {
      case Some(keyOrd: Ordering[K]) =>
        // Create an ExternalSorter to sort the data. Note that if spark.shuffle.spill is disabled,
        // the ExternalSorter won't spill to disk.
        val sorter = new ExternalSorter[K, C, C](ordering = Some(keyOrd), serializer = Some(ser))
        sorter.insertAll(aggregatedIter)
        context.taskMetrics.incMemoryBytesSpilled(sorter.memoryBytesSpilled)
        context.taskMetrics.incDiskBytesSpilled(sorter.diskBytesSpilled)
        sorter.iterator
      case None =>
        aggregatedIter
    }
  }
}
Example 46
Source File: BlockStoreShuffleFetcher.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.util.{Failure, Success, Try}

import org.apache.spark._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator

private[hash] object BlockStoreShuffleFetcher extends Logging {
  def fetch[T](
      shuffleId: Int,
      reduceId: Int,
      context: TaskContext,
      serializer: Serializer)
    : Iterator[T] = {
    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
    val blockManager = SparkEnv.get.blockManager

    val startTime = System.currentTimeMillis
    val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId)
    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
      shuffleId, reduceId, System.currentTimeMillis - startTime))

    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
    for (((address, size), index) <- statuses.zipWithIndex) {
      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
    }

    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map {
      case (address, splits) =>
        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
    }

    def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])): Iterator[T] = {
      val blockId = blockPair._1
      val blockOption = blockPair._2
      blockOption match {
        case Success(block) => {
          block.asInstanceOf[Iterator[T]]
        }
        case Failure(e) => {
          blockId match {
            case ShuffleBlockId(shufId, mapId, _) =>
              val address = statuses(mapId.toInt)._1
              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
            case _ =>
              throw new SparkException(
                "Failed to get block " + blockId + ", which is not a shuffle block", e)
          }
        }
      }
    }

    val blockFetcherItr = new ShuffleBlockFetcherIterator(
      context,
      SparkEnv.get.blockManager.shuffleClient,
      blockManager,
      blocksByAddress,
      serializer,
      SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024)
    val itr = blockFetcherItr.flatMap(unpackBlock)

    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
      context.taskMetrics.updateShuffleReadMetrics()
    })

    new InterruptibleIterator[T](context, completionIter) {
      val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
      override def next(): T = {
        readMetrics.incRecordsRead(1)
        delegate.next()
      }
    }
  }
}
Example 47
Source File: HashShuffleWriter.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle._
import org.apache.spark.storage.BlockObjectWriter

private[spark] class HashShuffleWriter[K, V](
    shuffleBlockManager: FileShuffleBlockManager,
    handle: BaseShuffleHandle[K, V, _],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency
  private val numOutputSplits = dep.partitioner.numPartitions
  private val metrics = context.taskMetrics

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private val writeMetrics = new ShuffleWriteMetrics()
  metrics.shuffleWriteMetrics = Some(writeMetrics)

  private val blockManager = SparkEnv.get.blockManager
  private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null))
  private val shuffle = shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser,
    writeMetrics)

  override def stop(initiallySuccess: Boolean): Option[MapStatus] = {
    var success = initiallySuccess
    try {
      if (stopping) {
        return None
      }
      stopping = true
      if (success) {
        try {
          Some(commitWritesAndBuildStatus())
        } catch {
          case e: Exception =>
            success = false
            revertWrites()
            throw e
        }
      } else {
        revertWrites()
        None
      }
    } finally {
      // Release the writers back to the shuffle block manager.
      if (shuffle != null && shuffle.writers != null) {
        try {
          shuffle.releaseWriters(success)
        } catch {
          case e: Exception => logError("Failed to release shuffle writers", e)
        }
      }
    }
  }

  private def commitWritesAndBuildStatus(): MapStatus = {
    // Commit the writes. Get the size of each bucket block (total block size).
    val sizes: Array[Long] = shuffle.writers.map { writer: BlockObjectWriter =>
      writer.commitAndClose()
      writer.fileSegment().length
    }
    MapStatus(blockManager.shuffleServerId, sizes)
  }

  private def revertWrites(): Unit = {
    if (shuffle != null && shuffle.writers != null) {
      for (writer <- shuffle.writers) {
        writer.revertPartialWritesAndClose()
      }
    }
  }
}
Example 48
Source File: Dependency.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.ShuffleHandle

@DeveloperApi
class RangeDependency[T](rdd: RDD[T], inStart: Int, outStart: Int, length: Int)
  extends NarrowDependency[T](rdd) {

  override def getParents(partitionId: Int) = {
    if (partitionId >= outStart && partitionId < outStart + length) {
      List(partitionId - outStart + inStart)
    } else {
      Nil
    }
  }
}
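RangeDependency maps a contiguous range of child partitions back onto a parent RDD's partitions (UnionRDD, for example, builds one per parent), so the index arithmetic above is worth a quick check. A small sketch that mirrors getParents with made-up partition indices, purely for illustration:

// Mirrors RangeDependency.getParents: the parent's partitions occupy
// child indices [outStart, outStart + length).
def parentsOf(partitionId: Int, inStart: Int, outStart: Int, length: Int): List[Int] =
  if (partitionId >= outStart && partitionId < outStart + length) {
    List(partitionId - outStart + inStart)
  } else {
    Nil
  }

// With inStart = 0, outStart = 3, length = 4:
// child partition 5 maps to parent partition 5 - 3 + 0 = 2,
// while child partition 1 lies outside the range, so the result is Nil.
assert(parentsOf(5, inStart = 0, outStart = 3, length = 4) == List(2))
assert(parentsOf(1, inStart = 0, outStart = 3, length = 4) == Nil)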
Example 49
Source File: ShuffledRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index = idx
  override def hashCode(): Int = idx
}

  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv.get.shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}
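ShuffledRDD is a @DeveloperApi class, so the serializer it feeds into its ShuffleDependency can also be pinned per RDD rather than globally. A minimal sketch, assuming a local SparkContext and KryoSerializer chosen only for illustration (the DeveloperApi surface may change between Spark versions):

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.KryoSerializer

object ExplicitShuffleSerializer {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("explicit-serializer").setMaster("local[2]")
    val sc = new SparkContext(conf)

    val pairs = sc.parallelize(Seq("a" -> 1, "b" -> 2, "a" -> 3))

    // Build a ShuffledRDD directly and pin its shuffle serializer to Kryo.
    val shuffled = new ShuffledRDD[String, Int, Int](pairs, new HashPartitioner(2))
      .setSerializer(new KryoSerializer(conf))

    shuffled.collect().foreach(println)
    sc.stop()
  }
}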
Example 50
Source File: SubtractedRDD.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions._
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import org.apache.spark.Dependency
import org.apache.spark.OneToOneDependency
import org.apache.spark.Partition
import org.apache.spark.Partitioner
import org.apache.spark.ShuffleDependency
import org.apache.spark.SparkEnv
import org.apache.spark.TaskContext
import org.apache.spark.serializer.Serializer

  def setSerializer(serializer: Serializer): SubtractedRDD[K, V, W] = {
    this.serializer = Option(serializer)
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    Seq(rdd1, rdd2).map { rdd =>
      if (rdd.partitioner == Some(part)) {
        logDebug("Adding one-to-one dependency with " + rdd)
        new OneToOneDependency(rdd)
      } else {
        logDebug("Adding shuffle dependency with " + rdd)
        new ShuffleDependency(rdd, part, serializer)
      }
    }
  }

  override def getPartitions: Array[Partition] = {
    val array = new Array[Partition](part.numPartitions)
    for (i <- 0 until array.size) {
      // Each CoGroupPartition will depend on rdd1 and rdd2
      array(i) = new CoGroupPartition(i, Seq(rdd1, rdd2).zipWithIndex.map { case (rdd, j) =>
        dependencies(j) match {
          case s: ShuffleDependency[_, _, _] =>
            new ShuffleCoGroupSplitDep(s.shuffleHandle)
          case _ =>
            new NarrowCoGroupSplitDep(rdd, i, rdd.partitions(i))
        }
      }.toArray)
    }
    array
  }

  override val partitioner = Some(part)

  override def compute(p: Partition, context: TaskContext): Iterator[(K, V)] = {
    val partition = p.asInstanceOf[CoGroupPartition]
    val map = new JHashMap[K, ArrayBuffer[V]]
    def getSeq(k: K): ArrayBuffer[V] = {
      val seq = map.get(k)
      if (seq != null) {
        seq
      } else {
        val seq = new ArrayBuffer[V]()
        map.put(k, seq)
        seq
      }
    }
    def integrate(dep: CoGroupSplitDep, op: Product2[K, V] => Unit) = dep match {
      case NarrowCoGroupSplitDep(rdd, _, itsSplit) =>
        rdd.iterator(itsSplit, context).asInstanceOf[Iterator[Product2[K, V]]].foreach(op)
      case ShuffleCoGroupSplitDep(handle) =>
        val iter = SparkEnv.get.shuffleManager
          .getReader(handle, partition.index, partition.index + 1, context)
          .read()
        iter.foreach(op)
    }

    // the first dep is rdd1; add all values to the map
    integrate(partition.deps(0), t => getSeq(t._1) += t._2)

    // the second dep is rdd2; remove all of its keys
    integrate(partition.deps(1), t => map.remove(t._1))

    map.iterator.map { t => t._2.iterator.map { (t._1, _) } }.flatten
  }

  override def clearDependencies() {
    super.clearDependencies()
    rdd1 = null
    rdd2 = null
  }
}
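SubtractedRDD is what backs PairRDDFunctions.subtractByKey, so the serializer selected via setSerializer above is exercised whenever the two inputs are not already co-partitioned. A short usage sketch of the public entry point, assuming a local SparkContext set up only for illustration:

import org.apache.spark.{SparkConf, SparkContext}

object SubtractByKeyExample {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setAppName("subtract").setMaster("local[2]"))

    val left = sc.parallelize(Seq("a" -> 1, "b" -> 2, "c" -> 3))
    val right = sc.parallelize(Seq("b" -> 99))

    // Keeps only keys of `left` that do not appear in `right`; both sides are
    // shuffled through the configured Serializer when their partitioners differ.
    val result = left.subtractByKey(right).collect().toMap

    assert(result == Map("a" -> 1, "c" -> 3))
    sc.stop()
  }
}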
Example 51
Source File: NettyBlockRpcServer.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.collection.JavaConverters._
import scala.language.existentials
import scala.reflect.ClassTag

import org.apache.spark.internal.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.client.{RpcResponseCallback, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol.{BlockTransferMessage, OpenBlocks, StreamHandle, UploadBlock}
import org.apache.spark.serializer.Serializer
import org.apache.spark.storage.{BlockId, StorageLevel}

class NettyBlockRpcServer(
    appId: String,
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      rpcMessage: ByteBuffer,
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocks: Seq[ManagedBuffer] =
          openBlocks.blockIds.map(BlockId.apply).map(blockManager.getBlockData)
        val streamId = streamManager.registerStream(appId, blocks.iterator.asJava)
        logTrace(s"Registered streamId $streamId with ${blocks.size} buffers")
        responseContext.onSuccess(new StreamHandle(streamId, blocks.size).toByteBuffer)

      case uploadBlock: UploadBlock =>
        // StorageLevel and ClassTag are serialized as bytes using our JavaSerializer.
        val (level: StorageLevel, classTag: ClassTag[_]) = {
          serializer
            .newInstance()
            .deserialize(ByteBuffer.wrap(uploadBlock.metadata))
            .asInstanceOf[(StorageLevel, ClassTag[_])]
        }
        val data = new NioManagedBuffer(ByteBuffer.wrap(uploadBlock.blockData))
        val blockId = BlockId(uploadBlock.blockId)
        blockManager.putBlockData(blockId, data, level, classTag)
        responseContext.onSuccess(ByteBuffer.allocate(0))
    }
  }

  override def getStreamManager(): StreamManager = streamManager
}
Example 52
Source File: FileSystemPersistenceEngine.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.io._

import scala.reflect.ClassTag

import org.apache.spark.internal.Logging
import org.apache.spark.serializer.{DeserializationStream, SerializationStream, Serializer}
import org.apache.spark.util.Utils

private[master] class FileSystemPersistenceEngine(
    val dir: String,
    val serializer: Serializer)
  extends PersistenceEngine with Logging {

  new File(dir).mkdir()

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(new File(dir + File.separator + name), obj)
  }

  override def unpersist(name: String): Unit = {
    val f = new File(dir + File.separator + name)
    if (!f.delete()) {
      logWarning(s"Error deleting ${f.getPath()}")
    }
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    val files = new File(dir).listFiles().filter(_.getName.startsWith(prefix))
    files.map(deserializeFromFile[T])
  }

  private def serializeIntoFile(file: File, value: AnyRef) {
    val created = file.createNewFile()
    if (!created) { throw new IllegalStateException("Could not create file: " + file) }
    val fileOut = new FileOutputStream(file)
    var out: SerializationStream = null
    Utils.tryWithSafeFinally {
      out = serializer.newInstance().serializeStream(fileOut)
      out.writeObject(value)
    } {
      fileOut.close()
      if (out != null) {
        out.close()
      }
    }
  }

  private def deserializeFromFile[T](file: File)(implicit m: ClassTag[T]): T = {
    val fileIn = new FileInputStream(file)
    var in: DeserializationStream = null
    try {
      in = serializer.newInstance().deserializeStream(fileIn)
      in.readObject[T]()
    } finally {
      fileIn.close()
      if (in != null) {
        in.close()
      }
    }
  }
}
Example 53
Source File: RecoveryModeFactory.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import org.apache.spark.SparkConf
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer

private[master] class FileSystemRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) with Logging {

  val RECOVERY_DIR = conf.get("spark.deploy.recoveryDirectory", "")

  def createPersistenceEngine(): PersistenceEngine = {
    logInfo("Persisting recovery state to directory: " + RECOVERY_DIR)
    new FileSystemPersistenceEngine(RECOVERY_DIR, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new MonarchyLeaderAgent(master)
  }
}

private[master] class ZooKeeperRecoveryModeFactory(conf: SparkConf, serializer: Serializer)
  extends StandaloneRecoveryModeFactory(conf, serializer) {

  def createPersistenceEngine(): PersistenceEngine = {
    new ZooKeeperPersistenceEngine(conf, serializer)
  }

  def createLeaderElectionAgent(master: LeaderElectable): LeaderElectionAgent = {
    new ZooKeeperLeaderElectionAgent(master, conf)
  }
}
Example 54
Source File: ZooKeeperPersistenceEngine.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master

import java.nio.ByteBuffer

import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import org.apache.curator.framework.CuratorFramework
import org.apache.zookeeper.CreateMode

import org.apache.spark.SparkConf
import org.apache.spark.deploy.SparkCuratorUtil
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.Serializer

private[master] class ZooKeeperPersistenceEngine(conf: SparkConf, val serializer: Serializer)
  extends PersistenceEngine with Logging {

  private val WORKING_DIR = conf.get("spark.deploy.zookeeper.dir", "/spark") + "/master_status"
  private val zk: CuratorFramework = SparkCuratorUtil.newClient(conf)

  SparkCuratorUtil.mkdir(zk, WORKING_DIR)

  override def persist(name: String, obj: Object): Unit = {
    serializeIntoFile(WORKING_DIR + "/" + name, obj)
  }

  override def unpersist(name: String): Unit = {
    zk.delete().forPath(WORKING_DIR + "/" + name)
  }

  override def read[T: ClassTag](prefix: String): Seq[T] = {
    zk.getChildren.forPath(WORKING_DIR).asScala
      .filter(_.startsWith(prefix)).flatMap(deserializeFromFile[T])
  }

  override def close() {
    zk.close()
  }

  private def serializeIntoFile(path: String, value: AnyRef) {
    val serialized = serializer.newInstance().serialize(value)
    val bytes = new Array[Byte](serialized.remaining())
    serialized.get(bytes)
    zk.create().withMode(CreateMode.PERSISTENT).forPath(path, bytes)
  }

  private def deserializeFromFile[T](filename: String)(implicit m: ClassTag[T]): Option[T] = {
    val fileData = zk.getData().forPath(WORKING_DIR + "/" + filename)
    try {
      Some(serializer.newInstance().deserialize[T](ByteBuffer.wrap(fileData)))
    } catch {
      case e: Exception =>
        logWarning("Exception while reading persisted file, deleting", e)
        zk.delete().forPath(WORKING_DIR + "/" + filename)
        None
    }
  }
}
Example 55
Source File: ShuffledRDD.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import scala.reflect.ClassTag

import org.apache.hadoop.security.UserGroupInformation

import org.apache.spark._
import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.serializer.Serializer

private[spark] class ShuffledRDDPartition(val idx: Int) extends Partition {
  override val index: Int = idx

  override def hashCode(): Int = index

  override def equals(other: Any): Boolean = super.equals(other)
}

  def setMapSideCombine(mapSideCombine: Boolean): ShuffledRDD[K, V, C] = {
    this.mapSideCombine = mapSideCombine
    this
  }

  override def getDependencies: Seq[Dependency[_]] = {
    val serializer = userSpecifiedSerializer.getOrElse {
      val serializerManager = SparkEnv.get(user).serializerManager
      if (mapSideCombine) {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[C]])
      } else {
        serializerManager.getSerializer(implicitly[ClassTag[K]], implicitly[ClassTag[V]])
      }
    }
    List(new ShuffleDependency(prev, part, serializer, keyOrdering, aggregator, mapSideCombine))
  }

  override val partitioner = Some(part)

  override def getPartitions: Array[Partition] = {
    Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
  }

  override protected def getPreferredLocations(partition: Partition): Seq[String] = {
    val tracker = SparkEnv.get(user).mapOutputTracker.asInstanceOf[MapOutputTrackerMaster]
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    tracker.getPreferredLocationsForShuffle(dep, partition.index)
  }

  override def compute(split: Partition, context: TaskContext): Iterator[(K, C)] = {
    val dep = dependencies.head.asInstanceOf[ShuffleDependency[K, V, C]]
    SparkEnv
      .get(user)
      .shuffleManager.getReader(dep.shuffleHandle, split.index, split.index + 1, context)
      .read()
      .asInstanceOf[Iterator[(K, C)]]
  }

  override def clearDependencies() {
    super.clearDependencies()
    prev = null
  }
}