org.apache.spark.shuffle.sort.SortShuffleManager Scala Examples
The following examples show how to use org.apache.spark.shuffle.sort.SortShuffleManager.
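Before the examples, a minimal sketch of how the sort-based shuffle manager is typically selected. SortShuffleManager is Spark's default shuffle implementation, and it can be chosen explicitly through the spark.shuffle.manager configuration key ("sort" is the short alias). The configuration keys and class names below are real; the application name and master are illustrative.

import org.apache.spark.{SparkConf, SparkContext}

// Minimal sketch: select the sort-based shuffle manager explicitly.
// "sort" resolves to org.apache.spark.shuffle.sort.SortShuffleManager,
// which is also the default when the key is left unset.
val conf = new SparkConf()
  .setAppName("sort-shuffle-demo")   // illustrative name
  .setMaster("local[*]")             // illustrative master
  .set("spark.shuffle.manager", "sort")

val sc = new SparkContext(conf)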
Example 1
Source File: SortShuffleSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
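The two tests above hinge on when SortShuffleManager can take the serialized (tungsten-sort) map output path. The helper below is not Spark's implementation, only a standalone sketch of the three conditions usually cited for SortShuffleManager.canUseSerializedShuffle: the serializer must support relocation of serialized objects (Kryo generally does, Java serialization does not), there must be no map-side aggregation, and the partition count must fit within the packed record pointer limit of 2^24.

// A standalone sketch (not Spark's actual code) of the serialized-shuffle check.
def couldUseSerializedShuffle(
    serializerSupportsRelocation: Boolean,
    hasMapSideAggregation: Boolean,
    numPartitions: Int): Boolean = {
  serializerSupportsRelocation &&
    !hasMapSideAggregation &&
    numPartitions <= (1 << 24)  // PackedRecordPointer.MAXIMUM_PARTITION_ID + 1
}

// Mirrors the two tests: Kryo-backed, no aggregation, 4 partitions => serialized path;
// Java serialization => deserialized path.
assert(couldUseSerializedShuffle(serializerSupportsRelocation = true,
  hasMapSideAggregation = false, numPartitions = 4))
assert(!couldUseSerializedShuffle(serializerSupportsRelocation = false,
  hasMapSideAggregation = false, numPartitions = 4))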
Example 2
Source File: MyNettyBlockRpcServer.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.client.{RpcResponseCallback, StreamCallbackWithID, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.remote.{HadoopFileSegmentManagedBuffer, MessageForHadoopManagedBuffers, RemoteShuffleManager}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.storage.{BlockId, ShuffleBlockId}

class MyNettyBlockRpcServer(
    appId: String,
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  override def receive(
      client: TransportClient,
      rpcMessage: ByteBuffer,
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        val blocksNum = openBlocks.blockIds.length
        val isShuffleRequest = (blocksNum > 0) &&
          BlockId.apply(openBlocks.blockIds(0)).isInstanceOf[ShuffleBlockId] &&
          (SparkEnv.get.conf.get("spark.shuffle.manager", classOf[SortShuffleManager].getName)
            == classOf[RemoteShuffleManager].getName)
        if (isShuffleRequest) {
          val blockIdAndManagedBufferPair = openBlocks.blockIds.map(block =>
            (block, blockManager.getHostLocalShuffleData(
              BlockId.apply(block), Array.empty).asInstanceOf[HadoopFileSegmentManagedBuffer]))
          responseContext.onSuccess(new MessageForHadoopManagedBuffers(
            blockIdAndManagedBufferPair).toByteBuffer.nioBuffer())
        } else {
          // This customized Netty RPC server only serves RemoteShuffle requests.
          // Other RPC messages or data-chunk transfers should go through
          // NettyBlockTransferService's NettyBlockRpcServer.
          throw new UnsupportedOperationException("MyNettyBlockRpcServer only serves remote" +
            " shuffle requests for OpenBlocks")
        }

      case uploadBlock: UploadBlock =>
        throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't serve UploadBlock")
    }
  }

  override def receiveStream(
      client: TransportClient,
      messageHeader: ByteBuffer,
      responseContext: RpcResponseCallback): StreamCallbackWithID = {
    throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't support receiving" +
      " stream")
  }

  override def getStreamManager(): StreamManager = streamManager
}
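The isShuffleRequest check above compares the configured shuffle manager class against RemoteShuffleManager, falling back to SortShuffleManager when spark.shuffle.manager is unset. A small sketch of that lookup in isolation; the helper name is ours, while SparkConf.get with a default value is a real API.

import org.apache.spark.SparkConf
import org.apache.spark.shuffle.sort.SortShuffleManager

// Hypothetical helper mirroring the check above: which shuffle manager class
// is configured, falling back to the sort-based default?
def configuredShuffleManager(conf: SparkConf): String =
  conf.get("spark.shuffle.manager", classOf[SortShuffleManager].getName)

// With the key unset, this resolves to SortShuffleManager's class name.
val managerClass = configuredShuffleManager(new SparkConf(false))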
Example 3
Source File: SortShuffleSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 4
Source File: SortShuffleSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 5
Source File: SortShuffleSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    // Once 'spark.local.dir' is set, it is cached. Unless this is manually cleared
    // before/after a test, it could return the same directory even if this property
    // is configured.
    Utils.clearLocalRootDirs()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
      Utils.clearLocalRootDirs()
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 6
Source File: SortShuffleSuite.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 7
Source File: HashPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object HashPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply(origin: RDD[(Any, InternalRow)], num_partitions: Int): RDD[(Any, InternalRow)] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter => iter.map(row => (row._1, row._2.copy())) }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Any, InternalRow]()
        iter.map(row => mutablePair.update(row._1, row._2.copy()))
      }
    }

    val part = new HashPartitioner(num_partitions)
    new ShuffledRDD[Any, InternalRow, InternalRow](rdd, part)
  }
}

class HashPartitioner(num_partitions: Int) extends Partitioner {
  override def numPartitions: Int = num_partitions

  override def getPartition(key: Any): Int = {
    key.hashCode() % num_partitions
  }
}
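One caveat worth noting: key.hashCode() % num_partitions can be negative when a key's hash code is negative, while Partitioner.getPartition is expected to return a value in [0, numPartitions). A defensive variant, sketched here as standalone code in the spirit of Spark's Utils.nonNegativeMod rather than as part of the Simba project:

// Standalone sketch of a non-negative modulo, useful if keys may hash negatively.
def nonNegativeMod(x: Int, mod: Int): Int = {
  val rawMod = x % mod
  rawMod + (if (rawMod < 0) mod else 0)
}

// e.g. nonNegativeMod(-7, 4) == 1, whereas -7 % 4 == -3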
Example 8
Source File: VoronoiPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object VoronoiPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply(origin: RDD[(Int, (Point, InternalRow))], pivot_to_group: Array[Int], num_group: Int)
    : RDD[(Int, (Point, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy()))) }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Int, (Point, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new VoronoiPartitioner(pivot_to_group, num_group)
    new ShuffledRDD[Int, (Point, InternalRow), (Point, InternalRow)](rdd, part)
  }
}

class VoronoiPartitioner(pivot_to_group: Array[Int], num_group: Int) extends Partitioner {
  override def numPartitions: Int = num_group

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    pivot_to_group(k)
  }
}
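The pivot_to_group array maps each pivot (Voronoi cell) index to the group it belongs to, so getPartition is a simple table lookup. A toy illustration using the class defined above, with hypothetical data:

// Hypothetical mapping: pivots 0 and 1 belong to group 0, pivots 2 and 3 to group 1.
val voronoi = new VoronoiPartitioner(pivot_to_group = Array(0, 0, 1, 1), num_group = 2)
assert(voronoi.numPartitions == 2)
assert(voronoi.getPartition(1) == 0)
assert(voronoi.getPartition(2) == 1)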
Example 9
Source File: RangeDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.util.CollectionsUtils
import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

import scala.reflect.ClassTag

object RangeDPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply[K: Ordering: ClassTag, T](origin: RDD[(K, (T, InternalRow))],
                                      range_bounds: Array[K]): RDD[(K, (T, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy()))) }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[K, (T, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new RangeDPartitioner(range_bounds, ascending = true)
    new ShuffledRDD[K, (T, InternalRow), (T, InternalRow)](rdd, part)
  }
}

class RangeDPartitioner[K: Ordering: ClassTag](range_bounds: Array[K],
                                               ascending: Boolean) extends Partitioner {
  def numPartitions: Int = range_bounds.length + 1

  private val binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[K]
    var partition = 0
    if (range_bounds.length < 128) {
      while (partition < range_bounds.length && Ordering[K].gt(k, range_bounds(partition)))
        partition += 1
    } else {
      partition = binarySearch(range_bounds, k)
      if (partition < 0) partition = -partition - 1
      if (partition > range_bounds.length) partition = range_bounds.length
    }
    if (ascending) partition
    else range_bounds.length - partition
  }
}
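getPartition walks the sorted range_bounds (or binary-searches them once there are 128 or more) to find the first bound that is not smaller than the key; keys beyond the last bound fall into the extra trailing partition. A toy illustration using the class defined above, with hypothetical bounds:

// Hypothetical bounds: keys <= 10 -> partition 0, (10, 20] -> partition 1, > 20 -> partition 2.
val ranged = new RangeDPartitioner[Int](range_bounds = Array(10, 20), ascending = true)
assert(ranged.numPartitions == 3)
assert(ranged.getPartition(5) == 0)
assert(ranged.getPartition(15) == 1)
assert(ranged.getPartition(42) == 2)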
Example 10
Source File: MapDPartitioner.scala From Simba with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object MapDPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply[T](origin: RDD[(Int, (T, InternalRow))],
               num_partitions: Int): RDD[(Int, (T, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter => iter.map(row => (row._1, (row._2._1, row._2._2.copy()))) }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Int, (T, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new MapDPartitioner(num_partitions)
    new ShuffledRDD[Int, (T, InternalRow), (T, InternalRow)](rdd, part)
  }
}

class MapDPartitioner(num_partitions: Int) extends Partitioner {
  def numPartitions: Int = num_partitions

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    require(k >= 0 && k < num_partitions)
    k
  }
}
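MapDPartitioner uses the integer key itself as the partition id, so callers must ensure keys already lie in [0, num_partitions). A toy illustration using the class defined above:

// The key is used directly as the partition id.
val mapped = new MapDPartitioner(num_partitions = 4)
assert(mapped.getPartition(3) == 3)
// mapped.getPartition(7) would fail the require(k >= 0 && k < num_partitions) check.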