org.apache.spark.rdd.ShuffledRDD Scala Examples
The following examples show how to use org.apache.spark.rdd.ShuffledRDD.
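Before the project-specific examples, here is a minimal, self-contained sketch of what a ShuffledRDD is: it redistributes a key-value RDD across partitions according to a Partitioner. The object and application names below are illustrative and not taken from any of the projects that follow.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.ShuffledRDD

object ShuffledRDDQuickStart {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("shuffled-rdd-demo"))
    // A key-value RDD whose records we want to regroup by key.
    val pairs = sc.parallelize(1 to 100).map(i => (i % 10, i))
    // ShuffledRDD[K, V, C]: with no aggregator configured, the combiner type C is just V.
    val shuffled = new ShuffledRDD[Int, Int, Int](pairs, new HashPartitioner(4))
    println(shuffled.partitions.length) // 4
    sc.stop()
  }
}

ShuffledRDD is a developer API; everyday code usually reaches it indirectly through operators such as partitionBy or reduceByKey, which set up the shuffle internally. The examples below mostly construct it directly in order to pair it with custom partitioners.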
Example 1
Source File: HBaseShuffledRDD.scala From Heracles with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.spark._
import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition}

import scala.annotation.meta.param

class HBaseShuffledRDD(
    prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])],
    part: Partitioner,
    @(transient @param) hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part) {

  override def getPartitions: Array[Partition] = {
    if (hbPartitions == null || hbPartitions.isEmpty) {
      Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
    } else {
      // only to be invoked by clients
      hbPartitions.toArray
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    if (hbPartitions == null || hbPartitions.isEmpty) {
      Seq.empty
    } else {
      split.asInstanceOf[HBasePartition].server.map {
        identity[String]
      }.toSeq
    }
  }
}
Example 2
Source File: MapDPartitioner.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object MapDPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply[T](origin: RDD[(Int, (T, InternalRow))],
               num_partitions: Int): RDD[(Int, (T, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter =>
        iter.map(row => (row._1, (row._2._1, row._2._2.copy())))
      }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Int, (T, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new MapDPartitioner(num_partitions)
    new ShuffledRDD[Int, (T, InternalRow), (T, InternalRow)](rdd, part)
  }
}

class MapDPartitioner(num_partitions: Int) extends Partitioner {
  def numPartitions: Int = num_partitions

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    require(k >= 0 && k < num_partitions)
    k
  }
}
Example 3
Source File: RangeDPartitioner.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.util.CollectionsUtils
import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

import scala.reflect.ClassTag

object RangeDPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply[K: Ordering: ClassTag, T](origin: RDD[(K, (T, InternalRow))],
                                      range_bounds: Array[K]): RDD[(K, (T, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter =>
        iter.map(row => (row._1, (row._2._1, row._2._2.copy())))
      }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[K, (T, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new RangeDPartitioner(range_bounds, ascending = true)
    new ShuffledRDD[K, (T, InternalRow), (T, InternalRow)](rdd, part)
  }
}

class RangeDPartitioner[K: Ordering: ClassTag](range_bounds: Array[K],
                                               ascending: Boolean) extends Partitioner {
  def numPartitions: Int = range_bounds.length + 1

  private val binarySearch: ((Array[K], K) => Int) = CollectionsUtils.makeBinarySearch[K]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[K]
    var partition = 0
    if (range_bounds.length < 128) {
      while (partition < range_bounds.length && Ordering[K].gt(k, range_bounds(partition)))
        partition += 1
    } else {
      partition = binarySearch(range_bounds, k)
      if (partition < 0) partition = -partition - 1
      if (partition > range_bounds.length) partition = range_bounds.length
    }
    if (ascending) partition else range_bounds.length - partition
  }
}
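As a quick check of the rule above: with range_bounds = Array(10, 20) and ascending = true, the partitioner has 3 partitions; key 5 maps to partition 0, key 15 to partition 1, and key 25 to partition 2, while a key equal to a bound (such as 10) stays in the lower partition because the comparison is strict. With ascending = false the index is mirrored to range_bounds.length - partition.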
Example 4
Source File: VoronoiPartitioner.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.sql.simba.spatial.Point
import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object VoronoiPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply(origin: RDD[(Int, (Point, InternalRow))],
            pivot_to_group: Array[Int],
            num_group: Int): RDD[(Int, (Point, InternalRow))] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter =>
        iter.map(row => (row._1, (row._2._1, row._2._2.copy())))
      }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Int, (Point, InternalRow)]()
        iter.map(row => mutablePair.update(row._1, (row._2._1, row._2._2.copy())))
      }
    }

    val part = new VoronoiPartitioner(pivot_to_group, num_group)
    new ShuffledRDD[Int, (Point, InternalRow), (Point, InternalRow)](rdd, part)
  }
}

class VoronoiPartitioner(pivot_to_group: Array[Int], num_group: Int) extends Partitioner {
  override def numPartitions: Int = num_group

  override def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[Int]
    pivot_to_group(k)
  }
}
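Here the shuffle key is a pivot index and pivot_to_group maps each pivot to the group (partition) owning its Voronoi cell. For example, with pivot_to_group = Array(0, 0, 1, 1) and num_group = 2, records keyed by pivots 0 or 1 land in partition 0 and records keyed by pivots 2 or 3 land in partition 1.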
Example 5
Source File: HashPartitioner.scala From Simba with Apache License 2.0
package org.apache.spark.sql.simba.partitioner

import org.apache.spark.{Partitioner, SparkEnv}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.util.MutablePair

object HashPartition {
  def sortBasedShuffleOn: Boolean = SparkEnv.get.shuffleManager.isInstanceOf[SortShuffleManager]

  def apply(origin: RDD[(Any, InternalRow)], num_partitions: Int): RDD[(Any, InternalRow)] = {
    val rdd = if (sortBasedShuffleOn) {
      origin.mapPartitions { iter =>
        iter.map(row => (row._1, row._2.copy()))
      }
    } else {
      origin.mapPartitions { iter =>
        val mutablePair = new MutablePair[Any, InternalRow]()
        iter.map(row => mutablePair.update(row._1, row._2.copy()))
      }
    }

    val part = new HashPartitioner(num_partitions)
    new ShuffledRDD[Any, InternalRow, InternalRow](rdd, part)
  }
}

class HashPartitioner(num_partitions: Int) extends Partitioner {
  override def numPartitions: Int = num_partitions

  override def getPartition(key: Any): Int = {
    // Note: unlike Spark's built-in HashPartitioner, this does not take a
    // non-negative modulus, so a key with a negative hash code would yield a
    // negative partition index.
    key.hashCode() % num_partitions
  }
}
Example 6
Source File: IDPartitioner.scala From traj-sim-spark with Apache License 2.0
package edu.utah.cs.partitioner

import org.apache.spark.Partitioner
import org.apache.spark.rdd.{RDD, ShuffledRDD}

object IDPartition {
  def apply(origin: RDD[_ <: Product2[Int, Any]], n_part: Int): RDD[_ <: Product2[Int, Any]] = {
    val part = new IDPartitioner(n_part)
    val shuffled = new ShuffledRDD[Int, Any, Any](origin, part)
    shuffled
  }
}

class IDPartitioner(n_part: Int) extends Partitioner {
  override def numPartitions: Int = n_part

  override def getPartition(key: Any): Int = {
    key.asInstanceOf[Int]
  }
}
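A minimal, hypothetical driver (the IDPartitionDemo name is invented for illustration, and the sketch assumes the IDPartition code above is on the classpath) showing the effect when every key is already a target partition id in [0, n_part):

import org.apache.spark.{SparkConf, SparkContext}

import edu.utah.cs.partitioner.IDPartition

object IDPartitionDemo {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(
      new SparkConf().setMaster("local[2]").setAppName("id-partition-demo"))
    // Each key names the partition the record should end up in.
    val tagged = sc.parallelize(Seq((0, "a"), (1, "b"), (2, "c"), (3, "d"), (1, "e")))
    val byId = IDPartition(tagged, n_part = 4)
    // Print every partition's contents to verify the routing.
    byId.glom().collect().zipWithIndex.foreach { case (part, i) =>
      println(s"partition $i: ${part.mkString(", ")}")
    }
    sc.stop()
  }
}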
Example 7
Source File: SortShuffleSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 8
Source File: SortShuffleSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    // Once 'spark.local.dir' is set, it is cached. Unless this is manually cleared
    // before/after a test, it could return the same directory even if this property
    // is configured.
    Utils.clearLocalRootDirs()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
      Utils.clearLocalRootDirs()
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 9
Source File: HBaseShuffledRDD.scala From Spark-SQL-on-HBase with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.spark._
import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition}

class HBaseShuffledRDD(
    prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])],
    part: Partitioner,
    @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part) {

  override def getPartitions: Array[Partition] = {
    if (hbPartitions == null || hbPartitions.isEmpty) {
      Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
    } else {
      // only to be invoked by clients
      hbPartitions.toArray
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    if (hbPartitions == null || hbPartitions.isEmpty) {
      Seq.empty
    } else {
      split.asInstanceOf[HBasePartition].server.map {
        identity[String]
      }.toSeq
    }
  }
}
Example 10
Source File: SortShuffleSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 11
Source File: SortShuffleSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 12
Source File: SortShuffleSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 13
Source File: RoutingTablePartition.scala From graphx-algorithm with GNU General Public License v2.0
package org.apache.spark.graphx.impl

import scala.reflect.ClassTag

import org.apache.spark.Partitioner
import org.apache.spark.rdd.RDD
import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.util.collection.{BitSet, PrimitiveVector}

import org.apache.spark.graphx._
import org.apache.spark.graphx.util.collection.GraphXPrimitiveKeyOpenHashMap

import org.apache.spark.graphx.impl.RoutingTablePartition.RoutingTableMessage

private[graphx] object RoutingTablePartition {
  // Excerpt only: in the full GraphX source this method belongs to the
  // RoutingTablePartition class, and `routingTable` is a field of that class
  // (roughly, one (vertex ids, source bitset, destination bitset) triple per
  // edge partition) that backs the lookup below.
  def foreachWithinEdgePartition(pid: PartitionID, includeSrc: Boolean, includeDst: Boolean)
      (f: VertexId => Unit) {
    val (vidsCandidate, srcVids, dstVids) = routingTable(pid)
    val size = vidsCandidate.length
    if (includeSrc && includeDst) {
      // Avoid checks for performance
      vidsCandidate.iterator.foreach(f)
    } else if (!includeSrc && !includeDst) {
      // Do nothing
    } else {
      val relevantVids = if (includeSrc) srcVids else dstVids
      relevantVids.iterator.foreach { i => f(vidsCandidate(i)) }
    }
  }
}
Example 14
Source File: HBaseShuffledRDD.scala From Backup-Repo with Apache License 2.0
package org.apache.spark.sql.hbase

import org.apache.spark._
import org.apache.spark.rdd.{RDD, ShuffledRDD, ShuffledRDDPartition}

class HBaseShuffledRDD(
    prevRdd: RDD[(HBaseRawType, Array[HBaseRawType])],
    part: Partitioner,
    @transient hbPartitions: Seq[HBasePartition] = Nil) extends ShuffledRDD(prevRdd, part) {

  override def getPartitions: Array[Partition] = {
    if (hbPartitions == null || hbPartitions.isEmpty) {
      Array.tabulate[Partition](part.numPartitions)(i => new ShuffledRDDPartition(i))
    } else {
      // only to be invoked by clients
      hbPartitions.toArray
    }
  }

  override def getPreferredLocations(split: Partition): Seq[String] = {
    if (hbPartitions == null || hbPartitions.isEmpty) {
      Seq.empty
    } else {
      split.asInstanceOf[HBasePartition].server.map {
        identity[String]
      }.toSeq
    }
  }
}
Example 15
Source File: ColumnPartitioner.scala From MatRel with Apache License 2.0
package org.apache.spark.sql.matfast.partitioner

import org.apache.spark.{Partitioner, SparkConf}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.matfast.util.MatfastSerializer

// scalastyle:off
class ColumnPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions")

  override val numPartitions = partitions

  override def getPartition(key: Any): Int = {
    key match {
      case (i: Int, j: Int) => j % partitions
      case (i: Int, j: Int, _: Int) => j % partitions
      case _ => throw new IllegalArgumentException(s"Unrecognized key: $key")
    }
  }

  override def equals(other: Any): Boolean = {
    other.isInstanceOf[ColumnPartitioner] &&
      numPartitions == other.asInstanceOf[ColumnPartitioner].numPartitions
  }

  override def hashCode(): Int = {
    com.google.common.base.Objects.hashCode(partitions: java.lang.Integer)
  }
}
// scalastyle:on

object ColumnPartitioner {
  def apply(origin: RDD[InternalRow], numPartitions: Int): RDD[((Int, Int), InternalRow)] = {
    val rdd = origin.map { row =>
      val rid = row.getInt(0)
      val cid = row.getInt(1)
      val matrix = row.getStruct(2, 7)
      ((rid, cid), matrix)
    }
    val partitioner = new ColumnPartitioner(numPartitions)
    val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner)
    shuffled.setSerializer(new MatfastSerializer(new SparkConf(false)))
    shuffled
  }
}
Example 16
Source File: BlockCyclicPartitioner.scala From MatRel with Apache License 2.0
package org.apache.spark.sql.matfast.partitioner

import org.apache.spark.{Partitioner, SparkConf}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.matfast.util.MatfastSerializer

class BlockCyclicPartitioner(val ROW_BLKS: Int,
                             val COL_BLKS: Int,
                             val ROW_BLKS_PER_PARTITION: Int,
                             val COL_BLKS_PER_PARTITION: Int) extends Partitioner {

  require(ROW_BLKS > 0, s"Number of row blocks should be larger than 0, but found $ROW_BLKS")
  require(COL_BLKS > 0, s"Number of col blocks should be larger than 0, but found $COL_BLKS")
  require(ROW_BLKS_PER_PARTITION > 0,
    s"Number of row blocks per partition should be larger than 0, " +
      s"but found $ROW_BLKS_PER_PARTITION")
  require(COL_BLKS_PER_PARTITION > 0,
    s"Number of col blocks per partition should be larger than 0, " +
      s"but found $COL_BLKS_PER_PARTITION")

  private val row_partition_num = math.ceil(ROW_BLKS * 1.0 / ROW_BLKS_PER_PARTITION).toInt
  private val col_partition_num = math.ceil(COL_BLKS * 1.0 / COL_BLKS_PER_PARTITION).toInt

  private val num_row_part = ROW_BLKS / row_partition_num
  private val num_col_part = COL_BLKS / col_partition_num

  override val numPartitions: Int = row_partition_num * col_partition_num

  override def getPartition(key: Any): Int = {
    key match {
      case (i: Int, j: Int) =>
        ((i % num_row_part) * col_partition_num + (j % num_col_part)) % numPartitions
      case (i: Int, j: Int, _: Int) =>
        ((i % num_row_part) * col_partition_num + (j % num_col_part)) % numPartitions
      case _ => throw new IllegalArgumentException(s"Unrecognized key: $key")
    }
  }

  override def equals(obj: Any): Boolean = {
    obj match {
      case r: BlockCyclicPartitioner =>
        (ROW_BLKS == r.ROW_BLKS) &&
          (COL_BLKS == r.COL_BLKS) &&
          (ROW_BLKS_PER_PARTITION == r.ROW_BLKS_PER_PARTITION) &&
          (COL_BLKS_PER_PARTITION == r.COL_BLKS_PER_PARTITION)
      case _ => false
    }
  }

  override def hashCode(): Int = {
    com.google.common.base.Objects.hashCode(
      ROW_BLKS: java.lang.Integer,
      COL_BLKS: java.lang.Integer,
      ROW_BLKS_PER_PARTITION: java.lang.Integer,
      COL_BLKS_PER_PARTITION: java.lang.Integer
    )
  }
}

object BlockCyclicPartitioner {
  def apply(origin: RDD[InternalRow],
            ROW_BLKS: Int,
            COL_BLKS: Int,
            ROW_BLKS_PER_PARTITION: Int,
            COL_BLKS_PER_PARTITION: Int): RDD[((Int, Int), InternalRow)] = {
    val rdd = origin.map { row =>
      val rid = row.getInt(0)
      val cid = row.getInt(1)
      val matrix = row.getStruct(2, 7)
      ((rid, cid), matrix)
    }
    val partitioner = new BlockCyclicPartitioner(ROW_BLKS, COL_BLKS,
      ROW_BLKS_PER_PARTITION, COL_BLKS_PER_PARTITION)
    val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner)
    shuffled.setSerializer(new MatfastSerializer(new SparkConf(false)))
    shuffled
  }
}
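To make the cyclic arithmetic concrete: with ROW_BLKS = 4, COL_BLKS = 4 and two blocks per partition in each dimension, row_partition_num = col_partition_num = 2, num_row_part = num_col_part = 2 and numPartitions = 4; block key (3, 1) then maps to ((3 % 2) * 2 + (1 % 2)) % 4 = 3, and block key (2, 0) maps to ((2 % 2) * 2 + (0 % 2)) % 4 = 0.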
Example 17
Source File: RowPartitioner.scala From MatRel with Apache License 2.0
package org.apache.spark.sql.matfast.partitioner

import org.apache.spark.{Partitioner, SparkConf}
import org.apache.spark.rdd.{RDD, ShuffledRDD}
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.matfast.util.MatfastSerializer

// scalastyle:off
class RowPartitioner(partitions: Int) extends Partitioner {
  require(partitions >= 0, s"Number of partitions cannot be negative but found $partitions")

  override val numPartitions = partitions

  override def getPartition(key: Any): Int = {
    key match {
      case (i: Int, j: Int) => i % partitions
      case (i: Int, j: Int, _: Int) => i % partitions
      case _ => throw new IllegalArgumentException(s"Unrecognized key: $key")
    }
  }

  override def equals(other: Any): Boolean = {
    other.isInstanceOf[RowPartitioner] &&
      numPartitions == other.asInstanceOf[RowPartitioner].numPartitions
  }

  override def hashCode(): Int = {
    com.google.common.base.Objects.hashCode(partitions: java.lang.Integer)
  }
}

object RowPartitioner {
  def apply(origin: RDD[InternalRow], numPartitions: Int): RDD[((Int, Int), InternalRow)] = {
    val rdd = origin.map { row =>
      val rid = row.getInt(0)
      val cid = row.getInt(1)
      val matrix = row.getStruct(2, 7)
      ((rid, cid), matrix)
    }
    val partitioner = new RowPartitioner(numPartitions)
    val shuffled = new ShuffledRDD[(Int, Int), InternalRow, InternalRow](rdd, partitioner)
    shuffled.setSerializer(new MatfastSerializer(new SparkConf(false)))
    shuffled
  }
}