org.apache.spark.storage.ShuffleBlockId Scala Examples
The following examples show how to use org.apache.spark.storage.ShuffleBlockId.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: SortShuffleWriter.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockManager, ShuffleWriter, BaseShuffleHandle}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockManager: IndexShuffleBlockManager,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Fresh metrics object, installed on the task's metrics while the writer is constructed.
  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`; `None` on a repeated
   *         call or on failure (after this map's output has been removed)
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) {
          Option(mapStatus)
        } else {
          // The map task failed, so its output data is deleted.
          shuffleBlockManager.removeDataByMap(dep.shuffleId, mapId)
          None
        }
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own.
      if (sorter != null) {
        sorter.stop()
        sorter = null
      }
    }
  }
}
Example 2
Source File: SortShuffleWriter.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Fresh metrics object, installed on the task's metrics while the writer is constructed.
  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`; `None` on a repeated
   *         call or on failure (after this map's output has been removed)
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) {
          Option(mapStatus)
        } else {
          // The map task failed, so its output data is deleted.
          shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId)
          None
        }
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own. The time spent
      // stopping it is added to the task's shuffle write time.
      if (sorter != null) {
        val stopStartedAt = System.nanoTime()
        sorter.stop()
        context.taskMetrics.shuffleWriteMetrics.foreach(
          _.incShuffleWriteTime(System.nanoTime - stopStartedAt))
        sorter = null
      }
    }
  }
}

private[spark] object SortShuffleWriter {

  /**
   * Decides whether the merge-sort path can be bypassed for `dep`.
   *
   * @param conf source of `spark.shuffle.sort.bypassMergeThreshold` (default 200)
   * @param dep  the shuffle dependency being written
   * @return `false` whenever map-side combine is requested; otherwise `true` iff the number of
   *         partitions does not exceed the threshold
   */
  def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
    if (!dep.mapSideCombine) {
      val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
      dep.partitioner.numPartitions <= bypassMergeThreshold
    } else {
      // Sorting cannot be bypassed when map-side aggregation is needed.
      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
      false
    }
  }
}
Example 3
Source File: SortShuffleWriter.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark._
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Shuffle write metrics taken from the task context.
  private val writeMetrics = context.taskMetrics().shuffleWriteMetrics

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`, otherwise `None`
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) Option(mapStatus) else None
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own. The time spent
      // stopping it is added to the write time.
      if (sorter != null) {
        val stopStartedAt = System.nanoTime()
        sorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - stopStartedAt)
        sorter = null
      }
    }
  }
}

private[spark] object SortShuffleWriter {

  /**
   * Decides whether the merge-sort path can be bypassed for `dep`.
   *
   * @param conf source of `spark.shuffle.sort.bypassMergeThreshold` (default 200)
   * @param dep  the shuffle dependency being written
   * @return `false` whenever map-side combine is requested; otherwise `true` iff the number of
   *         partitions does not exceed the threshold
   */
  def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
    if (!dep.mapSideCombine) {
      val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
      dep.partitioner.numPartitions <= bypassMergeThreshold
    } else {
      // Sorting cannot be bypassed when map-side aggregation is needed.
      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
      false
    }
  }
}
Example 4
Source File: HashShuffleManagerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockResolver
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

/**
 * Checks that block data served by the hash shuffle manager's `FileShuffleBlockResolver` keeps
 * the file, offset and length recorded when the block was written, even after a later map task
 * writes through the same resolver.
 */
class HashShuffleManagerSuite extends SparkFunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  /**
   * Asserts that `buffer` is a `FileSegmentManagedBuffer` describing the same canonical file,
   * offset and length as `expected`.
   */
  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer): Unit = {
    assert(buffer.isInstanceOf[FileSegmentManagedBuffer])
    val segment = buffer.asInstanceOf[FileSegmentManagedBuffer]
    assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
    assert(expected.offset === segment.getOffset)
    assert(expected.length === segment.getLength)
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // reset after EACH object write. This is to ensure that there are bytes appended after
    // an object is written. So if the codepaths assume writeObject is end of data, this should
    // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockResolver =
      SparkEnv.get.shuffleManager.shuffleBlockResolver.asInstanceOf[FileShuffleBlockResolver]

    val shuffle1 = shuffleBlockResolver.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    shuffle1.writers.foreach { writer =>
      writer.write("test1", "value")
      writer.write("test2", "value")
    }
    shuffle1.writers.foreach { writer =>
      writer.commitAndClose()
    }

    // The segment is read but not compared below; the call is kept as in the original test.
    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockResolver.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)

    shuffle2.writers.foreach { writer =>
      writer.write("test3", "value")
      // NOTE(review): "vlue" looks like a typo for "value"; it is left untouched because it
      // determines the bytes written for this block.
      writer.write("test4", "vlue")
    }
    shuffle2.writers.foreach { writer =>
      writer.commitAndClose()
    }
    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test:
    // Write to shuffle 3 and close it, but before releasing its writers, check that the block
    // data reported for an earlier task (map 2 here) still matches the segment recorded when it
    // was written. Earlier, the length of a block was inferred from the remaining data in the
    // file, which could mess things up when there are concurrent reads and writes happening to
    // the same shuffle group.

    val shuffle3 = shuffleBlockResolver.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    shuffle3.writers.foreach { writer =>
      writer.write("test3", "value")
      writer.write("test4", "value")
    }
    shuffle3.writers.foreach { writer =>
      writer.commitAndClose()
    }
    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockResolver.removeShuffle(1)
  }

  /**
   * Appends `numBytes` characters (the values `0 until numBytes`) to `file`.
   *
   * The writer is closed in a `finally` block so the file handle is released even when a write
   * fails.
   */
  def writeToFile(file: File, numBytes: Int): Unit = {
    val writer = new FileWriter(file, true)
    try {
      for (i <- 0 until numBytes) writer.write(i)
    } finally {
      writer.close()
    }
  }
}
Example 5
Source File: SortShuffleWriter.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriter, BaseShuffleHandle}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Fresh metrics object, installed on the task's metrics while the writer is constructed.
  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`; `None` on a repeated
   *         call or on failure (after this map's output has been removed)
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) {
          Option(mapStatus)
        } else {
          // The map task failed, so its output data is deleted.
          shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId)
          None
        }
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own. The time spent
      // stopping it is added to the task's shuffle write time.
      if (sorter != null) {
        val stopStartedAt = System.nanoTime()
        sorter.stop()
        context.taskMetrics.shuffleWriteMetrics.foreach(
          _.incShuffleWriteTime(System.nanoTime - stopStartedAt))
        sorter = null
      }
    }
  }
}
Example 6
Source File: BlockStoreShuffleFetcher.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.util.{Failure, Success, Try}

import org.apache.spark._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator

private[hash] object BlockStoreShuffleFetcher extends Logging {

  /**
   * Builds an iterator over the records of reduce partition `reduceId` of shuffle `shuffleId`.
   *
   * The map output statuses are looked up, grouped by block manager address, and handed to a
   * `ShuffleBlockFetcherIterator`. A failed fetch of a shuffle block is rethrown as a
   * `FetchFailedException`; a failed fetch of any other block as a `SparkException`.
   *
   * @param shuffleId  id of the shuffle to read
   * @param reduceId   reduce partition to read
   * @param context    task context, used for metrics and for the interruptible wrapper
   * @param serializer serializer passed on to the fetcher iterator
   * @return an interruptible iterator that counts every record it returns
   */
  def fetch[T](
      shuffleId: Int,
      reduceId: Int,
      context: TaskContext,
      serializer: Serializer)
    : Iterator[T] =
  {
    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
    val blockManager = SparkEnv.get.blockManager

    val startTime = System.currentTimeMillis
    val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId)
    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
      shuffleId, reduceId, System.currentTimeMillis - startTime))

    // Group (map index, size) pairs by the address holding them; the position of a status in
    // `statuses` is used as the map index.
    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
    statuses.zipWithIndex.foreach { case ((address, size), index) =>
      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
    }

    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] =
      splitsByAddress.toSeq.map { case (address, splits) =>
        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
      }

    // Turns one fetch result into its record iterator, or throws for a failed fetch.
    def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = {
      val failedBlockId = blockPair._1
      blockPair._2 match {
        case Success(block) =>
          block.asInstanceOf[Iterator[T]]
        case Failure(e) =>
          failedBlockId match {
            case ShuffleBlockId(shufId, mapId, _) =>
              val address = statuses(mapId.toInt)._1
              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
            case _ =>
              throw new SparkException(
                "Failed to get block " + failedBlockId + ", which is not a shuffle block", e)
          }
      }
    }

    val blockFetcherItr = new ShuffleBlockFetcherIterator(
      context,
      SparkEnv.get.blockManager.shuffleClient,
      blockManager,
      blocksByAddress,
      serializer,
      // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility
      SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024)
    val itr = blockFetcherItr.flatMap(unpackBlock)

    // The completion callback updates the task's shuffle read metrics.
    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
      context.taskMetrics.updateShuffleReadMetrics()
    })

    new InterruptibleIterator[T](context, completionIter) {
      val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
      override def next(): T = {
        readMetrics.incRecordsRead(1)
        delegate.next()
      }
    }
  }
}
Example 7
Source File: SortShuffleWriter.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark._
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`; the block manager
 * is looked up through `SparkEnv.get(user)` for the current user name.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val user = Utils.getCurrentUserName

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get(user).blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Shuffle write metrics taken from the task context.
  private val writeMetrics = context.taskMetrics().shuffleWriteMetrics

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`, otherwise `None`
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) Option(mapStatus) else None
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own. The time spent
      // stopping it is added to the write time.
      if (sorter != null) {
        val stopStartedAt = System.nanoTime()
        sorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - stopStartedAt)
        sorter = null
      }
    }
  }
}

private[spark] object SortShuffleWriter {

  /**
   * Decides whether the merge-sort path can be bypassed for `dep`.
   *
   * @param conf source of `spark.shuffle.sort.bypassMergeThreshold` (default 200)
   * @param dep  the shuffle dependency being written
   * @return `false` whenever map-side combine is requested; otherwise `true` iff the number of
   *         partitions does not exceed the threshold
   */
  def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
    if (!dep.mapSideCombine) {
      val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
      dep.partitioner.numPartitions <= bypassMergeThreshold
    } else {
      // Sorting cannot be bypassed when map-side aggregation is needed.
      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
      false
    }
  }
}
Example 8
Source File: HashShuffleManagerSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.scalatest.FunSuite

import org.apache.spark.{SparkEnv, SparkContext, LocalSparkContext, SparkConf}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockManager
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

/**
 * Checks that block data served by the hash shuffle manager's `FileShuffleBlockManager` keeps
 * the file, offset and length recorded when the block was written, even after a later map task
 * writes through the same manager.
 */
class HashShuffleManagerSuite extends FunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  /**
   * Asserts that `buffer` is a `FileSegmentManagedBuffer` describing the same canonical file,
   * offset and length as `expected`.
   */
  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer): Unit = {
    assert(buffer.isInstanceOf[FileSegmentManagedBuffer])
    val segment = buffer.asInstanceOf[FileSegmentManagedBuffer]
    assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
    assert(expected.offset === segment.getOffset)
    assert(expected.length === segment.getLength)
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // reset after EACH object write. This is to ensure that there are bytes appended after
    // an object is written. So if the codepaths assume writeObject is end of data, this should
    // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockManager =
      SparkEnv.get.shuffleManager.shuffleBlockManager.asInstanceOf[FileShuffleBlockManager]

    val shuffle1 = shuffleBlockManager.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    shuffle1.writers.foreach { writer =>
      writer.write("test1")
      writer.write("test2")
    }
    shuffle1.writers.foreach { writer =>
      writer.commitAndClose()
    }

    // The segment is read but not compared below; the call is kept as in the original test.
    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockManager.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)

    shuffle2.writers.foreach { writer =>
      writer.write("test3")
      writer.write("test4")
    }
    shuffle2.writers.foreach { writer =>
      writer.commitAndClose()
    }
    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test:
    // Write to shuffle 3 and close it, but before releasing its writers, check that the block
    // data reported for an earlier task (map 2 here) still matches the segment recorded when it
    // was written. Earlier, the length of a block was inferred from the remaining data in the
    // file, which could mess things up when there are concurrent reads and writes happening to
    // the same shuffle group.

    val shuffle3 = shuffleBlockManager.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    shuffle3.writers.foreach { writer =>
      writer.write("test3")
      writer.write("test4")
    }
    shuffle3.writers.foreach { writer =>
      writer.commitAndClose()
    }
    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockManager.removeShuffle(1)
  }

  /**
   * Appends `numBytes` characters (the values `0 until numBytes`) to `file`.
   *
   * The writer is closed in a `finally` block so the file handle is released even when a write
   * fails.
   */
  def writeToFile(file: File, numBytes: Int): Unit = {
    val writer = new FileWriter(file, true)
    try {
      for (i <- 0 until numBytes) writer.write(i)
    } finally {
      writer.close()
    }
  }
}
Example 9
Source File: NettyBlockTransferSecuritySuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio._
import java.util.concurrent.TimeUnit

import scala.concurrent.duration._
import scala.concurrent.{Await, Promise}
import scala.util.{Failure, Success, Try}

import org.apache.commons.io.IOUtils
import org.apache.spark.network.buffer.{ManagedBuffer, NioManagedBuffer}
import org.apache.spark.network.shuffle.BlockFetchingListener
import org.apache.spark.network.{BlockDataManager, BlockTransferService}
import org.apache.spark.storage.{BlockId, ShuffleBlockId}
import org.apache.spark.{SecurityManager, SparkConf}
import org.mockito.Mockito._
import org.scalatest.mock.MockitoSugar
import org.scalatest.{BeforeAndAfterAll, BeforeAndAfterEach, FunSuite, ShouldMatchers}

/**
 * Connection tests for different combinations of the `spark.authenticate` settings.
 *
 * NOTE(review): every test calls `testConnection`, which is not defined in this excerpt;
 * presumably it lives in the part of the suite that is not shown -- confirm.
 */
class NettyBlockTransferSecuritySuite extends FunSuite with MockitoSugar with ShouldMatchers {

  test("security default off") {
    val conf = new SparkConf()
      .set("spark.app.id", "app-id")
    testConnection(conf, conf) match {
      case Failure(t) => fail(t)
      case Success(_) => // expected
    }
  }

  test("security on same password") {
    val conf = new SparkConf()
      .set("spark.authenticate", "true")
      .set("spark.authenticate.secret", "good")
      .set("spark.app.id", "app-id")
    testConnection(conf, conf) match {
      case Failure(t) => fail(t)
      case Success(_) => // expected
    }
  }

  test("security on mismatch password") {
    val goodSecretConf = new SparkConf()
      .set("spark.authenticate", "true")
      .set("spark.authenticate.secret", "good")
      .set("spark.app.id", "app-id")
    val badSecretConf = goodSecretConf.clone.set("spark.authenticate.secret", "bad")
    testConnection(goodSecretConf, badSecretConf) match {
      case Failure(t) => t.getMessage should include ("Mismatched response")
      case Success(_) => fail("Should have failed")
    }
  }

  test("security mismatch auth off on server") {
    val authOnConf = new SparkConf()
      .set("spark.authenticate", "true")
      .set("spark.authenticate.secret", "good")
      .set("spark.app.id", "app-id")
    val authOffConf = authOnConf.clone.set("spark.authenticate", "false")
    testConnection(authOnConf, authOffConf) match {
      case Failure(t) => // any funny error may occur, server will interpret SASL token as RPC
      case Success(_) => fail("Should have failed")
    }
  }

  test("security mismatch auth off on client") {
    val authOffConf = new SparkConf()
      .set("spark.authenticate", "false")
      .set("spark.authenticate.secret", "good")
      .set("spark.app.id", "app-id")
    val authOnConf = authOffConf.clone.set("spark.authenticate", "true")
    testConnection(authOffConf, authOnConf) match {
      case Failure(t) => t.getMessage should include ("Expected SaslMessage")
      case Success(_) => fail("Should have failed")
    }
  }

  /**
   * Fetches `blockId` through `self` from the host and port of `from`, waiting up to 1000 ms.
   *
   * @return the outcome of the fetch: the retained buffer on success, the reported exception
   *         on failure
   */
  private def fetchBlock(
      self: BlockTransferService,
      from: BlockTransferService,
      execId: String,
      blockId: BlockId): Try[ManagedBuffer] = {

    val promise = Promise[ManagedBuffer]()
    val listener = new BlockFetchingListener {
      override def onBlockFetchFailure(blockId: String, exception: Throwable): Unit = {
        promise.failure(exception)
      }

      override def onBlockFetchSuccess(blockId: String, data: ManagedBuffer): Unit = {
        promise.success(data.retain())
      }
    }

    self.fetchBlocks(from.hostName, from.port, execId, Array(blockId.toString), listener)

    // Await.ready throws on timeout, so the future is completed once it returns.
    Await.ready(promise.future, FiniteDuration(1000, TimeUnit.MILLISECONDS))
    promise.future.value.get
  }
}
Example 10
Source File: SortShuffleWriter.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark._
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Shuffle write metrics taken from the task context.
  private val writeMetrics = context.taskMetrics().shuffleWriteMetrics

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`, otherwise `None`
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) Option(mapStatus) else None
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own. The time spent
      // stopping it is added to the write time.
      if (sorter != null) {
        val stopStartedAt = System.nanoTime()
        sorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - stopStartedAt)
        sorter = null
      }
    }
  }
}

private[spark] object SortShuffleWriter {

  /**
   * Decides whether the merge-sort path can be bypassed for `dep`.
   *
   * @param conf source of `spark.shuffle.sort.bypassMergeThreshold` (default 200)
   * @param dep  the shuffle dependency being written
   * @return `false` whenever map-side combine is requested; otherwise `true` iff the number of
   *         partitions does not exceed the threshold
   */
  def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
    if (!dep.mapSideCombine) {
      val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
      dep.partitioner.numPartitions <= bypassMergeThreshold
    } else {
      // Sorting cannot be bypassed when map-side aggregation is needed.
      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
      false
    }
  }
}
Example 11
Source File: BlockStoreShuffleFetcher.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import scala.collection.mutable.ArrayBuffer
import scala.collection.mutable.HashMap
import scala.util.{Failure, Success, Try}

import org.apache.spark._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.FetchFailedException
import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId}
import org.apache.spark.util.CompletionIterator

private[hash] object BlockStoreShuffleFetcher extends Logging {

  /**
   * Builds an iterator over the records of reduce partition `reduceId` of shuffle `shuffleId`.
   *
   * The map output statuses are looked up, grouped by block manager address, and handed to a
   * `ShuffleBlockFetcherIterator`. A failed fetch of a shuffle block is rethrown as a
   * `FetchFailedException`; a failed fetch of any other block as a `SparkException`.
   *
   * @param shuffleId  id of the shuffle to read
   * @param reduceId   reduce partition to read
   * @param context    task context, used for metrics and for the interruptible wrapper
   * @param serializer serializer passed on to the fetcher iterator
   * @return an interruptible iterator that counts every record it returns
   */
  def fetch[T](
      shuffleId: Int,
      reduceId: Int,
      context: TaskContext,
      serializer: Serializer)
    : Iterator[T] =
  {
    logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId))
    val blockManager = SparkEnv.get.blockManager

    val startTime = System.currentTimeMillis
    val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId)
    logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format(
      shuffleId, reduceId, System.currentTimeMillis - startTime))

    // Group (map index, size) pairs by the address holding them; the position of a status in
    // `statuses` is used as the map index.
    val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]]
    statuses.zipWithIndex.foreach { case ((address, size), index) =>
      splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size))
    }

    val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] =
      splitsByAddress.toSeq.map { case (address, splits) =>
        (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2)))
      }

    // Turns one fetch result into its record iterator, or throws for a failed fetch.
    def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = {
      val failedBlockId = blockPair._1
      blockPair._2 match {
        case Success(block) =>
          block.asInstanceOf[Iterator[T]]
        case Failure(e) =>
          failedBlockId match {
            case ShuffleBlockId(shufId, mapId, _) =>
              val address = statuses(mapId.toInt)._1
              throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e)
            case _ =>
              throw new SparkException(
                "Failed to get block " + failedBlockId + ", which is not a shuffle block", e)
          }
      }
    }

    val blockFetcherItr = new ShuffleBlockFetcherIterator(
      context,
      SparkEnv.get.blockManager.shuffleClient,
      blockManager,
      blocksByAddress,
      serializer,
      // The setting is read as a number of megabytes and converted to bytes.
      SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024)
    val itr = blockFetcherItr.flatMap(unpackBlock)

    // The completion callback updates the task's shuffle read metrics.
    val completionIter = CompletionIterator[T, Iterator[T]](itr, {
      context.taskMetrics.updateShuffleReadMetrics()
    })

    new InterruptibleIterator[T](context, completionIter) {
      val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency()
      override def next(): T = {
        readMetrics.incRecordsRead(1)
        delegate.next()
      }
    }
  }
}
Example 12
Source File: SortShuffleWriter.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark._
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Shuffle write metrics taken from the task context.
  private val writeMetrics = context.taskMetrics().shuffleWriteMetrics

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`, otherwise `None`
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) Option(mapStatus) else None
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own. The time spent
      // stopping it is added to the write time.
      if (sorter != null) {
        val stopStartedAt = System.nanoTime()
        sorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - stopStartedAt)
        sorter = null
      }
    }
  }
}

private[spark] object SortShuffleWriter {

  /**
   * Decides whether the merge-sort path can be bypassed for `dep`.
   *
   * @param conf source of `spark.shuffle.sort.bypassMergeThreshold` (default 200)
   * @param dep  the shuffle dependency being written
   * @return `false` whenever map-side combine is requested; otherwise `true` iff the number of
   *         partitions does not exceed the threshold
   */
  def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
    if (!dep.mapSideCombine) {
      val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
      dep.partitioner.numPartitions <= bypassMergeThreshold
    } else {
      // Sorting cannot be bypassed when map-side aggregation is needed.
      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
      false
    }
  }
}
Example 13
Source File: RemoteShuffleWriter.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.remote

import org.apache.spark._
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.RemoteSorter

/**
 * Shuffle writer for map task `mapId` of the shuffle described by `handle`, backed by a
 * `RemoteSorter`. A warning is logged every time an instance is constructed.
 *
 * Only construction and shutdown are visible in this excerpt.
 * NOTE(review): `sorter` and `mapStatus` are never assigned here; presumably a `write`
 * implementation that is not shown sets them -- confirm against the full source.
 */
private[spark] class RemoteShuffleWriter[K, V, C](
    resolver: RemoteShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Long,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  logWarning("******** General Remote Shuffle Writer is used ********")

  // Resolved on first use only.
  private lazy val fs = RemoteShuffleManager.getFileSystem

  private val blockManager = SparkEnv.get.blockManager

  private val dep = handle.dependency

  private var sorter: RemoteSorter[K, V, _] = null

  // Set on the first stop() call. A map task may call stop(success = true) and then, after an
  // exception, stop(success = false); the flag keeps the cleanup from running twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Shuffle write metrics taken from the task context.
  private val writeMetrics = context.taskMetrics().shuffleWriteMetrics

  /**
   * Stops the writer.
   *
   * @param success whether the map task finished successfully
   * @return `Option(mapStatus)` on the first call with `success = true`, otherwise `None`
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) Option(mapStatus) else None
      }
    } finally {
      // Runs on every call: the sorter may hold intermediate files of its own. The time spent
      // stopping it is added to the write time.
      if (sorter != null) {
        val stopStartedAt = System.nanoTime()
        sorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - stopStartedAt)
        sorter = null
      }
    }
  }
}
Example 14
Source File: MyNettyBlockRpcServer.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.network.netty

import java.nio.ByteBuffer

import scala.language.existentials

import org.apache.spark.SparkEnv
import org.apache.spark.internal.Logging
import org.apache.spark.network.BlockDataManager
import org.apache.spark.network.client.{RpcResponseCallback, StreamCallbackWithID, TransportClient}
import org.apache.spark.network.server.{OneForOneStreamManager, RpcHandler, StreamManager}
import org.apache.spark.network.shuffle.protocol._
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle.remote.{HadoopFileSegmentManagedBuffer, MessageForHadoopManagedBuffers, RemoteShuffleManager}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.storage.{BlockId, ShuffleBlockId}

/**
 * RPC handler that answers `OpenBlocks` requests for shuffle blocks when the configured shuffle
 * manager is [[RemoteShuffleManager]]. Every other request is rejected with an
 * `UnsupportedOperationException`.
 */
class MyNettyBlockRpcServer(
    appId: String,
    serializer: Serializer,
    blockManager: BlockDataManager)
  extends RpcHandler with Logging {

  private val streamManager = new OneForOneStreamManager()

  /**
   * Decodes the incoming message and, for a remote-shuffle `OpenBlocks` request, replies with a
   * [[MessageForHadoopManagedBuffers]] pairing every requested block id with its buffer.
   *
   * @param client          the client that sent the request (unused here)
   * @param rpcMessage      serialized `BlockTransferMessage`
   * @param responseContext callback used to send the successful response
   * @throws UnsupportedOperationException for any request other than a remote-shuffle
   *                                       `OpenBlocks`
   */
  override def receive(
      client: TransportClient,
      rpcMessage: ByteBuffer,
      responseContext: RpcResponseCallback): Unit = {
    val message = BlockTransferMessage.Decoder.fromByteBuffer(rpcMessage)
    logTrace(s"Received request: $message")

    message match {
      case openBlocks: OpenBlocks =>
        // A request qualifies only if it is non-empty, its first block is a shuffle block, and
        // the remote shuffle manager is the configured one. The checks short-circuit in that
        // order, so the configuration is only consulted for shuffle blocks.
        val isShuffleRequest = openBlocks.blockIds.headOption.exists { firstBlock =>
          val firstIsShuffleBlock = BlockId.apply(firstBlock) match {
            case _: ShuffleBlockId => true
            case _ => false
          }
          firstIsShuffleBlock &&
            (SparkEnv.get.conf.get("spark.shuffle.manager", classOf[SortShuffleManager].getName)
              == classOf[RemoteShuffleManager].getName)
        }
        if (isShuffleRequest) {
          val blockIdAndManagedBufferPair = openBlocks.blockIds.map(block =>
            (block, blockManager.getHostLocalShuffleData(
              BlockId.apply(block), Array.empty).asInstanceOf[HadoopFileSegmentManagedBuffer]))
          responseContext.onSuccess(new MessageForHadoopManagedBuffers(
            blockIdAndManagedBufferPair).toByteBuffer.nioBuffer())
        } else {
          // This customized Netty RPC server only serves RemoteShuffle requests. Other RPC
          // messages or data chunk transfers should go through NettyBlockTransferService's
          // NettyBlockRpcServer.
          throw new UnsupportedOperationException("MyNettyBlockRpcServer only serves remote" +
            " shuffle requests for OpenBlocks")
        }

      case _: UploadBlock =>
        throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't serve UploadBlock")

      case other =>
        // Without this case any other decoded message type would escape as a bare MatchError;
        // reject it the same way as the other unsupported requests.
        throw new UnsupportedOperationException(
          s"MyNettyBlockRpcServer doesn't serve message: $other")
    }
  }

  /**
   * Stream uploads are not supported by this server.
   *
   * @throws UnsupportedOperationException always
   */
  override def receiveStream(
      client: TransportClient,
      messageHeader: ByteBuffer,
      responseContext: RpcResponseCallback): StreamCallbackWithID = {
    throw new UnsupportedOperationException("MyNettyBlockRpcServer doesn't support receiving" +
      " stream")
  }

  /** Returns the stream manager created together with this handler. */
  override def getStreamManager(): StreamManager = streamManager
}
Example 15
Source File: SplashShuffleFetcherIteratorTest.scala From splash with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle

import com.memverge.splash.StorageFactoryHolder
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.{ShuffleBlockId, TestBlockId}
import org.assertj.core.api.Assertions.assertThat
import org.assertj.core.api.Fail.fail
import org.testng.annotations.{AfterMethod, BeforeMethod, Test}

/** TestNG tests for `SplashShuffleFetcherIterator`. */
@Test(groups = Array("UnitTest", "IntegrationTest"))
class SplashShuffleFetcherIteratorTest {
  private val appId = "SplashShuffleFetcherIteratorTest"
  private val factory = StorageFactoryHolder.getFactory
  private var resolver: SplashShuffleBlockResolver = _

  /** Gives every test a fresh resolver. */
  @BeforeMethod
  private def beforeMethod(): Unit = {
    resolver = new SplashShuffleBlockResolver(appId)
  }

  /** Resets the storage factory and checks that no shuffle or temp files remain. */
  @AfterMethod
  private def afterMethod(): Unit = {
    factory.reset()
    assertThat(factory.getShuffleFileCount(appId)).isEqualTo(0)
    assertThat(factory.getTmpFileCount).isEqualTo(0)
  }

  /** Two stored shuffle blocks come back in order, each with the expected id and length. */
  def testNext(): Unit = {
    val storedBlocks = List(
      resolver.putShuffleBlock(2, 1, Array(10L, 20L, 30L)),
      resolver.putShuffleBlock(2, 2, Array(30L, 15L, 22L)))
    val fetcherIter = SplashShuffleFetcherIterator(resolver, storedBlocks.iterator)
    assertThat(fetcherIter.hasNext).isTrue()

    val first = fetcherIter.next()
    assertThat(first.blockId).isEqualTo(ShuffleBlockId(2, 1, 0))
    assertThat(first.length).isEqualTo(10)
    first.close()

    val second = fetcherIter.next()
    assertThat(second.blockId).isEqualTo(ShuffleBlockId(2, 2, 0))
    assertThat(second.length).isEqualTo(30)
    second.close()
  }

  /** Reading through the metric iterator must fail, after which a dump file must exist. */
  def testDumpOnError(): Unit = {
    val serializer = TestUtil.kryoSerializer
    val storedBlocks = List(
      resolver.putShuffleBlock(3, 1, Array(10L, 20L, 30L)),
      resolver.putShuffleBlock(3, 2, Array(30L, 15L, 22L)))
    val fetcherIter = SplashShuffleFetcherIterator(resolver, storedBlocks.iterator)
    val records = fetcherIter.flatMap(_.asMetricIterator(serializer, TaskMetrics.empty))
    try {
      records.next()
      fail("should have raised an exception.")
    } catch {
      case _: Exception =>
        val dumpPath = resolver.getDumpFilePath(ShuffleBlockId(3, 2, 0))
        assertThat(dumpPath.toFile.exists()).isTrue()
    }
  }

  /** A list holding only a non-shuffle block yields no fetchers. */
  def testNoNextValue(): Unit = {
    val onlyTestBlock = List(TestBlockId("block-1"))
    val fetcherIter = SplashShuffleFetcherIterator(resolver, onlyTestBlock.iterator)
    assertThat(fetcherIter.hasNext).isFalse()
  }

  /** Non-shuffle blocks are skipped; only the single shuffle block produces a fetcher. */
  def testSkipNonShuffleBlocks(): Unit = {
    val mixedBlocks = List(
      TestBlockId("block-1"),
      TestBlockId("block-2"),
      resolver.putShuffleBlock(4, 2, Array(30L, 15L, 22L)))
    val fetchers = SplashShuffleFetcherIterator(resolver, mixedBlocks.iterator).toArray
    assertThat(fetchers.length).isEqualTo(1)
    for (fetcher <- fetchers) {
      fetcher.close()
    }
  }
}
Example 16
Source File: SplashShuffleWriter.scala From splash with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle

import org.apache.spark.TaskContext
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.storage.ShuffleBlockId

  // NOTE(review): the header of the enclosing class is not part of this excerpt; the final
  // closing brace below belongs to that class.

  /**
   * Stops this writer.
   *
   * @param success whether the map task completed successfully
   * @return a `MapStatus` built from the resolver's block manager id and the partition lengths
   *         on the first call with `success = true`; `None` on failure or on repeated calls
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      val firstCall = !stopping
      stopping = true
      if (firstCall && success) {
        Option(MapStatus(resolver.blockManagerId, partitionLengths))
      } else {
        None
      }
    } finally {
      // Stop the sorter, if any, and add the time this took to the write metrics.
      Option(sorter).foreach { activeSorter =>
        val cleanupStart = System.nanoTime
        activeSorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - cleanupStart)
        sorter = null
      }
    }
  }
}
Example 17
Source File: FutureTaskWaiter.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.internal.Logging
import org.apache.spark.MapOutputTracker
import org.apache.spark.SparkConf
import org.apache.spark.storage.BlockManager
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.TimeStampedHashMap

/**
 * Description of a task that must not start until its map outputs are available.
 *
 * @param shuffleId         shuffle the task reads from
 * @param numMaps           number of map outputs; ids `0 until numMaps` are awaited when
 *                          `nonZeroPartitions` is empty
 * @param reduceId          reduce partition handled by the task
 * @param taskId            id of the task
 * @param nonZeroPartitions if defined, the ids to wait for instead of `0 until numMaps`
 * @param taskCb            callback invoked once everything awaited is available
 */
private[spark] case class FutureTaskInfo(
    shuffleId: Int,
    numMaps: Int,
    reduceId: Int,
    taskId: Long,
    nonZeroPartitions: Option[Array[Int]],
    taskCb: () => Unit)

/**
 * Tracks future tasks per (shuffleId, reduceId) and fires their callbacks when all awaited map
 * outputs have been reported. All state changes happen while holding the monitor of
 * `futureTasksBlockWait`.
 */
private[spark] class FutureTaskWaiter(
    conf: SparkConf,
    blockManager: BlockManager,
    mapOutputTracker: MapOutputTracker) extends Logging {

  // Key is (shuffleId, reduceId)
  private val futureTaskInfo = new TimeStampedHashMap[(Int, Int), FutureTaskInfo]

  // Key is (shuffleId, reduceId), value is the set of blockIds we are waiting for
  private val futureTasksBlockWait = new TimeStampedHashMap[(Int, Int), HashSet[Int]]

  /**
   * Registers a future task. If everything it waits for is already registered with the
   * `MapOutputTracker`, its callback runs immediately; otherwise the task and its still-missing
   * ids are recorded until `shuffleBlockReady` reports them.
   *
   * @param info the task to register
   */
  def submitFutureTask(info: FutureTaskInfo): Unit = {
    futureTasksBlockWait.synchronized {
      val blocksToWaitFor: Set[Int] = info.nonZeroPartitions match {
        case Some(partitions) => partitions.toSet
        case None => (0 until info.numMaps).toSet
      }

      // Check if all the blocks already exist. If so just trigger taskCb.
      // Take the outputs registered with the MapOutputTracker for this shuffle and intersect
      // them with blocksToWaitFor to get the ones relevant for this reduce.
      val availableBlocks =
        mapOutputTracker.getAvailableMapOutputs(info.shuffleId).intersect(blocksToWaitFor)

      if (availableBlocks.size >= blocksToWaitFor.size) {
        info.taskCb()
      } else {
        val key = (info.shuffleId, info.reduceId)
        futureTaskInfo.put(key, info)
        // Record the blocks that still have to arrive before the future task may start.
        val waitForBlocks = blocksToWaitFor.diff(availableBlocks)
        futureTasksBlockWait.put(key, new HashSet[Int]() ++ waitForBlocks)
      }
    }
  }

  /**
   * Marks one shuffle block as available. When this was the last block a registered future task
   * was waiting for, the task is unregistered and its callback is invoked.
   *
   * @param shuffleBlockId the block that became available
   */
  def shuffleBlockReady(shuffleBlockId: ShuffleBlockId): Unit = {
    val key = (shuffleBlockId.shuffleId, shuffleBlockId.reduceId)
    futureTasksBlockWait.synchronized {
      if (futureTaskInfo.contains(key) && futureTasksBlockWait.contains(key)) {
        val pendingBlocks = futureTasksBlockWait(key)
        pendingBlocks -= shuffleBlockId.mapId
        // If we have all the blocks, run the callback.
        if (pendingBlocks.isEmpty) {
          val cb = futureTaskInfo(key).taskCb
          futureTasksBlockWait.remove(key)
          futureTaskInfo.remove(key)
          cb()
        }
      }
    }
  }

  /**
   * Records a finished map output with the `MapOutputTracker` and then reports the matching
   * shuffle block as ready for every reduce id in `0 until numReduces`.
   *
   * @param shuffleId  shuffle the map output belongs to
   * @param mapId      id of the finished map task
   * @param numReduces number of reduce partitions to notify
   * @param mapStatus  status of the finished map task
   */
  def addMapStatusAvailable(
      shuffleId: Int,
      mapId: Int,
      numReduces: Int,
      mapStatus: MapStatus): Unit = {
    // NOTE: This should be done before we trigger future tasks.
    mapOutputTracker.addStatus(shuffleId, mapId, mapStatus)
    futureTasksBlockWait.synchronized {
      // Register the output for each reduce task.
      (0 until numReduces).foreach { reduceId =>
        shuffleBlockReady(ShuffleBlockId(shuffleId, mapId, reduceId))
      }
    }
  }
}
Example 18
Source File: FutureTaskNotifier.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.storage.StorageLevel

/** Notifies the locations of next-stage tasks that a map task's output is ready. */
private[spark] object FutureTaskNotifier extends Logging {

  /**
   * Called when a map task finishes. If next-stage locations are known and there is exactly one
   * per reduce partition, the map status is pushed to them and the time spent is added to the
   * shuffle write metrics; otherwise only an info message is logged.
   *
   * @param status                    status of the finished map task
   * @param mapId                     id of the finished map task
   * @param shuffleId                 shuffle the output belongs to
   * @param numReduces                number of reduce partitions
   * @param nextStageLocs             locations of the next-stage tasks, if known
   * @param shuffleWriteMetrics       metrics that receive the notification time
   * @param skipZeroByteNotifications whether locations whose block size is zero are skipped
   */
  def taskCompleted(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      shuffleWriteMetrics: ShuffleWriteMetrics,
      skipZeroByteNotifications: Boolean): Unit = {
    nextStageLocs match {
      case Some(locs) if numReduces == locs.length =>
        val drizzleRpcsStart = System.nanoTime
        sendMapStatusToNextTaskLocations(status, mapId, shuffleId, numReduces, nextStageLocs,
          skipZeroByteNotifications)
        shuffleWriteMetrics.incWriteTime(System.nanoTime - drizzleRpcsStart)
      case _ =>
        logInfo(
          s"No taskCompletion next: ${nextStageLocs.map(_.length).getOrElse(0)} r: $numReduces")
    }
  }

  /**
   * Pushes metadata saying that this map task finished, so that the tasks in the next stage
   * know they can begin pulling the data. A failure to reach one location is logged and does
   * not stop the remaining notifications. Nothing is sent when `nextStageLocs` is empty.
   *
   * The only call site in this object guarantees `numReduces == nextStageLocs.get.length`, so
   * the parameter is used directly instead of being re-derived into a shadowing local.
   */
  private def sendMapStatusToNextTaskLocations(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      skipZeroByteNotifications: Boolean): Unit = {
    val locs = nextStageLocs.getOrElse(Seq.empty)
    val uniqueLocations =
      if (skipZeroByteNotifications) {
        // The position of a location in the sequence is the reduce id whose size is checked.
        locs.zipWithIndex.collect {
          case (loc, reduceId) if status.getSizeForBlock(reduceId) != 0L => loc
        }.toSet
      } else {
        locs.toSet
      }

    uniqueLocations.foreach { blockManagerId =>
      try {
        SparkEnv.get.blockManager.blockTransferService.mapOutputReady(
          blockManagerId.host,
          blockManagerId.port,
          shuffleId,
          mapId,
          numReduces,
          status)
      } catch {
        case e: Exception =>
          logWarning(s"Failed to send map outputs to $blockManagerId", e)
      }
    }
  }
}