org.apache.spark.executor.ShuffleWriteMetrics Scala Examples
The following examples show how to use org.apache.spark.executor.ShuffleWriteMetrics.
You can vote up the ones you like or vote down the ones you don't like,
and go to the original project or source file by following the links above each example.
Example 1
Source File: FutureTaskNotifier.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.storage.StorageLevel

/**
 * Notifies the block managers that host next-stage tasks once a map task has completed.
 */
private[spark] object FutureTaskNotifier extends Logging {

  /**
   * Sends the map status to the next-stage locations, provided those locations are known and
   * there is exactly one location per reducer; otherwise only logs that nothing was sent.
   *
   * The time spent sending is added to `shuffleWriteMetrics` via `incWriteTime`.
   *
   * @param status map status handed to every notified location
   * @param mapId id of the finished map task
   * @param shuffleId id of the shuffle the map task wrote to
   * @param numReduces expected number of reducers; must equal the number of locations
   * @param nextStageLocs locations of the next-stage tasks, indexed by reducer, if known
   * @param shuffleWriteMetrics metrics object that receives the elapsed notification time
   * @param skipZeroByteNotifications when true, locations whose block size is 0 are not notified
   */
  def taskCompleted(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      shuffleWriteMetrics: ShuffleWriteMetrics,
      skipZeroByteNotifications: Boolean): Unit = {
    nextStageLocs match {
      case Some(locs) if numReduces == locs.length =>
        val drizzleRpcsStart = System.nanoTime
        sendMapStatusToNextTaskLocations(status, mapId, shuffleId, numReduces, nextStageLocs,
          skipZeroByteNotifications)
        shuffleWriteMetrics.incWriteTime(System.nanoTime - drizzleRpcsStart)
      case _ =>
        logInfo(
          s"No taskCompletion next: ${nextStageLocs.map(_.length).getOrElse(0)} r: $numReduces")
    }
  }

  /**
   * Push metadata saying that this map task finished, so that the tasks in the next stage
   * know they can begin pulling the data.
   *
   * Each distinct location is notified at most once. A failure to notify one location is logged
   * as a warning and does not prevent the remaining locations from being notified. Does nothing
   * when `nextStageLocs` is empty.
   *
   * @param numReduces kept for signature compatibility; the reducer count that is sent is the
   *                   number of entries in `nextStageLocs`, exactly as before
   */
  private def sendMapStatusToNextTaskLocations(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      skipZeroByteNotifications: Boolean): Unit = {
    nextStageLocs.foreach { locs =>
      // Previously a local `numReduces` shadowed the parameter; it gets its own name here.
      val reducerCount = locs.length
      val uniqueLocations =
        if (skipZeroByteNotifications) {
          // Keep only the locations whose reducer index has a non-empty block in `status`.
          locs.zipWithIndex.collect {
            case (loc, reduceId) if status.getSizeForBlock(reduceId) != 0L => loc
          }.toSet
        } else {
          locs.toSet
        }
      uniqueLocations.foreach { blockManagerId =>
        try {
          SparkEnv.get.blockManager.blockTransferService.mapOutputReady(
            blockManagerId.host, blockManagerId.port, shuffleId, mapId, reducerCount, status)
        } catch {
          case e: Exception =>
            logWarning(s"Failed to send map outputs to $blockManagerId", e)
        }
      }
    }
  }
}
Example 2
Source File: SplashShuffleWriter.scala From splash with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle

import org.apache.spark.TaskContext
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.storage.ShuffleBlockId

  /**
   * Stops this writer.
   *
   * Returns a `MapStatus` built from `resolver.blockManagerId` and `partitionLengths` only on
   * the first call and only when `success` is true; every other call returns `None`.
   * Regardless of the outcome, a non-null `sorter` is stopped, the time that takes is added to
   * `writeMetrics`, and the reference is cleared.
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      val alreadyStopping = stopping
      if (!alreadyStopping) {
        stopping = true
      }
      if (alreadyStopping || !success) {
        None
      } else {
        Option(MapStatus(resolver.blockManagerId, partitionLengths))
      }
    } finally {
      Option(sorter).foreach { activeSorter =>
        val stopBegan = System.nanoTime
        activeSorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - stopBegan)
        sorter = null
      }
    }
  }
}
Example 3
Source File: RemoteShuffleUtils.scala From OAP with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.remote

import java.util.UUID

import org.apache.hadoop.fs.Path

import org.apache.spark.SparkEnv
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.{SerializerInstance, SerializerManager}
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter
import org.apache.spark.storage.{BlockId, TempLocalBlockId, TempShuffleBlockId}

/** Helpers for creating the writers used by the remote shuffle code. */
object RemoteShuffleUtils {

  /** The value of `SparkEnv.get` at the moment this object is initialized. */
  val env: SparkEnv = SparkEnv.get

  /**
   * Builds a [[RemoteBlockObjectWriter]] for `blockId` that writes to `file`.
   *
   * @param blockId block the writer is created for
   * @param file destination path
   * @param serializerManager passed through to the writer
   * @param serializerInstance passed through to the writer
   * @param bufferSize passed through to the writer
   * @param writeMetrics reporter passed through to the writer
   */
  def getRemoteWriter(
      blockId: BlockId,
      file: Path,
      serializerManager: SerializerManager,
      serializerInstance: SerializerInstance,
      bufferSize: Int,
      writeMetrics: ShuffleWriteMetricsReporter): RemoteBlockObjectWriter = {
    // Sync-on-write is fixed to false; the configuration lookup is left disabled:
    // env.blockManager.conf.getBoolean("spark.shuffle.sync", false)
    val syncWrites = false
    new RemoteBlockObjectWriter(
      file, serializerManager, serializerInstance, bufferSize, syncWrites, writeMetrics, blockId)
  }
}
Example 4
Source File: HashShuffleWriter.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle._
import org.apache.spark.storage.BlockObjectWriter

/**
 * Shuffle writer that obtains its per-partition writers from the given
 * `FileShuffleBlockManager` and, when stopped, either commits or reverts them.
 */
private[spark] class HashShuffleWriter[K, V](
    shuffleBlockManager: FileShuffleBlockManager,
    handle: BaseShuffleHandle[K, V, _],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency
  private val numOutputSplits = dep.partitioner.numPartitions
  private val metrics = context.taskMetrics

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  // Register a fresh metrics object on the task before any writer is created.
  private val writeMetrics = new ShuffleWriteMetrics()
  metrics.shuffleWriteMetrics = Some(writeMetrics)

  private val blockManager = SparkEnv.get.blockManager
  private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null))
  private val shuffle =
    shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser, writeMetrics)

  /**
   * Stops the writer. On the first call, commits the writes and returns their status when
   * `initiallySuccess` is true, or reverts them and returns `None` otherwise. Later calls
   * return `None`. If committing throws, the writes are reverted and the exception rethrown.
   * The writers are always released afterwards with the final success flag.
   */
  override def stop(initiallySuccess: Boolean): Option[MapStatus] = {
    var success = initiallySuccess
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (!success) {
          revertWrites()
          None
        } else {
          try {
            Some(commitWritesAndBuildStatus())
          } catch {
            case e: Exception =>
              success = false
              revertWrites()
              throw e
          }
        }
      }
    } finally {
      // Release the writers back to the shuffle block manager.
      if (shuffle != null && shuffle.writers != null) {
        try {
          shuffle.releaseWriters(success)
        } catch {
          case e: Exception => logError("Failed to release shuffle writers", e)
        }
      }
    }
  }

  /** Commits every writer and builds a `MapStatus` from the resulting segment lengths. */
  private def commitWritesAndBuildStatus(): MapStatus = {
    // Commit the writes. Get the size of each bucket block (total block size).
    val blockSizes: Array[Long] = shuffle.writers.map { bucketWriter =>
      bucketWriter.commitAndClose()
      bucketWriter.fileSegment().length
    }
    MapStatus(blockManager.shuffleServerId, blockSizes)
  }

  /** Reverts and closes every writer, if the writer group exists. */
  private def revertWrites(): Unit = {
    if (shuffle != null && shuffle.writers != null) {
      shuffle.writers.foreach(_.revertPartialWritesAndClose())
    }
  }
}
Example 5
Source File: SortShuffleWriter.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockManager, ShuffleWriter, BaseShuffleHandle}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer backed by an `ExternalSorter`. When stopped after a failure, it asks the
 * `IndexShuffleBlockManager` to remove the data for this map task.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockManager: IndexShuffleBlockManager,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Register a fresh metrics object on the task.
  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  /**
   * Stops the writer. The first call returns `Option(mapStatus)` when `success` is true, or
   * removes this map task's data and returns `None` otherwise. Later calls return `None`.
   * A non-null sorter is always stopped and cleared afterwards.
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) {
          Option(mapStatus)
        } else {
          // The map task failed, so delete our output data.
          shuffleBlockManager.removeDataByMap(dep.shuffleId, mapId)
          None
        }
      }
    } finally {
      // Clean up our sorter, which may have its own intermediate files
      Option(sorter).foreach { activeSorter =>
        activeSorter.stop()
        sorter = null
      }
    }
  }
}
Example 6
Source File: HashShuffleManagerSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.scalatest.FunSuite

import org.apache.spark.{SparkEnv, SparkContext, LocalSparkContext, SparkConf}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockManager
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

/** Tests for writing to a consolidated shuffle group through `FileShuffleBlockManager`. */
class HashShuffleManagerSuite extends FunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  /**
   * Asserts that `buffer` is a `FileSegmentManagedBuffer` describing the same file (by
   * canonical path), offset and length as `expected`. Fails the test for any other buffer.
   */
  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer): Unit = {
    buffer match {
      case segment: FileSegmentManagedBuffer =>
        assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
        assert(expected.offset === segment.getOffset)
        assert(expected.length === segment.getLength)
      case other =>
        fail(s"Expected a FileSegmentManagedBuffer but got: $other")
    }
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // reset after EACH object write. This is to ensure that there are bytes appended after
    // an object is written. So if the codepaths assume writeObject is end of data, this should
    // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockManager =
      SparkEnv.get.shuffleManager.shuffleBlockManager.asInstanceOf[FileShuffleBlockManager]

    val shuffle1 = shuffleBlockManager.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle1.writers) {
      writer.write("test1")
      writer.write("test2")
    }
    for (writer <- shuffle1.writers) {
      writer.commitAndClose()
    }

    // NOTE(review): this segment is captured but never asserted on below — confirm whether a
    // check against ShuffleBlockId(1, 1, 0) was intended.
    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockManager.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle2.writers) {
      writer.write("test3")
      writer.write("test4")
    }
    for (writer <- shuffle2.writers) {
      writer.commitAndClose()
    }

    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test :
    // Write to shuffle 3; and close it, but before registering it, check that the segment
    // recorded for the previous task (shuffle2) is still what the block manager reports.
    // Earlier, we were inferring length of block based on remaining data in file : which could
    // mess things up when there is concurrent read and writes happening to the same shuffle
    // group.
    val shuffle3 = shuffleBlockManager.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle3.writers) {
      writer.write("test3")
      writer.write("test4")
    }
    for (writer <- shuffle3.writers) {
      writer.commitAndClose()
    }

    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    // ... and again after shuffle3's writers have been released.
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockManager.removeShuffle(1)
  }

  /**
   * Appends `numBytes` characters to `file`, the i-th having character code `i`. The writer
   * is closed even if a write fails.
   */
  def writeToFile(file: File, numBytes: Int): Unit = {
    val writer = new FileWriter(file, true)
    try {
      for (i <- 0 until numBytes) writer.write(i)
    } finally {
      writer.close()
    }
  }
}
Example 7
Source File: BlockObjectWriterSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.scalatest.FunSuite

import java.io.File

import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.SparkConf

/** Tests for the write metrics and lifecycle of `DiskBlockObjectWriter`. */
class BlockObjectWriterSuite extends FunSuite {

  /**
   * Creates a fresh, empty, uniquely named temporary file that is deleted on JVM exit.
   * Using a unique file per test keeps tests from seeing each other's output, which a fixed
   * "somefile" in the working directory did not guarantee.
   */
  private def newTempFile(): File = {
    val file = File.createTempFile("somefile", ".tmp")
    file.deleteOnExit()
    file
  }

  /** Builds the writer under test for `file`, reporting into `writeMetrics`. */
  private def newWriter(file: File, writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
    new DiskBlockObjectWriter(new TestBlockId("0"), file, new JavaSerializer(new SparkConf()),
      1024, os => os, true, writeMetrics)
  }

  /**
   * Writes 33 records through `writer`, asserting after the first one and after the last one
   * that `writeMetrics` reports the expected record and byte counts.
   */
  private def writeRecordsAndCheckMetrics(
      writer: DiskBlockObjectWriter,
      writeMetrics: ShuffleWriteMetrics): Unit = {
    writer.write(Long.box(20))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten === 0)

    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
  }

  test("verify write metrics") {
    val file = newTempFile()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = newWriter(file, writeMetrics)

    writeRecordsAndCheckMetrics(writer, writeMetrics)
    writer.commitAndClose()
    assert(file.length() === writeMetrics.shuffleBytesWritten)
  }

  test("verify write metrics on revert") {
    val file = newTempFile()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = newWriter(file, writeMetrics)

    writeRecordsAndCheckMetrics(writer, writeMetrics)
    writer.revertPartialWritesAndClose()
    assert(writeMetrics.shuffleBytesWritten === 0)
    assert(writeMetrics.shuffleRecordsWritten === 0)
  }

  test("Reopening a closed block writer") {
    val file = newTempFile()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = newWriter(file, writeMetrics)

    writer.open()
    writer.close()
    intercept[IllegalStateException] {
      writer.open()
    }
  }
}
Example 8
Source File: StoragePerfTester.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.tools import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.shuffle.hash.HashShuffleManager import org.apache.spark.util.Utils val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500) val recordLength = 1000 // ~1KB records val totalRecords = dataSizeMb * 1000 val recordsPerMap = totalRecords / numMaps val writeKey = "1" * (recordLength / 2) val writeValue = "1" * (recordLength / 2) val executor = Executors.newFixedThreadPool(numMaps) val conf = new SparkConf() .set("spark.shuffle.compress", "false") .set("spark.shuffle.sync", "true") .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") // This is only used to instantiate a BlockManager. All thread scheduling is done manually. val sc = new SparkContext("local[4]", "Write Tester", conf) val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager] def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = { val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits, new KryoSerializer(sc.conf), new ShuffleWriteMetrics()) val writers = shuffle.writers for (i <- 1 to recordsPerMap) { writers(i % numOutputSplits).write(writeKey, writeValue) } writers.map { w => w.commitAndClose() total.addAndGet(w.fileSegment().length) } shuffle.releaseWriters(true) } val start = System.currentTimeMillis() val latch = new CountDownLatch(numMaps) val totalBytes = new AtomicLong() for (task <- 1 to numMaps) { executor.submit(new Runnable() { override def run(): Unit = { try { writeOutputBytes(task, totalBytes) latch.countDown() } catch { case e: Exception => println("Exception in child thread: " + e + " " + e.getMessage) System.exit(1) } } }) } latch.await() val end = 
System.currentTimeMillis() val time = (end - start) / 1000.0 val bytesPerSecond = totalBytes.get() / time val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits)) System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile))) System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong))) executor.shutdown() sc.stop() } }
Example 9
Source File: HashShuffleWriter.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle._
import org.apache.spark.storage.BlockObjectWriter

/**
 * Shuffle writer that obtains its per-partition writers from the given
 * `FileShuffleBlockResolver` and, when stopped, either commits or reverts them.
 */
private[spark] class HashShuffleWriter[K, V](
    shuffleBlockResolver: FileShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, _],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency
  private val numOutputSplits = dep.partitioner.numPartitions
  private val metrics = context.taskMetrics

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  // Register a fresh metrics object on the task before any writer is created.
  private val writeMetrics = new ShuffleWriteMetrics()
  metrics.shuffleWriteMetrics = Some(writeMetrics)

  private val blockManager = SparkEnv.get.blockManager
  private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null))
  private val shuffle =
    shuffleBlockResolver.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser, writeMetrics)

  /**
   * Stops the writer. On the first call, commits the writes and returns their status when
   * `initiallySuccess` is true, or reverts them and returns `None` otherwise. Later calls
   * return `None`. If committing throws, the writes are reverted and the exception rethrown.
   * The writers are always released afterwards with the final success flag.
   */
  override def stop(initiallySuccess: Boolean): Option[MapStatus] = {
    var success = initiallySuccess
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (!success) {
          revertWrites()
          None
        } else {
          try {
            Some(commitWritesAndBuildStatus())
          } catch {
            case e: Exception =>
              success = false
              revertWrites()
              throw e
          }
        }
      }
    } finally {
      // Release the writers back to the shuffle block manager.
      if (shuffle != null && shuffle.writers != null) {
        try {
          shuffle.releaseWriters(success)
        } catch {
          case e: Exception => logError("Failed to release shuffle writers", e)
        }
      }
    }
  }

  /** Commits every writer and builds a `MapStatus` from the resulting segment lengths. */
  private def commitWritesAndBuildStatus(): MapStatus = {
    // Commit the writes. Get the size of each bucket block (total block size).
    val blockSizes: Array[Long] = shuffle.writers.map { bucketWriter =>
      bucketWriter.commitAndClose()
      bucketWriter.fileSegment().length
    }
    MapStatus(blockManager.shuffleServerId, blockSizes)
  }

  /** Reverts and closes every writer, if the writer group exists. */
  private def revertWrites(): Unit = {
    if (shuffle != null && shuffle.writers != null) {
      shuffle.writers.foreach(_.revertPartialWritesAndClose())
    }
  }
}
Example 10
Source File: SortShuffleWriter.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriter, BaseShuffleHandle}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer backed by an `ExternalSorter`. When stopped after a failure, it asks the
 * `IndexShuffleBlockResolver` to remove the data for this map task.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Register a fresh metrics object on the task.
  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  /**
   * Stops the writer. The first call returns `Option(mapStatus)` when `success` is true, or
   * removes this map task's data and returns `None` otherwise. Later calls return `None`.
   * A non-null sorter is always stopped afterwards, and the nanoseconds that takes are added
   * to the task's shuffle write time.
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) {
          Option(mapStatus)
        } else {
          // The map task failed, so delete our output data.
          shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId)
          None
        }
      }
    } finally {
      // Clean up our sorter, which may have its own intermediate files
      Option(sorter).foreach { activeSorter =>
        val stopBegan = System.nanoTime()
        activeSorter.stop()
        context.taskMetrics.shuffleWriteMetrics.foreach { taskWriteMetrics =>
          taskWriteMetrics.incShuffleWriteTime(System.nanoTime - stopBegan)
        }
        sorter = null
      }
    }
  }
}
Example 11
Source File: HashShuffleManagerSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockResolver
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

/** Tests for writing to a consolidated shuffle group through `FileShuffleBlockResolver`. */
class HashShuffleManagerSuite extends SparkFunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  /**
   * Asserts that `buffer` is a `FileSegmentManagedBuffer` describing the same file (by
   * canonical path), offset and length as `expected`. Fails the test for any other buffer.
   */
  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer): Unit = {
    buffer match {
      case segment: FileSegmentManagedBuffer =>
        assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
        assert(expected.offset === segment.getOffset)
        assert(expected.length === segment.getLength)
      case other =>
        fail(s"Expected a FileSegmentManagedBuffer but got: $other")
    }
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // reset after EACH object write. This is to ensure that there are bytes appended after
    // an object is written. So if the codepaths assume writeObject is end of data, this should
    // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockResolver =
      SparkEnv.get.shuffleManager.shuffleBlockResolver.asInstanceOf[FileShuffleBlockResolver]

    val shuffle1 = shuffleBlockResolver.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle1.writers) {
      writer.write("test1", "value")
      writer.write("test2", "value")
    }
    for (writer <- shuffle1.writers) {
      writer.commitAndClose()
    }

    // NOTE(review): this segment is captured but never asserted on below — confirm whether a
    // check against ShuffleBlockId(1, 1, 0) was intended.
    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockResolver.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle2.writers) {
      writer.write("test3", "value")
      writer.write("test4", "vlue")
    }
    for (writer <- shuffle2.writers) {
      writer.commitAndClose()
    }

    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test :
    // Write to shuffle 3; and close it, but before registering it, check that the segment
    // recorded for the previous task (shuffle2) is still what the block resolver reports.
    // Earlier, we were inferring length of block based on remaining data in file : which could
    // mess things up when there is concurrent read and writes happening to the same shuffle
    // group.
    val shuffle3 = shuffleBlockResolver.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle3.writers) {
      writer.write("test3", "value")
      writer.write("test4", "value")
    }
    for (writer <- shuffle3.writers) {
      writer.commitAndClose()
    }

    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    // ... and again after shuffle3's writers have been released.
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockResolver.removeShuffle(1)
  }

  /**
   * Appends `numBytes` characters to `file`, the i-th having character code `i`. The writer
   * is closed even if a write fails.
   */
  def writeToFile(file: File, numBytes: Int): Unit = {
    val writer = new FileWriter(file, true)
    try {
      for (i <- 0 until numBytes) writer.write(i)
    } finally {
      writer.close()
    }
  }
}
Example 12
Source File: BlockObjectWriterSuite.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import java.io.File

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.Utils

/** Tests for the write metrics and lifecycle of `DiskBlockObjectWriter`. */
class BlockObjectWriterSuite extends SparkFunSuite {

  /** Returns a path named "somefile" inside a newly created temporary directory. */
  private def tempFile(): File = new File(Utils.createTempDir(), "somefile")

  /** Builds the writer under test for `file`, reporting into `writeMetrics`. */
  private def writerFor(file: File, writeMetrics: ShuffleWriteMetrics): DiskBlockObjectWriter = {
    new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)
  }

  /**
   * Writes 33 key/value records through `writer`, asserting after the first one and after the
   * last one that `writeMetrics` reports the expected record and byte counts.
   */
  private def writeRecordsAndCheckMetrics(
      writer: DiskBlockObjectWriter,
      writeMetrics: ShuffleWriteMetrics): Unit = {
    writer.write(Long.box(20), Long.box(30))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)

    // After 32 writes, metrics should update
    (0 until 32).foreach { i =>
      writer.flush()
      writer.write(Long.box(i), Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
  }

  test("verify write metrics") {
    val target = tempFile()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = writerFor(target, writeMetrics)

    writeRecordsAndCheckMetrics(writer, writeMetrics)
    writer.commitAndClose()
    assert(target.length() == writeMetrics.shuffleBytesWritten)
  }

  test("verify write metrics on revert") {
    val target = tempFile()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = writerFor(target, writeMetrics)

    writeRecordsAndCheckMetrics(writer, writeMetrics)
    writer.revertPartialWritesAndClose()
    assert(writeMetrics.shuffleBytesWritten == 0)
    assert(writeMetrics.shuffleRecordsWritten == 0)
  }

  test("Reopening a closed block writer") {
    val target = tempFile()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = writerFor(target, writeMetrics)

    writer.open()
    writer.close()
    intercept[IllegalStateException] {
      writer.open()
    }
  }
}
Example 13
Source File: StoragePerfTester.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.tools import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.shuffle.hash.HashShuffleManager import org.apache.spark.util.Utils val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500) val recordLength = 1000 // ~1KB records val totalRecords = dataSizeMb * 1000 val recordsPerMap = totalRecords / numMaps val writeKey = "1" * (recordLength / 2) val writeValue = "1" * (recordLength / 2) val executor = Executors.newFixedThreadPool(numMaps) val conf = new SparkConf() .set("spark.shuffle.compress", "false") .set("spark.shuffle.sync", "true") .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") // This is only used to instantiate a BlockManager. All thread scheduling is done manually. val sc = new SparkContext("local[4]", "Write Tester", conf) val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager] def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = { val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits, new KryoSerializer(sc.conf), new ShuffleWriteMetrics()) val writers = shuffle.writers for (i <- 1 to recordsPerMap) { writers(i % numOutputSplits).write(writeKey, writeValue) } writers.map { w => w.commitAndClose() total.addAndGet(w.fileSegment().length) } shuffle.releaseWriters(true) } val start = System.currentTimeMillis() val latch = new CountDownLatch(numMaps) val totalBytes = new AtomicLong() for (task <- 1 to numMaps) { executor.submit(new Runnable() { override def run(): Unit = { try { writeOutputBytes(task, totalBytes) latch.countDown() } catch { case e: Exception => // scalastyle:off println println("Exception in child thread: " + e + " " + e.getMessage) // scalastyle:on println System.exit(1) } } }) 
} latch.await() val end = System.currentTimeMillis() val time = (end - start) / 1000.0 val bytesPerSecond = totalBytes.get() / time val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong // scalastyle:off println System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits)) System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile))) System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong))) // scalastyle:on println executor.shutdown() sc.stop() } }
Example 14
Source File: SortShuffleWriter.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.sort

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.ExternalSorter

/**
 * Shuffle writer backed by an `ExternalSorter`. When stopped after a failure, it asks the
 * `IndexShuffleBlockResolver` to remove the data for this map task.
 */
private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  // Register a fresh metrics object on the task.
  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  /**
   * Stops the writer. The first call returns `Option(mapStatus)` when `success` is true, or
   * removes this map task's data and returns `None` otherwise. Later calls return `None`.
   * A non-null sorter is always stopped afterwards, and the nanoseconds that takes are added
   * to the task's shuffle write time.
   */
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) {
          Option(mapStatus)
        } else {
          // The map task failed, so delete our output data.
          shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId)
          None
        }
      }
    } finally {
      // Clean up our sorter, which may have its own intermediate files
      Option(sorter).foreach { activeSorter =>
        val stopBegan = System.nanoTime()
        activeSorter.stop()
        context.taskMetrics.shuffleWriteMetrics.foreach { taskWriteMetrics =>
          taskWriteMetrics.incShuffleWriteTime(System.nanoTime - stopBegan)
        }
        sorter = null
      }
    }
  }
}

private[spark] object SortShuffleWriter {

  /**
   * Decides whether the merge-sort can be bypassed for `dep`.
   *
   * Returns false whenever the dependency uses map-side combine (and requires that an
   * aggregator is defined in that case). Otherwise returns true when the number of partitions
   * does not exceed `spark.shuffle.sort.bypassMergeThreshold`, which defaults to 200.
   */
  def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
    val combinesOnMapSide = dep.mapSideCombine
    if (combinesOnMapSide) {
      // We cannot bypass sorting if we need to do map-side aggregation.
      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
    }
    // The threshold is only read when map-side combine is off, as `&&` short-circuits.
    !combinesOnMapSide &&
      dep.partitioner.numPartitions <= conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
  }
}