org.apache.spark.executor.ShuffleWriteMetrics Scala Examples

The following examples show how to use org.apache.spark.executor.ShuffleWriteMetrics. They are drawn from several open-source projects built on Apache Spark; each example is labeled with the source file and project it comes from.
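Before the project examples, a minimal standalone sketch of the class itself may help. It is not taken from any of the projects below; it assumes the Spark 1.x-era method names that most of these examples use (shuffleBytesWritten, incShuffleWriteTime, and so on — Spark 2.x renames them to bytesWritten, incWriteTime, etc.), and it is declared under an org.apache.spark package because the incrementer methods are private[spark].

package org.apache.spark.examples

import org.apache.spark.executor.ShuffleWriteMetrics

object ShuffleWriteMetricsSketch {
  def main(args: Array[String]): Unit = {
    val metrics = new ShuffleWriteMetrics()
    val start = System.nanoTime
    // Pretend we just wrote one 1 KB record to a shuffle file.
    metrics.incShuffleBytesWritten(1024L)
    metrics.incShuffleRecordsWritten(1L)
    metrics.incShuffleWriteTime(System.nanoTime - start)  // write time is tracked in nanoseconds
    println(s"${metrics.shuffleBytesWritten} bytes, ${metrics.shuffleRecordsWritten} records, " +
      s"${metrics.shuffleWriteTime} ns")
  }
}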
Example 1
Source File: FutureTaskNotifier.scala    From drizzle-spark    with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.storage.BlockManagerId
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.storage.StorageLevel


private[spark] object FutureTaskNotifier extends Logging {

  def taskCompleted(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      shuffleWriteMetrics: ShuffleWriteMetrics,
      skipZeroByteNotifications: Boolean): Unit = {
    if (!nextStageLocs.isEmpty && numReduces == nextStageLocs.get.length) {
      val drizzleRpcsStart = System.nanoTime
      sendMapStatusToNextTaskLocations(status, mapId, shuffleId, numReduces, nextStageLocs,
        skipZeroByteNotifications)
      shuffleWriteMetrics.incWriteTime(System.nanoTime -
        drizzleRpcsStart)
    } else {
      logInfo(
        s"No taskCompletion next: ${nextStageLocs.map(_.length).getOrElse(0)} r: $numReduces")
    }
  }

  // Push metadata saying that this map task finished, so that the tasks in the next stage
  // know they can begin pulling the data.
  private def sendMapStatusToNextTaskLocations(
      status: MapStatus,
      mapId: Int,
      shuffleId: Int,
      numReduces: Int,
      nextStageLocs: Option[Seq[BlockManagerId]],
      skipZeroByteNotifications: Boolean) {
    val numReduces = nextStageLocs.get.length
    val uniqueLocations = if (skipZeroByteNotifications) {
      nextStageLocs.get.zipWithIndex.filter { x =>
        status.getSizeForBlock(x._2) != 0L
      }.map(_._1).toSet
    } else {
      nextStageLocs.get.toSet
    }
    uniqueLocations.foreach { blockManagerId =>
      try {
        SparkEnv.get.blockManager.blockTransferService.mapOutputReady(
          blockManagerId.host,
          blockManagerId.port,
          shuffleId,
          mapId,
          numReduces,
          status)
      } catch {
        case e: Exception =>
          logWarning(s"Failed to send map outputs to $blockManagerId", e)
      }
    }
  }

} 
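The pattern to notice in this example is that the time spent sending the extra Drizzle RPCs is charged to the task's shuffle write time via incWriteTime. Below is a generic sketch of that pattern; the WriteTimeAccounting helper is hypothetical (not part of drizzle-spark), assumes the Spark 2.x-style incWriteTime method that drizzle-spark uses, and must live under an org.apache.spark package because the incrementer is private[spark].

package org.apache.spark.examples

import org.apache.spark.executor.ShuffleWriteMetrics

object WriteTimeAccounting {
  // Run an operation and fold its elapsed nanoseconds into shuffle write time,
  // mirroring how taskCompleted above accounts for its notification RPCs.
  def timed[T](metrics: ShuffleWriteMetrics)(op: => T): T = {
    val start = System.nanoTime
    try op
    finally metrics.incWriteTime(System.nanoTime - start)
  }
}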
Example 2
Source File: SplashShuffleWriter.scala    From splash    with Apache License 2.0
package org.apache.spark.shuffle

import org.apache.spark.TaskContext
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.internal.Logging
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.storage.ShuffleBlockId


  // Excerpt: stop() from SplashShuffleWriter; the enclosing class (with its resolver,
  // partitionLengths, sorter, stopping and writeMetrics members) is omitted on this page.
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        None
      } else {
        stopping = true
        if (success) {
          Option(MapStatus(resolver.blockManagerId, partitionLengths))
        } else {
          None
        }
      }
    } finally {
      if (sorter != null) {
        val startTime = System.nanoTime
        sorter.stop()
        writeMetrics.incWriteTime(System.nanoTime - startTime)
        sorter = null
      }
    }
  }
} 
Example 3
Source File: RemoteShuffleUtils.scala    From OAP    with Apache License 2.0
package org.apache.spark.shuffle.remote

import java.util.UUID

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkEnv
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.{SerializerInstance, SerializerManager}
import org.apache.spark.shuffle.ShuffleWriteMetricsReporter
import org.apache.spark.storage.{BlockId, TempLocalBlockId, TempShuffleBlockId}

object RemoteShuffleUtils {

  val env = SparkEnv.get

  
  // Builds a writer for a remote (e.g. HDFS-backed) shuffle file that reports every write
  // through the given ShuffleWriteMetricsReporter.
  def getRemoteWriter(
      blockId: BlockId,
      file: Path,
      serializerManager: SerializerManager,
      serializerInstance: SerializerInstance,
      bufferSize: Int,
      writeMetrics: ShuffleWriteMetricsReporter): RemoteBlockObjectWriter = {
    val syncWrites = false // env.blockManager.conf.getBoolean("spark.shuffle.sync", false)
    new RemoteBlockObjectWriter(file, serializerManager, serializerInstance, bufferSize,
      syncWrites, writeMetrics, blockId)
  }

} 
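A hypothetical call to the factory above might look as follows. Every value here (the block id, the path, the buffer size) is made up for illustration; it presumes an active SparkEnv, the OAP remote-shuffle classes on the classpath, and a Spark version in which ShuffleWriteMetrics implements ShuffleWriteMetricsReporter, as the example's import list indicates. It is placed in the same package as the example so the private[spark] types resolve.

package org.apache.spark.shuffle.remote

import java.util.UUID

import org.apache.hadoop.fs.Path
import org.apache.spark.SparkEnv
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.storage.TempShuffleBlockId

object RemoteWriterUsageSketch {
  def example(): Unit = {
    val env = SparkEnv.get
    val writeMetrics = new ShuffleWriteMetrics()
    // The returned writer reports every byte and record it writes through writeMetrics.
    val writer = RemoteShuffleUtils.getRemoteWriter(
      TempShuffleBlockId(UUID.randomUUID()),
      new Path("/tmp/remote-shuffle/temp-block"),  // assumed storage location
      env.serializerManager,
      env.serializer.newInstance(),
      32 * 1024,
      writeMetrics)
    // Writing and committing are assumed to follow the same API shape as DiskBlockObjectWriter;
    // that interface is not shown in this excerpt.
  }
}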
Example 4
Source File: HashShuffleWriter.scala    From SparkCore    with Apache License 2.0
package org.apache.spark.shuffle.hash

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle._
import org.apache.spark.storage.BlockObjectWriter

private[spark] class HashShuffleWriter[K, V](
    shuffleBlockManager: FileShuffleBlockManager,
    handle: BaseShuffleHandle[K, V, _],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency
  private val numOutputSplits = dep.partitioner.numPartitions
  private val metrics = context.taskMetrics

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private val writeMetrics = new ShuffleWriteMetrics()
  metrics.shuffleWriteMetrics = Some(writeMetrics)

  private val blockManager = SparkEnv.get.blockManager
  private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null))
  private val shuffle = shuffleBlockManager.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser,
    writeMetrics)

  
  // write() (omitted from this excerpt) serializes each record to its bucket's writer.
  override def stop(initiallySuccess: Boolean): Option[MapStatus] = {
    var success = initiallySuccess
    try {
      if (stopping) {
        return None
      }
      stopping = true
      if (success) {
        try {
          Some(commitWritesAndBuildStatus())
        } catch {
          case e: Exception =>
            success = false
            revertWrites()
            throw e
        }
      } else {
        revertWrites()
        None
      }
    } finally {
      // Release the writers back to the shuffle block manager.
      if (shuffle != null && shuffle.writers != null) {
        try {
          shuffle.releaseWriters(success)
        } catch {
          case e: Exception => logError("Failed to release shuffle writers", e)
        }
      }
    }
  }

  private def commitWritesAndBuildStatus(): MapStatus = {
    // Commit the writes. Get the size of each bucket block (total block size).
    val sizes: Array[Long] = shuffle.writers.map { writer: BlockObjectWriter =>
      writer.commitAndClose()
      writer.fileSegment().length
    }
    MapStatus(blockManager.shuffleServerId, sizes)
  }

  private def revertWrites(): Unit = {
    if (shuffle != null && shuffle.writers != null) {
      for (writer <- shuffle.writers) {
        writer.revertPartialWritesAndClose()
      }
    }
  }
} 
Example 5
Source File: SortShuffleWriter.scala    From SparkCore    with Apache License 2.0
package org.apache.spark.shuffle.sort

import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockManager, ShuffleWriter, BaseShuffleHandle}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.ExternalSorter

private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockManager: IndexShuffleBlockManager,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  
  // write() (omitted from this excerpt) runs the ExternalSorter and assigns mapStatus.
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        return None
      }
      stopping = true
      if (success) {
        return Option(mapStatus)
      } else {
        // The map task failed, so delete our output data.
        shuffleBlockManager.removeDataByMap(dep.shuffleId, mapId)
        return None
      }
    } finally {
      // Clean up our sorter, which may have its own intermediate files
      if (sorter != null) {
        sorter.stop()
        sorter = null
      }
    }
  }
} 
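A small detail worth calling out in this writer: stop() returns Option(mapStatus) rather than Some(mapStatus). Option(...) collapses a null into None, which protects callers if stop(success = true) is somehow reached before write() has assigned mapStatus. A tiny standalone illustration of the difference:

object OptionVsSome {
  def main(args: Array[String]): Unit = {
    val unset: String = null
    assert(Option(unset).isEmpty)   // Option(null) == None
    assert(Some(unset).isDefined)   // Some(null) would hand the null straight to the caller
  }
}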
Example 6
Source File: HashShuffleManagerSuite.scala    From SparkCore    with Apache License 2.0
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.scalatest.FunSuite

import org.apache.spark.{SparkEnv, SparkContext, LocalSparkContext, SparkConf}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockManager
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

class HashShuffleManagerSuite extends FunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) {
    assert(buffer.isInstanceOf[FileSegmentManagedBuffer])
    val segment = buffer.asInstanceOf[FileSegmentManagedBuffer]
    assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
    assert(expected.offset === segment.getOffset)
    assert(expected.length === segment.getLength)
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // Reset the serialization stream after EACH object write. This ensures that there are bytes
    // appended after an object is written, so if any codepath assumes writeObject marks the end
    // of the data, this should flush those bugs out. This was a common bug in
    // ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockManager =
      SparkEnv.get.shuffleManager.shuffleBlockManager.asInstanceOf[FileShuffleBlockManager]

    val shuffle1 = shuffleBlockManager.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle1.writers) {
      writer.write("test1")
      writer.write("test2")
    }
    for (writer <- shuffle1.writers) {
      writer.commitAndClose()
    }

    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockManager.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)

    for (writer <- shuffle2.writers) {
      writer.write("test3")
      writer.write("test4")
    }
    for (writer <- shuffle2.writers) {
      writer.commitAndClose()
    }
    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test:
    // Write to shuffle 3 and close it, but before registering it, check that the file lengths
    // for the previous task (for shuffle1) are the same as 'segments'. Earlier, we were
    // inferring the length of a block from the remaining data in the file, which could mess
    // things up when there are concurrent reads and writes to the same shuffle group.

    val shuffle3 = shuffleBlockManager.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle3.writers) {
      writer.write("test3")
      writer.write("test4")
    }
    for (writer <- shuffle3.writers) {
      writer.commitAndClose()
    }
    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockManager.removeShuffle(1)
  }

  def writeToFile(file: File, numBytes: Int) {
    val writer = new FileWriter(file, true)
    for (i <- 0 until numBytes) writer.write(i)
    writer.close()
  }
} 
Example 7
Source File: BlockObjectWriterSuite.scala    From SparkCore    with Apache License 2.0
package org.apache.spark.storage

import org.scalatest.FunSuite
import java.io.File
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.SparkConf

class BlockObjectWriterSuite extends FunSuite {
  test("verify write metrics") {
    val file = new File("somefile")
    file.deleteOnExit()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.commitAndClose()
    assert(file.length() == writeMetrics.shuffleBytesWritten)
  }

  test("verify write metrics on revert") {
    val file = new File("somefile")
    file.deleteOnExit()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.revertPartialWritesAndClose()
    assert(writeMetrics.shuffleBytesWritten == 0)
    assert(writeMetrics.shuffleRecordsWritten == 0)
  }

  test("Reopening a closed block writer") {
    val file = new File("somefile")
    file.deleteOnExit()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics)

    writer.open()
    writer.close()
    intercept[IllegalStateException] {
      writer.open()
    }
  }
} 
Example 8
Source File: StoragePerfTester.scala    From iolap    with Apache License 2.0
package org.apache.spark.tools

import java.util.concurrent.{CountDownLatch, Executors}
import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.shuffle.hash.HashShuffleManager
import org.apache.spark.util.Utils


    // Excerpt from StoragePerfTester.main(); the enclosing object and the earlier definitions
    // of dataSizeMb and numMaps are omitted on this page.
    val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500)

    val recordLength = 1000 // ~1KB records
    val totalRecords = dataSizeMb * 1000
    val recordsPerMap = totalRecords / numMaps

    val writeKey = "1" * (recordLength / 2)
    val writeValue = "1" * (recordLength / 2)
    val executor = Executors.newFixedThreadPool(numMaps)

    val conf = new SparkConf()
      .set("spark.shuffle.compress", "false")
      .set("spark.shuffle.sync", "true")
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    // This is only used to instantiate a BlockManager. All thread scheduling is done manually.
    val sc = new SparkContext("local[4]", "Write Tester", conf)
    val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager]

    def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = {
      val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits,
        new KryoSerializer(sc.conf), new ShuffleWriteMetrics())
      val writers = shuffle.writers
      for (i <- 1 to recordsPerMap) {
        writers(i % numOutputSplits).write(writeKey, writeValue)
      }
      writers.map { w =>
        w.commitAndClose()
        total.addAndGet(w.fileSegment().length)
      }

      shuffle.releaseWriters(true)
    }

    val start = System.currentTimeMillis()
    val latch = new CountDownLatch(numMaps)
    val totalBytes = new AtomicLong()
    for (task <- 1 to numMaps) {
      executor.submit(new Runnable() {
        override def run(): Unit = {
          try {
            writeOutputBytes(task, totalBytes)
            latch.countDown()
          } catch {
            case e: Exception =>
              println("Exception in child thread: " + e + " " + e.getMessage)
              System.exit(1)
          }
        }
      })
    }
    latch.await()
    val end = System.currentTimeMillis()
    val time = (end - start) / 1000.0
    val bytesPerSecond = totalBytes.get() / time
    val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong

    System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits))
    System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile)))
    System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong)))

    executor.shutdown()
    sc.stop()
  }
} 
Example 9
Source File: HashShuffleWriter.scala    From iolap    with Apache License 2.0
package org.apache.spark.shuffle.hash

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.serializer.Serializer
import org.apache.spark.shuffle._
import org.apache.spark.storage.BlockObjectWriter

private[spark] class HashShuffleWriter[K, V](
    shuffleBlockResolver: FileShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, _],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency
  private val numOutputSplits = dep.partitioner.numPartitions
  private val metrics = context.taskMetrics

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private val writeMetrics = new ShuffleWriteMetrics()
  metrics.shuffleWriteMetrics = Some(writeMetrics)

  private val blockManager = SparkEnv.get.blockManager
  private val ser = Serializer.getSerializer(dep.serializer.getOrElse(null))
  private val shuffle = shuffleBlockResolver.forMapTask(dep.shuffleId, mapId, numOutputSplits, ser,
    writeMetrics)

  
  // write() (omitted from this excerpt) serializes each record to its bucket's writer.
  override def stop(initiallySuccess: Boolean): Option[MapStatus] = {
    var success = initiallySuccess
    try {
      if (stopping) {
        return None
      }
      stopping = true
      if (success) {
        try {
          Some(commitWritesAndBuildStatus())
        } catch {
          case e: Exception =>
            success = false
            revertWrites()
            throw e
        }
      } else {
        revertWrites()
        None
      }
    } finally {
      // Release the writers back to the shuffle block manager.
      if (shuffle != null && shuffle.writers != null) {
        try {
          shuffle.releaseWriters(success)
        } catch {
          case e: Exception => logError("Failed to release shuffle writers", e)
        }
      }
    }
  }

  private def commitWritesAndBuildStatus(): MapStatus = {
    // Commit the writes. Get the size of each bucket block (total block size).
    val sizes: Array[Long] = shuffle.writers.map { writer: BlockObjectWriter =>
      writer.commitAndClose()
      writer.fileSegment().length
    }
    MapStatus(blockManager.shuffleServerId, sizes)
  }

  private def revertWrites(): Unit = {
    if (shuffle != null && shuffle.writers != null) {
      for (writer <- shuffle.writers) {
        writer.revertPartialWritesAndClose()
      }
    }
  }
} 
Example 10
Source File: SortShuffleWriter.scala    From iolap    with Apache License 2.0
package org.apache.spark.shuffle.sort

import org.apache.spark.{MapOutputTracker, SparkEnv, Logging, TaskContext}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{IndexShuffleBlockResolver, ShuffleWriter, BaseShuffleHandle}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.collection.ExternalSorter

private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  
  // write() (omitted from this excerpt) runs the ExternalSorter and assigns mapStatus.
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        return None
      }
      stopping = true
      if (success) {
        return Option(mapStatus)
      } else {
        // The map task failed, so delete our output data.
        shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId)
        return None
      }
    } finally {
      // Clean up our sorter, which may have its own intermediate files
      if (sorter != null) {
        val startTime = System.nanoTime()
        sorter.stop()
        context.taskMetrics.shuffleWriteMetrics.foreach(
          _.incShuffleWriteTime(System.nanoTime - startTime))
        sorter = null
      }
    }
  }
} 
Example 11
Source File: HashShuffleManagerSuite.scala    From iolap    with Apache License 2.0
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockResolver
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

class HashShuffleManagerSuite extends SparkFunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) {
    assert(buffer.isInstanceOf[FileSegmentManagedBuffer])
    val segment = buffer.asInstanceOf[FileSegmentManagedBuffer]
    assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
    assert(expected.offset === segment.getOffset)
    assert(expected.length === segment.getLength)
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // Reset the serialization stream after EACH object write. This ensures that there are bytes
    // appended after an object is written, so if any codepath assumes writeObject marks the end
    // of the data, this should flush those bugs out. This was a common bug in
    // ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockResolver =
      SparkEnv.get.shuffleManager.shuffleBlockResolver.asInstanceOf[FileShuffleBlockResolver]

    val shuffle1 = shuffleBlockResolver.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle1.writers) {
      writer.write("test1", "value")
      writer.write("test2", "value")
    }
    for (writer <- shuffle1.writers) {
      writer.commitAndClose()
    }

    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockResolver.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)

    for (writer <- shuffle2.writers) {
      writer.write("test3", "value")
      writer.write("test4", "vlue")
    }
    for (writer <- shuffle2.writers) {
      writer.commitAndClose()
    }
    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test:
    // Write to shuffle 3 and close it, but before registering it, check that the file lengths
    // for the previous task (for shuffle1) are the same as 'segments'. Earlier, we were
    // inferring the length of a block from the remaining data in the file, which could mess
    // things up when there are concurrent reads and writes to the same shuffle group.

    val shuffle3 = shuffleBlockResolver.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle3.writers) {
      writer.write("test3", "value")
      writer.write("test4", "value")
    }
    for (writer <- shuffle3.writers) {
      writer.commitAndClose()
    }
    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockResolver.removeShuffle(1)
  }

  def writeToFile(file: File, numBytes: Int) {
    val writer = new FileWriter(file, true)
    for (i <- 0 until numBytes) writer.write(i)
    writer.close()
  }
} 
Example 12
Source File: BlockObjectWriterSuite.scala    From iolap    with Apache License 2.0
package org.apache.spark.storage

import java.io.File

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.Utils

class BlockObjectWriterSuite extends SparkFunSuite {
  test("verify write metrics") {
    val file = new File(Utils.createTempDir(), "somefile")
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20), Long.box(30))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i), Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.commitAndClose()
    assert(file.length() == writeMetrics.shuffleBytesWritten)
  }

  test("verify write metrics on revert") {
    val file = new File(Utils.createTempDir(), "somefile")
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20), Long.box(30))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i), Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.revertPartialWritesAndClose()
    assert(writeMetrics.shuffleBytesWritten == 0)
    assert(writeMetrics.shuffleRecordsWritten == 0)
  }

  test("Reopening a closed block writer") {
    val file = new File(Utils.createTempDir(), "somefile")
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)

    writer.open()
    writer.close()
    intercept[IllegalStateException] {
      writer.open()
    }
  }
} 
Example 13
Source File: StoragePerfTester.scala    From BigDatalog    with Apache License 2.0
package org.apache.spark.tools

import java.util.concurrent.{CountDownLatch, Executors}
import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.shuffle.hash.HashShuffleManager
import org.apache.spark.util.Utils


    // Excerpt from StoragePerfTester.main(); the enclosing object and the earlier definitions
    // of dataSizeMb and numMaps are omitted on this page.
    val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500)

    val recordLength = 1000 // ~1KB records
    val totalRecords = dataSizeMb * 1000
    val recordsPerMap = totalRecords / numMaps

    val writeKey = "1" * (recordLength / 2)
    val writeValue = "1" * (recordLength / 2)
    val executor = Executors.newFixedThreadPool(numMaps)

    val conf = new SparkConf()
      .set("spark.shuffle.compress", "false")
      .set("spark.shuffle.sync", "true")
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    // This is only used to instantiate a BlockManager. All thread scheduling is done manually.
    val sc = new SparkContext("local[4]", "Write Tester", conf)
    val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager]

    def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = {
      val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits,
        new KryoSerializer(sc.conf), new ShuffleWriteMetrics())
      val writers = shuffle.writers
      for (i <- 1 to recordsPerMap) {
        writers(i % numOutputSplits).write(writeKey, writeValue)
      }
      writers.map { w =>
        w.commitAndClose()
        total.addAndGet(w.fileSegment().length)
      }

      shuffle.releaseWriters(true)
    }

    val start = System.currentTimeMillis()
    val latch = new CountDownLatch(numMaps)
    val totalBytes = new AtomicLong()
    for (task <- 1 to numMaps) {
      executor.submit(new Runnable() {
        override def run(): Unit = {
          try {
            writeOutputBytes(task, totalBytes)
            latch.countDown()
          } catch {
            case e: Exception =>
              // scalastyle:off println
              println("Exception in child thread: " + e + " " + e.getMessage)
              // scalastyle:on println
              System.exit(1)
          }
        }
      })
    }
    latch.await()
    val end = System.currentTimeMillis()
    val time = (end - start) / 1000.0
    val bytesPerSecond = totalBytes.get() / time
    val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong

    // scalastyle:off println
    System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits))
    System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile)))
    System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong)))
    // scalastyle:on println

    executor.shutdown()
    sc.stop()
  }
} 
Example 14
Source File: SortShuffleWriter.scala    From BigDatalog    with Apache License 2.0
package org.apache.spark.shuffle.sort

import org.apache.spark._
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.scheduler.MapStatus
import org.apache.spark.shuffle.{BaseShuffleHandle, IndexShuffleBlockResolver, ShuffleWriter}
import org.apache.spark.storage.ShuffleBlockId
import org.apache.spark.util.Utils
import org.apache.spark.util.collection.ExternalSorter

private[spark] class SortShuffleWriter[K, V, C](
    shuffleBlockResolver: IndexShuffleBlockResolver,
    handle: BaseShuffleHandle[K, V, C],
    mapId: Int,
    context: TaskContext)
  extends ShuffleWriter[K, V] with Logging {

  private val dep = handle.dependency

  private val blockManager = SparkEnv.get.blockManager

  private var sorter: ExternalSorter[K, V, _] = null

  // Are we in the process of stopping? Because map tasks can call stop() with success = true
  // and then call stop() with success = false if they get an exception, we want to make sure
  // we don't try deleting files, etc twice.
  private var stopping = false

  private var mapStatus: MapStatus = null

  private val writeMetrics = new ShuffleWriteMetrics()
  context.taskMetrics.shuffleWriteMetrics = Some(writeMetrics)

  
  // write() (omitted from this excerpt) runs the ExternalSorter and assigns mapStatus.
  override def stop(success: Boolean): Option[MapStatus] = {
    try {
      if (stopping) {
        return None
      }
      stopping = true
      if (success) {
        return Option(mapStatus)
      } else {
        // The map task failed, so delete our output data.
        shuffleBlockResolver.removeDataByMap(dep.shuffleId, mapId)
        return None
      }
    } finally {
      // Clean up our sorter, which may have its own intermediate files
      if (sorter != null) {
        val startTime = System.nanoTime()
        sorter.stop()
        context.taskMetrics.shuffleWriteMetrics.foreach(
          _.incShuffleWriteTime(System.nanoTime - startTime))
        sorter = null
      }
    }
  }
}

private[spark] object SortShuffleWriter {
  def shouldBypassMergeSort(conf: SparkConf, dep: ShuffleDependency[_, _, _]): Boolean = {
    // We cannot bypass sorting if we need to do map-side aggregation.
    if (dep.mapSideCombine) {
      require(dep.aggregator.isDefined, "Map-side combine without Aggregator specified!")
      false
    } else {
      val bypassMergeThreshold: Int = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
      dep.partitioner.numPartitions <= bypassMergeThreshold
    }
  }
}
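The companion object's shouldBypassMergeSort decision is controlled by spark.shuffle.sort.bypassMergeThreshold (default 200) and by whether the dependency needs map-side combine. The sketch below restates the same rule against SparkConf alone; the wouldBypass helper is hypothetical and stands in for the real method, which takes the ShuffleDependency itself.

import org.apache.spark.SparkConf

object BypassCheckSketch {
  // Same decision rule as shouldBypassMergeSort, with the map-side-combine flag passed explicitly.
  def wouldBypass(conf: SparkConf, numPartitions: Int, mapSideCombine: Boolean): Boolean = {
    val threshold = conf.getInt("spark.shuffle.sort.bypassMergeThreshold", 200)
    !mapSideCombine && numPartitions <= threshold
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(false)
    assert(wouldBypass(conf, numPartitions = 50, mapSideCombine = false))    // under the default 200
    assert(!wouldBypass(conf, numPartitions = 500, mapSideCombine = false))  // over the threshold
    assert(!wouldBypass(conf, numPartitions = 50, mapSideCombine = true))    // combine forces the sort path
  }
}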