org.apache.spark.serializer.JavaSerializer Scala Examples

The following examples show how to use org.apache.spark.serializer.JavaSerializer. The originating project, source file, and license are noted in the header above each example.
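As a quick orientation before the examples, here is a minimal round-trip sketch of the API most of them exercise: build a JavaSerializer from a SparkConf, obtain a SerializerInstance, and serialize/deserialize a value. The Payload case class and its values are illustrative assumptions, not taken from any project below.

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

object JavaSerializerRoundTrip {
  // Any java.io.Serializable value works; case classes are Serializable by default.
  case class Payload(id: Int, name: String)

  def main(args: Array[String]): Unit = {
    // new SparkConf(false) skips loading system properties; enough for a local round trip.
    val ser = new JavaSerializer(new SparkConf(false)).newInstance()

    val bytes = ser.serialize(Payload(1, "example"))   // returns a java.nio.ByteBuffer
    val copy = ser.deserialize[Payload](bytes)         // deep copy via Java serialization
    assert(copy == Payload(1, "example"))
  }
}

This serialize/deserialize round trip is the same pattern several of the suites below use to make a defensive copy of a value or to verify that an object is serializable.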
Example 1
Source File: HashMapParam.scala    From cuesheet   with Apache License 2.0
package com.kakao.cuesheet.accumulator

import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.{AccumulableParam, SparkConf}
import spire.math.Numeric

import scala.collection.mutable.{HashMap => MutableHashMap}


case class HashMapParam[K, V: Numeric]() extends AccumulableParam[MutableHashMap[K, V], (K, V)] {

  private val add = implicitly[Numeric[V]].additive.op _

  def addAccumulator(acc: MutableHashMap[K, V], elem: (K, V)): MutableHashMap[K, V] = {
    val (k1, v1) = elem
    acc += acc.find(_._1 == k1).map {
      case (k2, v2) => k2 -> add(v1, v2)
    }.getOrElse(elem)

    acc
  }

  def addInPlace(acc1: MutableHashMap[K, V], acc2: MutableHashMap[K, V]): MutableHashMap[K, V] = {
    acc2.foreach(elem => addAccumulator(acc1, elem))
    acc1
  }

  def zero(initialValue: MutableHashMap[K, V]): MutableHashMap[K, V] = {
    val ser = new JavaSerializer(new SparkConf(false)).newInstance()
    val copy = ser.deserialize[MutableHashMap[K, V]](ser.serialize(initialValue))
    copy.clear()
    copy
  }
} 
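A hedged usage sketch for Example 1 (not taken from the cuesheet sources): on a Spark version where the Accumulable API is still available, HashMapParam can back a per-key counting accumulator. The RDD contents and app name are illustrative assumptions; the spire Numeric[Long] instance is resolved from spire.math.Numeric's companion object.

import org.apache.spark.{SparkConf, SparkContext}
import scala.collection.mutable.{HashMap => MutableHashMap}
import com.kakao.cuesheet.accumulator.HashMapParam

object HashMapParamUsage {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("HashMapParam demo"))

    // Accumulable[MutableHashMap[String, Long], (String, Long)]; values are summed per key.
    val counts = sc.accumulable(MutableHashMap.empty[String, Long])(HashMapParam[String, Long]())

    sc.parallelize(Seq("a", "b", "a")).foreach { word => counts += (word -> 1L) }

    println(counts.value)   // expected: Map(a -> 2, b -> 1)
    sc.stop()
  }
}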
Example 2
Source File: SortShuffleSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be (
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index"))
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
} 
Example 3
Source File: SortShuffleSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    // Once 'spark.local.dir' is set, it is cached. Unless this is manually cleared
    // before/after a test, it could return the same directory even if this property
    // is configured.
    Utils.clearLocalRootDirs()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
      Utils.clearLocalRootDirs()
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be (
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index"))
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
} 
Example 4
Source File: YarnSchedulerBackendSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.scheduler.cluster

import scala.language.reflectiveCalls

import org.mockito.Mockito.when
import org.scalatest.mockito.MockitoSugar

import org.apache.spark.{LocalSparkContext, SparkContext, SparkFunSuite}
import org.apache.spark.scheduler.TaskSchedulerImpl
import org.apache.spark.serializer.JavaSerializer

class YarnSchedulerBackendSuite extends SparkFunSuite with MockitoSugar with LocalSparkContext {

  test("RequestExecutors reflects node blacklist and is serializable") {
    sc = new SparkContext("local", "YarnSchedulerBackendSuite")
    val sched = mock[TaskSchedulerImpl]
    when(sched.sc).thenReturn(sc)
    val yarnSchedulerBackend = new YarnSchedulerBackend(sched, sc) {
      def setHostToLocalTaskCount(hostToLocalTaskCount: Map[String, Int]): Unit = {
        this.hostToLocalTaskCount = hostToLocalTaskCount
      }
    }
    val ser = new JavaSerializer(sc.conf).newInstance()
    for {
      blacklist <- IndexedSeq(Set[String](), Set("a", "b", "c"))
      numRequested <- 0 until 10
      hostToLocalCount <- IndexedSeq(
        Map[String, Int](),
        Map("a" -> 1, "b" -> 2)
      )
    } {
      yarnSchedulerBackend.setHostToLocalTaskCount(hostToLocalCount)
      when(sched.nodeBlacklist()).thenReturn(blacklist)
      val req = yarnSchedulerBackend.prepareRequestExecutors(numRequested)
      assert(req.requestedTotal === numRequested)
      assert(req.nodeBlacklist === blacklist)
      assert(req.hostToLocalTaskCount.keySet.intersect(blacklist).isEmpty)
      // Serialize to make sure serialization doesn't throw an error
      ser.serialize(req)
    }
    sc.stop()
  }

} 
Example 5
Source File: MapStatusSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends SparkFunSuite {

  test("compressSize") {//compress 压缩大小
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") {//解压缩的大小
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }
  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }
  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }
  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
} 
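The compressSize/decompressSize expectations in the suite above follow from Spark's log-base-1.1 encoding of block sizes into a single byte. Below is a minimal sketch of that encoding, reimplemented here for illustration rather than copied from Spark's source.

object MapStatusSizeEncoding {
  private val LOG_BASE = 1.1

  // Encode a block size into one unsigned byte: 0 stays 0, sizes <= 1 become 1,
  // larger sizes become ceil(log_1.1(size)), capped at 255.
  def compressSize(size: Long): Byte = {
    if (size == 0) 0
    else if (size <= 1L) 1
    else math.min(255, math.ceil(math.log(size) / math.log(LOG_BASE)).toInt).toByte
  }

  // Decode back to an approximate size; the relative error stays within roughly 10%.
  def decompressSize(compressedSize: Byte): Long = {
    if (compressedSize == 0) 0L
    else math.pow(LOG_BASE, compressedSize & 0xFF).toLong
  }

  def main(args: Array[String]): Unit = {
    assert(compressSize(2L) == 8)                      // ceil(ln 2 / ln 1.1) = 8
    assert((compressSize(1000000L) & 0xFF) == 145)
    val size2 = decompressSize(compressSize(1000000L))
    assert(size2 >= 990000L && size2 <= 1110000L)      // the 0.99x..1.11x band tested above
  }
}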
Example 6
Source File: SerializationSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.sql.hive

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer

class SerializationSuite extends SparkFunSuite {
  test("[SPARK-5840] HiveContext should be serializable") {
    val hiveContext = org.apache.spark.sql.hive.test.TestHive
    hiveContext.hiveconf
    val serializer = new JavaSerializer(new SparkConf()).newInstance()
    val bytes = serializer.serialize(hiveContext)
    val deSer = serializer.deserialize[AnyRef](bytes)
  }
} 
Example 7
Source File: BlockObjectWriterSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.storage

import java.io.File

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.Utils

class BlockObjectWriterSuite extends SparkFunSuite {
  test("verify write metrics") {
    val file = new File(Utils.createTempDir(), "somefile")
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20), Long.box(30))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i), Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.commitAndClose()
    assert(file.length() == writeMetrics.shuffleBytesWritten)
  }

  test("verify write metrics on revert") {
    val file = new File(Utils.createTempDir(), "somefile")
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20), Long.box(30))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i), Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.revertPartialWritesAndClose()
    assert(writeMetrics.shuffleBytesWritten == 0)
    assert(writeMetrics.shuffleRecordsWritten == 0)
  }

  test("Reopening a closed block writer") {
    val file = new File(Utils.createTempDir(), "somefile")
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()).newInstance(), 1024, os => os, true, writeMetrics)

    writer.open()
    writer.close()
    intercept[IllegalStateException] {
      writer.open()
    }
  }
} 
Example 8
Source File: MapStatusSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends SparkFunSuite {

  test("compressSize") {
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") {
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
} 
Example 9
Source File: HashShuffleManagerSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext, SparkEnv, SparkFunSuite}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockResolver
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

class HashShuffleManagerSuite extends SparkFunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) {
    assert(buffer.isInstanceOf[FileSegmentManagedBuffer])
    val segment = buffer.asInstanceOf[FileSegmentManagedBuffer]
    assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
    assert(expected.offset === segment.getOffset)
    assert(expected.length === segment.getLength)
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // reset after EACH object write. This is to ensure that there are bytes appended after
    // an object is written. So if the codepaths assume writeObject is end of data, this should
    // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockResolver =
      SparkEnv.get.shuffleManager.shuffleBlockResolver.asInstanceOf[FileShuffleBlockResolver]

    val shuffle1 = shuffleBlockResolver.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle1.writers) {
      writer.write("test1", "value")
      writer.write("test2", "value")
    }
    for (writer <- shuffle1.writers) {
      writer.commitAndClose()
    }

    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockResolver.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)

    for (writer <- shuffle2.writers) {
      writer.write("test3", "value")
      writer.write("test4", "vlue")
    }
    for (writer <- shuffle2.writers) {
      writer.commitAndClose()
    }
    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test :
    // Write to shuffle 3; and close it, but before registering it, check if the file lengths for
    // the previous task (for shuffle1) are the same as 'segments'. Earlier, we were inferring length
    // of block based on remaining data in file : which could mess things up when there is
    // concurrent read and writes happening to the same shuffle group.

    val shuffle3 = shuffleBlockResolver.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle3.writers) {
      writer.write("test3", "value")
      writer.write("test4", "value")
    }
    for (writer <- shuffle3.writers) {
      writer.commitAndClose()
    }
    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    checkSegments(shuffle2Segment, shuffleBlockResolver.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockResolver.removeShuffle(1)
  }

  def writeToFile(file: File, numBytes: Int) {
    val writer = new FileWriter(file, true)
    for (i <- 0 until numBytes) writer.write(i)
    writer.close()
  }
} 
Example 10
Source File: SortShuffleSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be (
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index"))
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
} 
Example 11
Source File: SortShuffleSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be (
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index"))
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
} 
Example 12
Source File: BlockObjectWriterSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.storage

import org.scalatest.FunSuite
import java.io.File
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.SparkConf

class BlockObjectWriterSuite extends FunSuite {
  test("verify write metrics") {
    val file = new File("somefile")
    file.deleteOnExit()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.commitAndClose()
    assert(file.length() == writeMetrics.shuffleBytesWritten)
  }

  test("verify write metrics on revert") {
    val file = new File("somefile")
    file.deleteOnExit()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics)

    writer.write(Long.box(20))
    // Record metrics update on every write
    assert(writeMetrics.shuffleRecordsWritten === 1)
    // Metrics don't update on every write
    assert(writeMetrics.shuffleBytesWritten == 0)
    // After 32 writes, metrics should update
    for (i <- 0 until 32) {
      writer.flush()
      writer.write(Long.box(i))
    }
    assert(writeMetrics.shuffleBytesWritten > 0)
    assert(writeMetrics.shuffleRecordsWritten === 33)
    writer.revertPartialWritesAndClose()
    assert(writeMetrics.shuffleBytesWritten == 0)
    assert(writeMetrics.shuffleRecordsWritten == 0)
  }

  test("Reopening a closed block writer") {
    val file = new File("somefile")
    file.deleteOnExit()
    val writeMetrics = new ShuffleWriteMetrics()
    val writer = new DiskBlockObjectWriter(new TestBlockId("0"), file,
      new JavaSerializer(new SparkConf()), 1024, os => os, true, writeMetrics)

    writer.open()
    writer.close()
    intercept[IllegalStateException] {
      writer.open()
    }
  }
} 
Example 13
Source File: MapStatusSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.scheduler

import org.apache.spark.storage.BlockManagerId
import org.scalatest.FunSuite

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer

import scala.util.Random

class MapStatusSuite extends FunSuite {

  test("compressSize") {
    assert(MapStatus.compressSize(0L) === 0)
    assert(MapStatus.compressSize(1L) === 1)
    assert(MapStatus.compressSize(2L) === 8)
    assert(MapStatus.compressSize(10L) === 25)
    assert((MapStatus.compressSize(1000000L) & 0xFF) === 145)
    assert((MapStatus.compressSize(1000000000L) & 0xFF) === 218)
    // This last size is bigger than we can encode in a byte, so check that we just return 255
    assert((MapStatus.compressSize(1000000000000000000L) & 0xFF) === 255)
  }

  test("decompressSize") {
    assert(MapStatus.decompressSize(0) === 0)
    for (size <- Seq(2L, 10L, 100L, 50000L, 1000000L, 1000000000L)) {
      val size2 = MapStatus.decompressSize(MapStatus.compressSize(size))
      assert(size2 >= 0.99 * size && size2 <= 1.11 * size,
        "size " + size + " decompressed to " + size2 + ", which is out of range")
    }
  }

  test("MapStatus should never report non-empty blocks' sizes as 0") {
    import Math._
    for (
      numSizes <- Seq(1, 10, 100, 1000, 10000);
      mean <- Seq(0L, 100L, 10000L, Int.MaxValue.toLong);
      stddev <- Seq(0.0, 0.01, 0.5, 1.0)
    ) {
      val sizes = Array.fill[Long](numSizes)(abs(round(Random.nextGaussian() * stddev)) + mean)
      val status = MapStatus(BlockManagerId("a", "b", 10), sizes)
      val status1 = compressAndDecompressMapStatus(status)
      for (i <- 0 until numSizes) {
        if (sizes(i) != 0) {
          val failureMessage = s"Failed with $numSizes sizes with mean=$mean, stddev=$stddev"
          assert(status.getSizeForBlock(i) !== 0, failureMessage)
          assert(status1.getSizeForBlock(i) !== 0, failureMessage)
        }
      }
    }
  }

  test("large tasks should use " + classOf[HighlyCompressedMapStatus].getName) {
    val sizes = Array.fill[Long](2001)(150L)
    val status = MapStatus(null, sizes)
    assert(status.isInstanceOf[HighlyCompressedMapStatus])
    assert(status.getSizeForBlock(10) === 150L)
    assert(status.getSizeForBlock(50) === 150L)
    assert(status.getSizeForBlock(99) === 150L)
    assert(status.getSizeForBlock(2000) === 150L)
  }

  test("HighlyCompressedMapStatus: estimated size should be the average non-empty block size") {
    val sizes = Array.tabulate[Long](3000) { i => i.toLong }
    val avg = sizes.sum / sizes.filter(_ != 0).length
    val loc = BlockManagerId("a", "b", 10)
    val status = MapStatus(loc, sizes)
    val status1 = compressAndDecompressMapStatus(status)
    assert(status1.isInstanceOf[HighlyCompressedMapStatus])
    assert(status1.location == loc)
    for (i <- 0 until 3000) {
      val estimate = status1.getSizeForBlock(i)
      if (sizes(i) > 0) {
        assert(estimate === avg)
      }
    }
  }

  def compressAndDecompressMapStatus(status: MapStatus): MapStatus = {
    val ser = new JavaSerializer(new SparkConf)
    val buf = ser.newInstance().serialize(status)
    ser.newInstance().deserialize[MapStatus](buf)
  }
} 
Example 14
Source File: HashShuffleManagerSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.shuffle.hash

import java.io.{File, FileWriter}

import scala.language.reflectiveCalls

import org.scalatest.FunSuite

import org.apache.spark.{SparkEnv, SparkContext, LocalSparkContext, SparkConf}
import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.network.buffer.{FileSegmentManagedBuffer, ManagedBuffer}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.shuffle.FileShuffleBlockManager
import org.apache.spark.storage.{ShuffleBlockId, FileSegment}

class HashShuffleManagerSuite extends FunSuite with LocalSparkContext {
  private val testConf = new SparkConf(false)

  private def checkSegments(expected: FileSegment, buffer: ManagedBuffer) {
    assert(buffer.isInstanceOf[FileSegmentManagedBuffer])
    val segment = buffer.asInstanceOf[FileSegmentManagedBuffer]
    assert(expected.file.getCanonicalPath === segment.getFile.getCanonicalPath)
    assert(expected.offset === segment.getOffset)
    assert(expected.length === segment.getLength)
  }

  test("consolidated shuffle can write to shuffle group without messing existing offsets/lengths") {

    val conf = new SparkConf(false)
    // reset after EACH object write. This is to ensure that there are bytes appended after
    // an object is written. So if the codepaths assume writeObject is end of data, this should
    // flush those bugs out. This was common bug in ExternalAppendOnlyMap, etc.
    conf.set("spark.serializer.objectStreamReset", "1")
    conf.set("spark.serializer", "org.apache.spark.serializer.JavaSerializer")
    conf.set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    sc = new SparkContext("local", "test", conf)

    val shuffleBlockManager =
      SparkEnv.get.shuffleManager.shuffleBlockManager.asInstanceOf[FileShuffleBlockManager]

    val shuffle1 = shuffleBlockManager.forMapTask(1, 1, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle1.writers) {
      writer.write("test1")
      writer.write("test2")
    }
    for (writer <- shuffle1.writers) {
      writer.commitAndClose()
    }

    val shuffle1Segment = shuffle1.writers(0).fileSegment()
    shuffle1.releaseWriters(success = true)

    val shuffle2 = shuffleBlockManager.forMapTask(1, 2, 1, new JavaSerializer(conf),
      new ShuffleWriteMetrics)

    for (writer <- shuffle2.writers) {
      writer.write("test3")
      writer.write("test4")
    }
    for (writer <- shuffle2.writers) {
      writer.commitAndClose()
    }
    val shuffle2Segment = shuffle2.writers(0).fileSegment()
    shuffle2.releaseWriters(success = true)

    // Now comes the test :
    // Write to shuffle 3; and close it, but before registering it, check if the file lengths for
    // the previous task (for shuffle1) are the same as 'segments'. Earlier, we were inferring length
    // of block based on remaining data in file : which could mess things up when there is concurrent read
    // and writes happening to the same shuffle group.

    val shuffle3 = shuffleBlockManager.forMapTask(1, 3, 1, new JavaSerializer(testConf),
      new ShuffleWriteMetrics)
    for (writer <- shuffle3.writers) {
      writer.write("test3")
      writer.write("test4")
    }
    for (writer <- shuffle3.writers) {
      writer.commitAndClose()
    }
    // check before we register.
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffle3.releaseWriters(success = true)
    checkSegments(shuffle2Segment, shuffleBlockManager.getBlockData(ShuffleBlockId(1, 2, 0)))
    shuffleBlockManager.removeShuffle(1)
  }

  def writeToFile(file: File, numBytes: Int) {
    val writer = new FileWriter(file, true)
    for (i <- 0 until numBytes) writer.write(i)
    writer.close()
  }
} 
Example 15
Source File: SortShuffleSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be (
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index"))
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
} 
Example 16
Source File: SerializerFactory.scala    From spark-http-stream   with BSD 2-Clause "Simplified" License
package org.apache.spark.sql.execution.streaming.http

import java.nio.ByteBuffer
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.serializer.DeserializationStream
import org.apache.spark.serializer.SerializationStream
import java.io.OutputStream
import java.io.InputStream
import scala.reflect.ClassTag
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.serializer.KryoSerializer


object SerializerFactory {
	val DEFAULT = new SerializerFactory {
		override def getSerializerInstance(serializerName: String): SerializerInstance = {
			serializerName.toLowerCase() match {
				case "kryo" ⇒
					new KryoSerializer(new SparkConf()).newInstance();
				case "java" ⇒
					new JavaSerializer(new SparkConf()).newInstance();
				case _ ⇒ throw new InvalidSerializerNameException(serializerName);
			}
		}
	}
}

trait SerializerFactory {
	def getSerializerInstance(serializerName: String): SerializerInstance;
} 
Example 17
Source File: HierarchyBuilderSuite.scala    From HANAVora-Extensions   with Apache License 2.0
package org.apache.spark.sql.hierarchy

import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.types._
import org.apache.spark.sql.types.Node
import org.scalatest.FunSuite

class HierarchyBuilderSuite extends FunSuite {

  val N = 5
  val rowFunctions = HierarchyRowFunctions(Seq.fill(N)(StringType))

  test("HierarchyRowFunctions.rowGet") {
    for (i <- 0 to 5) {
      val row = Row((0 to 5).map(_.toString): _*)
      assertResult(i.toString)(rowFunctions.rowGet(i)(row))
    }
  }

  test("HierarchyRowFunctions.rowInit") {
    for (i <- 0 to 5) {
      val row = Row((0 to 5).map(_.toString): _*)

      val result = rowFunctions.rowInit(rowFunctions.rowGet(i), StringType)(row, None)
      val expected = Row(row.toSeq :+ Node(List(i.toString), StringType): _*)
      assertResult(expected)(result)
    }
  }

  // scalastyle:off magic.number
  test("HierarchyRowFunctions.rowInitWithOrder") {
    for (i <- 0 to 5) {
      val row = Row((0 to 5).map(_.toString): _*)
      val result = rowFunctions.rowInit(rowFunctions.rowGet(i), StringType)(row, Some(42L))
      val expected = Row(row.toSeq :+ Node(List(i.toString),StringType, ordPath = List(42L)): _*)
      assertResult(expected)(result)
    }
  }
  // scalastyle:on magic.number

  test("HierarchyRowFunctions.rowModify") {
    for (i <- 0 to 5) {
      val rightRow = Row(0 to 5: _*)
      val leftRow = Row("foo", 0, "bar", Node(List(0),StringType))
      val result = rowFunctions.rowModify(
        rowFunctions.rowGet(i),StringType
      )(leftRow, rightRow)
      val expected = Row((0 to 5) :+ Node(List(0, i), StringType): _*)
      assertResult(expected)(result)
    }
  }

  // scalastyle:off magic.number
  test("HierarchyRowFunctions.rowModifyAndOrder") {
    for (i <- 0 to 5) {
      val rightRow = Row(0 to 5: _*)
      val leftRow = Row("foo", 0, "bar", Node(List(0),StringType))
      val result = rowFunctions.rowModifyAndOrder(
        rowFunctions.rowGet(i), StringType
      )(leftRow, rightRow, Some(42L))
      val expected = Row((0 to 5) :+ Node(List(0, i), StringType, ordPath = List(42L)): _*)
      assertResult(expected)(result)
    }
  }
  // scalastyle:on magic.number

  test("HierarchyBuilder closure is serializable") {
    val closureSerializer = new JavaSerializer(new SparkConf(loadDefaults = false)).newInstance()
    val serialized = closureSerializer.serialize(() =>
      HierarchyJoinBuilder(null, null, null, null, null, null))
  }

  test("HierarchyRowFunctions closure is serializable") {
    val closureSerializer = new JavaSerializer(new SparkConf(loadDefaults = false)).newInstance()
    val serialized = closureSerializer.serialize(() =>
      HierarchyRowJoinBuilder(null, null, null, null))
  }

} 
Example 18
Source File: SerializationSuite.scala    From XSQL   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.sql.test.SharedSQLContext

class SerializationSuite extends SparkFunSuite with SharedSQLContext {

  test("[SPARK-5235] SQLContext should be serializable") {
    val spark = SparkSession.builder.getOrCreate()
    new JavaSerializer(new SparkConf()).newInstance().serialize(spark.sqlContext)
  }

  test("[SPARK-26409] SQLConf should be serializable") {
    val spark = SparkSession.builder.getOrCreate()
    new JavaSerializer(new SparkConf()).newInstance().serialize(spark.sessionState.conf)
  }
} 
Example 19
Source File: HBasePartitioner.scala    From Backup-Repo   with Apache License 2.0
package org.apache.spark.sql.hbase

import java.io.{IOException, ObjectInputStream, ObjectOutputStream}

import org.apache.hadoop.hbase.util.Bytes
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.util.{CollectionsUtils, Utils}
import org.apache.spark.{Partitioner, SparkEnv}

object HBasePartitioner {
  implicit object HBaseRawOrdering extends Ordering[HBaseRawType] {
    def compare(a: HBaseRawType, b: HBaseRawType) = Bytes.compareTo(a, b)
  }
}

class HBasePartitioner (var splitKeys: Array[HBaseRawType]) extends Partitioner {
  import HBasePartitioner.HBaseRawOrdering

  type t = HBaseRawType

  lazy private val len = splitKeys.length

  // For pre-split table splitKeys(0) = bytes[0], to remove it,
  // otherwise partition 0 always be empty and
  // we will miss the last region's date when bulk load
  lazy private val realSplitKeys = if (splitKeys.isEmpty) splitKeys else splitKeys.tail

  def numPartitions = if (len == 0) 1 else len

  @transient private val binarySearch: ((Array[t], t) => Int) = CollectionsUtils.makeBinarySearch[t]

  def getPartition(key: Any): Int = {
    val k = key.asInstanceOf[t]
    var partition = 0
    if (len <= 128 && len > 0) {
      // If we have less than 128 partitions naive search
      val ordering = implicitly[Ordering[t]]
      while (partition < realSplitKeys.length && ordering.gt(k, realSplitKeys(partition))) {
        partition += 1
      }
    } else {
      // Determine which binary search method to use only once.
      partition = binarySearch(realSplitKeys, k)
      // binarySearch either returns the match location or -[insertion point]-1
      if (partition < 0) {
        partition = -partition - 1
      }
      if (partition > realSplitKeys.length) {
        partition = realSplitKeys.length
      }
    }
    partition
  }

  override def equals(other: Any): Boolean = other match {
    case r: HBasePartitioner =>
      r.splitKeys.sameElements(splitKeys)
    case _ =>
      false
  }

  override def hashCode(): Int = {
    val prime = 31
    var result = 1
    var i = 0
    while (i < splitKeys.length) {
      result = prime * result + splitKeys(i).hashCode
      i += 1
    }
    result = prime * result
    result
  }
} 
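The getPartition logic above is ordinary range partitioning over sorted split keys: a linear scan when there are few splits, a binary search otherwise, with the insertion point used when the key is not an exact match. A minimal standalone sketch of the same idea over Int keys (the key type and split values are illustrative assumptions, not the HBaseRawType byte arrays used above):

object RangePartitionSketch {
  // splitKeys must be sorted. Keys <= splitKeys(0) map to partition 0, keys in
  // (splitKeys(i - 1), splitKeys(i)] map to partition i, and larger keys to the last partition.
  def getPartition(key: Int, splitKeys: Array[Int]): Int = {
    var partition = java.util.Arrays.binarySearch(splitKeys, key)
    if (partition < 0) partition = -partition - 1   // insertion point when the key is absent
    math.min(partition, splitKeys.length)
  }

  def main(args: Array[String]): Unit = {
    val splits = Array(10, 20, 30)   // 4 partitions: (-inf, 10], (10, 20], (20, 30], (30, +inf)
    assert(getPartition(5, splits) == 0)
    assert(getPartition(10, splits) == 0)
    assert(getPartition(15, splits) == 1)
    assert(getPartition(99, splits) == 3)
  }
}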
Example 20
Source File: HashMapParam.scala    From incubator-s2graph   with Apache License 2.0
package org.apache.s2graph.spark.spark

import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.{AccumulableParam, SparkConf}

import scala.collection.mutable.{HashMap => MutableHashMap}


// NOTE: the class header and MapType alias below are reconstructed from the companion
// object and the analogous cuesheet version in Example 1; the original file also defines
// addAccumulator and addInPlace.
class HashMapParam[K, V](op: (V, V) => V) extends AccumulableParam[MutableHashMap[K, V], (K, V)] {

  type MapType = MutableHashMap[K, V]

  def zero(initialValue: MapType): MapType = {
    val ser = new JavaSerializer(new SparkConf(false)).newInstance()
    val copy = ser.deserialize[MapType](ser.serialize(initialValue))
    copy.clear()
    copy
  }
}

object HashMapParam {
  def apply[K, V](op: (V, V) => V): HashMapParam[K, V] = {
    new HashMapParam[K, V](op)
  }
}