org.apache.spark.serializer.KryoSerializer Scala Example

Source File: SortShuffleSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    // Once 'spark.local.dir' is set, it is cached. Unless this is manually cleared
    // before/after a test, it could return the same directory even if this property
    // is configured.
    Utils.clearLocalRootDirs()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
      Utils.clearLocalRootDirs()
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
    Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}

Source File: RawTextSender.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}

Source File: PythonBroadcastSuite.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: RawTextSender.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    //解析使用模式匹配的参数
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    //多次重复输入数据以填充缓冲区
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}

Source File: PythonBroadcastSuite.scala From spark1.52 with Apache License 2.0

5 votes

package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: LabeledPointSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.serializer.KryoSerializer

class LabeledPointSuite extends SparkFunSuite {
  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0)))
    val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0)))

    Seq(labeled1, labeled2).foreach { l =>
      val l2 = ser.deserialize[LabeledPoint](ser.serialize(l))
      assert(l === l2)
    }
  }
}

Source File: InstanceSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.ml.feature

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.serializer.KryoSerializer

class InstanceSuite extends SparkFunSuite{
  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val instance1 = Instance(19.0, 2.0, Vectors.dense(1.0, 7.0))
    val instance2 = Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(instance1, instance2).foreach { i =>
      val i2 = ser.deserialize[Instance](ser.serialize(i))
      assert(i === i2)
    }

    val oInstance1 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0))
    val oInstance2 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(oInstance1, oInstance2).foreach { o =>
      val o2 = ser.deserialize[OffsetInstance](ser.serialize(o))
      assert(o === o2)
    }
  }
}

Source File: TreePointSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.ml.tree.impl

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer

class TreePointSuite extends SparkFunSuite {
  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val point = new TreePoint(1.0, Array(1, 2, 3))
    val point2 = ser.deserialize[TreePoint](ser.serialize(point))
    assert(point.label === point2.label)
    assert(point.binnedFeatures === point2.binnedFeatures)
  }
}

Source File: LabeledPointSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.mllib.regression

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint}
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.serializer.KryoSerializer

class LabeledPointSuite extends SparkFunSuite {

  test("parse labeled points") {
    val points = Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0))))
    points.foreach { p =>
      assert(p === LabeledPoint.parse(p.toString))
    }
  }

  test("parse labeled points with whitespaces") {
    val point = LabeledPoint.parse("(0.0, [1.0, 2.0])")
    assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0)))
  }

  test("parse labeled points with v0.9 format") {
    val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0")
    assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0)))
  }

  test("conversions between new ml LabeledPoint and mllib LabeledPoint") {
    val points: Seq[LabeledPoint] = Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 0.0)),
      LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0))))

    val newPoints: Seq[NewLabeledPoint] = points.map(_.asML)

    points.zip(newPoints).foreach { case (p1, p2) =>
      assert(p1 === LabeledPoint.fromML(p2))
    }
  }

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0)))
    val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0)))

    Seq(labeled1, labeled2).foreach { l =>
      val l2 = ser.deserialize[LabeledPoint](ser.serialize(l))
      assert(l === l2)
    }
  }
}

Source File: RawTextSender.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}

Source File: PythonBroadcastSuite.scala From Spark-2.3.1 with Apache License 2.0

5 votes

package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: StoragePerfTester.scala From iolap with Apache License 2.0

5 votes

package org.apache.spark.tools

import java.util.concurrent.{CountDownLatch, Executors}
import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.shuffle.hash.HashShuffleManager
import org.apache.spark.util.Utils


    val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500)

    val recordLength = 1000 // ~1KB records
    val totalRecords = dataSizeMb * 1000
    val recordsPerMap = totalRecords / numMaps

    val writeKey = "1" * (recordLength / 2)
    val writeValue = "1" * (recordLength / 2)
    val executor = Executors.newFixedThreadPool(numMaps)

    val conf = new SparkConf()
      .set("spark.shuffle.compress", "false")
      .set("spark.shuffle.sync", "true")
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    // This is only used to instantiate a BlockManager. All thread scheduling is done manually.
    val sc = new SparkContext("local[4]", "Write Tester", conf)
    val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager]

    def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = {
      val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits,
        new KryoSerializer(sc.conf), new ShuffleWriteMetrics())
      val writers = shuffle.writers
      for (i <- 1 to recordsPerMap) {
        writers(i % numOutputSplits).write(writeKey, writeValue)
      }
      writers.map { w =>
        w.commitAndClose()
        total.addAndGet(w.fileSegment().length)
      }

      shuffle.releaseWriters(true)
    }

    val start = System.currentTimeMillis()
    val latch = new CountDownLatch(numMaps)
    val totalBytes = new AtomicLong()
    for (task <- 1 to numMaps) {
      executor.submit(new Runnable() {
        override def run(): Unit = {
          try {
            writeOutputBytes(task, totalBytes)
            latch.countDown()
          } catch {
            case e: Exception =>
              println("Exception in child thread: " + e + " " + e.getMessage)
              System.exit(1)
          }
        }
      })
    }
    latch.await()
    val end = System.currentTimeMillis()
    val time = (end - start) / 1000.0
    val bytesPerSecond = totalBytes.get() / time
    val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong

    System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits))
    System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile)))
    System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong)))

    executor.shutdown()
    sc.stop()
  }
}

Source File: SparkSqlSerializer.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.sql.execution

import java.nio.ByteBuffer
import java.util.{HashMap => JavaHashMap}

import scala.reflect.ClassTag
import com.esotericsoftware.kryo.io.{Input, Output}
import com.esotericsoftware.kryo.{Kryo, Serializer}
import com.twitter.chill.ResourcePool
import org.apache.spark.serializer.{KryoSerializer, SerializerInstance}
import org.apache.spark.sql.types.{Decimal, StructField, StructType}
import org.apache.spark.util.MutablePair
import org.apache.spark.{SparkConf, SparkEnv}


//private[sql]
class SparkSqlSerializer(conf: SparkConf) extends KryoSerializer(conf) {
  override def newKryo(): Kryo = {
    val kryo = super.newKryo()
    kryo.setRegistrationRequired(false)
    kryo.register(classOf[MutablePair[_, _]])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericMutableRow])
    kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer)
    kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer)

    kryo.register(classOf[Decimal])
    kryo.register(classOf[JavaHashMap[_, _]])

    // APS
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])

    kryo.setReferences(false)
    kryo
  }
}

private[execution] class KryoResourcePool(size: Int)
  extends ResourcePool[SerializerInstance](size) {

  val ser: SparkSqlSerializer = {
    val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
    new SparkSqlSerializer(sparkConf)
  }

  def newInstance(): SerializerInstance = ser.newInstance()
}

//private[sql]
object SparkSqlSerializer {
  @transient lazy val resourcePool = new KryoResourcePool(30)

  private[this] def acquireRelease[O](fn: SerializerInstance => O): O = {
    val kryo = resourcePool.borrow
    try {
      fn(kryo)
    } finally {
      resourcePool.release(kryo)
    }
  }

  def serialize[T: ClassTag](o: T): Array[Byte] =
    acquireRelease { k =>
      k.serialize(o).array()
    }

  def deserialize[T: ClassTag](bytes: Array[Byte]): T =
    acquireRelease { k =>
      k.deserialize[T](ByteBuffer.wrap(bytes))
    }
}

private[sql] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal) {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

private[sql] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: BigDecimal) {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

Source File: StoragePerfTester.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.tools

import java.util.concurrent.{CountDownLatch, Executors}
import java.util.concurrent.atomic.AtomicLong

import org.apache.spark.executor.ShuffleWriteMetrics
import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.shuffle.hash.HashShuffleManager
import org.apache.spark.util.Utils


    val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500)

    val recordLength = 1000 // ~1KB records
    val totalRecords = dataSizeMb * 1000
    val recordsPerMap = totalRecords / numMaps

    val writeKey = "1" * (recordLength / 2)
    val writeValue = "1" * (recordLength / 2)
    val executor = Executors.newFixedThreadPool(numMaps)

    val conf = new SparkConf()
      .set("spark.shuffle.compress", "false")
      .set("spark.shuffle.sync", "true")
      .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager")

    // This is only used to instantiate a BlockManager. All thread scheduling is done manually.
    val sc = new SparkContext("local[4]", "Write Tester", conf)
    val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager]

    def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = {
      val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits,
        new KryoSerializer(sc.conf), new ShuffleWriteMetrics())
      val writers = shuffle.writers
      for (i <- 1 to recordsPerMap) {
        writers(i % numOutputSplits).write(writeKey, writeValue)
      }
      writers.map { w =>
        w.commitAndClose()
        total.addAndGet(w.fileSegment().length)
      }

      shuffle.releaseWriters(true)
    }

    val start = System.currentTimeMillis()
    val latch = new CountDownLatch(numMaps)
    val totalBytes = new AtomicLong()
    for (task <- 1 to numMaps) {
      executor.submit(new Runnable() {
        override def run(): Unit = {
          try {
            writeOutputBytes(task, totalBytes)
            latch.countDown()
          } catch {
            case e: Exception =>
              // scalastyle:off println
              println("Exception in child thread: " + e + " " + e.getMessage)
              // scalastyle:on println
              System.exit(1)
          }
        }
      })
    }
    latch.await()
    val end = System.currentTimeMillis()
    val time = (end - start) / 1000.0
    val bytesPerSecond = totalBytes.get() / time
    val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong

    // scalastyle:off println
    System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits))
    System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile)))
    System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong)))
    // scalastyle:on println

    executor.shutdown()
    sc.stop()
  }
}

Source File: RawTextSender.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}

Source File: PythonBroadcastSuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: SortShuffleSuite.scala From BigDatalog with Apache License 2.0

5 votes

package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
    Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}

Source File: ReadingWritingData.scala From Spark-RSVD with Apache License 2.0

5 votes

package com.criteo.rsvd

import java.nio.ByteBuffer

import com.esotericsoftware.kryo.Kryo
import com.typesafe.scalalogging.slf4j.StrictLogging
import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

object ReadingWritingData extends StrictLogging {

  def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = {
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val path = new Path(inputPathPattern)
    (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt
  }

  def loadMatrixEntries(inputPath: String,
                        singlePartitionSizeMB: Int,
                        sc: SparkContext): RDD[MatrixEntry] = {

    logger.info(s"Input matrix path: $inputPath")
    val inputDataSizeMB = getInputDataSizeMB(inputPath + "
  def makeRddFromKryoFile[T: ClassTag](
      sc: SparkContext,
      path: String,
      minPartitionsOpt: Option[Int] = None): RDD[T] = {
    val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions)
    val serializer = new KryoSerializer(sc.getConf)
    sc.sequenceFile(path,
                    classOf[NullWritable],
                    classOf[BytesWritable],
                    minPartitions)
      .mapPartitions { it =>
        val instance = serializer.newInstance()
        it.flatMap {
          case (_, v) =>
            instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes))
        }
      }
  }

  object RandomizedSVDKryoRegistrator extends KryoRegistrator {

    def registerClasses(kryo: Kryo): Unit = {
      UnmodifiableCollectionsSerializer.registerSerializers(kryo)
      kryo.register(classOf[MatrixEntry])
      kryo.register(classOf[Array[MatrixEntry]])
    }
  }

  def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf =
    appendRegistratorToSparkConf(sparkConf,
                                 RandomizedSVDKryoRegistrator.getClass.getName)

  def appendRegistratorToSparkConf(sparkConf: SparkConf,
                                   registratorName: String): SparkConf = {
    val oldValue = sparkConf.get("spark.kryo.registrator", "")
    if (oldValue == "") {
      sparkConf.set("spark.kryo.registrator", registratorName)
    } else {
      sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName)
    }
  }

}

Source File: LabeledPointSuite.scala From sona with Apache License 2.0

5 votes

package com.tencent.angel.sona.ml.feature

import org.apache.spark.linalg.Vectors
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer

class LabeledPointSuite extends SparkFunSuite {

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(
      Array(classOf[scala.collection.mutable.WrappedArray.ofRef[_]],
        classOf[LabeledPoint]))
//    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0)))
    val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0)))

    Seq(labeled1, labeled2).foreach { l =>
      val l2 = ser.deserialize[LabeledPoint](ser.serialize(l))
      assert(l === l2)
    }
  }
}

Source File: InstanceSuite.scala From sona with Apache License 2.0

5 votes

package com.tencent.angel.sona.ml.feature

import org.apache.spark.linalg.Vectors
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer

class InstanceSuite extends SparkFunSuite{

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(
      Array(classOf[scala.collection.mutable.WrappedArray.ofRef[_]],
        classOf[Instance]))
//    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val instance1 = Instance(19.0, 2.0, Vectors.dense(1.0, 7.0))
    val instance2 = Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(instance1, instance2).foreach { i =>
      val i2 = ser.deserialize[Instance](ser.serialize(i))
      assert(i === i2)
    }

    val oInstance1 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0))
    val oInstance2 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(oInstance1, oInstance2).foreach { o =>
      val o2 = ser.deserialize[OffsetInstance](ser.serialize(o))
      assert(o === o2)
    }
  }
}

Source File: ByteRangesTest.scala From spark-bam with Apache License 2.0

5 votes

package org.hammerlab.args

import hammerlab.bytes._
import org.apache.spark.serializer.KryoSerializer
import org.hammerlab.spark.confs
import org.hammerlab.spark.test.serde.KryoSerialization.kryoBytes
import org.hammerlab.spark.test.suite.SparkSuite
import org.hammerlab.test.serde.JavaSerialization._

class ByteRangesTest
  extends SparkSuite
    with confs.Kryo {

  override def registrationRequired: Boolean = false

  test("kryo serialization") {
    val ks = new KryoSerializer(sc.getConf)
    implicit val kryo = ks.newKryo()

    val bytes = kryoBytes(ByteRanges(Seq(Endpoints(10.MB, 20.MB))))
    bytes.length should be(69)
  }

  test("java serde") {
    val byteRanges = ByteRanges(Seq(Endpoints(10.MB, 20.MB)))
    javaRead[ByteRanges](javaBytes(byteRanges)) should be(byteRanges)
  }
}

Source File: ConfsTest.scala From spark-util with Apache License 2.0

5 votes

package org.hammerlab.spark

import org.apache.spark.serializer.KryoSerializer

import scala.collection.mutable

class ConfsTest
  extends ContextSuite
    with confs.DynamicAllocation
    with confs.EventLog
    with confs.Speculation {

  val eventLogDir = tmpDir().toString

  sparkConf(
    "spark.eventLog.dir" → eventLogDir
  )

  register(
    classOf[Array[String]],
    classOf[mutable.WrappedArray.ofRef[_]],
    classOf[Foo]
  )

  test("make SparkContext") {
    conf.get("spark.serializer") should be(classOf[KryoSerializer].getCanonicalName)
    conf.get("spark.dynamicAllocation.enabled") should be("true")
    conf.get("spark.eventLog.enabled") should be("true")
    conf.get("spark.eventLog.dir") should be(eventLogDir)
    conf.get("spark.speculation") should be("true")

    val strings = Array("a", "b", "c", "d")
    val fooBroadcast = sc.broadcast(Foo("x"))
    val rdd = sc.parallelize(strings)
    rdd.count should be(4)
    rdd.map(_ + fooBroadcast.value.s).collect should be(Array("ax", "bx", "cx", "dx"))
  }
}

case class Foo(s: String)

Source File: PythonBroadcastSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: SortShuffleSuite.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
    Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}

Source File: ShapeLuceneRDDKryoRegistrator.scala From spark-lucenerdd with Apache License 2.0

5 votes

package org.zouzias.spark.lucenerdd.spatial.shape

import com.twitter.algebird.TopK
import com.twitter.chill.Kryo
import org.apache.spark.SparkConf
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc}
import org.zouzias.spark.lucenerdd.spatial.shape.partition.ShapeLuceneRDDPartition


class ShapeLuceneRDDKryoRegistrator extends KryoRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[ShapeLuceneRDD[_, _]])
    kryo.register(classOf[ShapeLuceneRDDPartition[_, _]])
    kryo.register(classOf[Number])
    kryo.register(classOf[java.lang.Double])
    kryo.register(classOf[java.lang.Float])
    kryo.register(classOf[java.lang.Integer])
    kryo.register(classOf[java.lang.Long])
    kryo.register(classOf[java.lang.Short])
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])
    kryo.register(classOf[IntegerType])
    kryo.register(classOf[IntegerType$])
    kryo.register(classOf[DoubleType])
    kryo.register(classOf[DoubleType$])
    kryo.register(classOf[FloatType])
    kryo.register(classOf[StringType])
    kryo.register(classOf[StringType$])
    kryo.register(classOf[GenericRowWithSchema])
    kryo.register(classOf[Metadata])
    kryo.register(classOf[Object])
    kryo.register(classOf[Array[Object]])
    kryo.register(classOf[Array[Array[Byte]]])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofRef])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofFloat])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofDouble])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofInt])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofLong])
    kryo.register(classOf[Array[String]])
    kryo.register(classOf[Array[Number]])
    kryo.register(classOf[Array[Float]])
    kryo.register(classOf[Array[Int]])
    kryo.register(classOf[Array[Long]])
    kryo.register(classOf[Array[Double]])
    kryo.register(classOf[Array[Boolean]])
    kryo.register(classOf[Array[SparkScoreDoc]])
    kryo.register(classOf[Array[StructType]])
    kryo.register(classOf[Array[StructField]])
    kryo.register(classOf[Range])
    kryo.register(classOf[scala.collection.immutable.Map[String, String]])
    kryo.register(classOf[scala.collection.immutable.Map[String, Number]])
    kryo.register(classOf[scala.collection.immutable.Map$EmptyMap$])
    kryo.register(classOf[scala.collection.immutable.Set$EmptySet$])
    kryo.register(classOf[scala.collection.immutable.Map[_, _]])
    kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]])
    kryo.register(classOf[SparkFacetResult])
    kryo.register(classOf[SparkScoreDoc])
    kryo.register(classOf[TopK[_]])

    ()
  }
}


  }
}

Source File: LuceneRDDKryoRegistrator.scala From spark-lucenerdd with Apache License 2.0

5 votes

package org.zouzias.spark.lucenerdd

import com.twitter.algebird.TopK
import com.twitter.chill.Kryo
import org.apache.spark.SparkConf
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD
import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc}
import org.zouzias.spark.lucenerdd.partition.LuceneRDDPartition
import org.zouzias.spark.lucenerdd.response.{LuceneRDDResponse, LuceneRDDResponsePartition}
import org.zouzias.spark.lucenerdd.testing.{FavoriteCaseClass, Person}

class LuceneRDDKryoRegistrator extends KryoRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[LuceneRDD[_]])
    kryo.register(classOf[LuceneRDDPartition[_]])
    kryo.register(classOf[FacetedLuceneRDD[_]])
    kryo.register(classOf[Number])
    kryo.register(classOf[java.lang.Double])
    kryo.register(classOf[java.lang.Float])
    kryo.register(classOf[java.lang.Integer])
    kryo.register(classOf[java.lang.Long])
    kryo.register(classOf[java.lang.Short])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofRef])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofFloat])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofDouble])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofInt])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofLong])
    kryo.register(classOf[Array[String]])
    kryo.register(classOf[Array[Number]])
    kryo.register(classOf[Array[Float]])
    kryo.register(classOf[Array[Int]])
    kryo.register(classOf[Array[Long]])
    kryo.register(classOf[Array[Double]])
    kryo.register(classOf[Array[Boolean]])
    kryo.register(classOf[Range])
    kryo.register(classOf[scala.collection.immutable.Map[String, String]])
    kryo.register(classOf[scala.collection.immutable.Map[String, Number]])
    kryo.register(classOf[scala.collection.immutable.Map$EmptyMap$])
    kryo.register(classOf[scala.collection.immutable.Set$EmptySet$])
    kryo.register(classOf[scala.collection.immutable.Map[_, _]])
    kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]])
    kryo.register(classOf[SparkFacetResult])
    kryo.register(classOf[SparkScoreDoc])
    kryo.register(classOf[LuceneRDDResponse])
    kryo.register(classOf[LuceneRDDResponsePartition])
    kryo.register(classOf[TopK[_]])
    kryo.register(classOf[FavoriteCaseClass]) 
  }
}

Source File: MatfastSerializer.scala From MatRel with Apache License 2.0

5 votes

package org.apache.spark.sql.matfast.util

import java.math.BigDecimal
import java.nio.ByteBuffer
import java.util.{HashMap => JavaHashMap}

import scala.reflect.ClassTag

import com.esotericsoftware.kryo.{Kryo, Serializer}
import com.esotericsoftware.kryo.io.{Input, Output}
import com.twitter.chill.ResourcePool

import org.apache.spark.{SparkConf, SparkEnv}
import org.apache.spark.serializer.{KryoSerializer, SerializerInstance}
import org.apache.spark.sql.matfast.matrix._
import org.apache.spark.sql.types.Decimal
import org.apache.spark.util.MutablePair


private[matfast] class MatfastSerializer(conf: SparkConf) extends KryoSerializer(conf) {
  override def newKryo(): Kryo = {
    val kryo = super.newKryo()
    kryo.setRegistrationRequired(false)
    kryo.register(classOf[MutablePair[_, _]])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow])
    kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer)
    kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer)

    kryo.register(classOf[Decimal])
    kryo.register(classOf[JavaHashMap[_, _]])
    kryo.register(classOf[DenseMatrix])
    kryo.register(classOf[SparseMatrix])

    kryo.setReferences(false)
    kryo
  }
}

private[matfast] class KryoResourcePool(size: Int) extends ResourcePool[SerializerInstance](size) {
  val ser: MatfastSerializer = {
    val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
    new MatfastSerializer(sparkConf)
  }

  def newInstance(): SerializerInstance = ser.newInstance()
}

private[matfast] object MatfastSerializer {
  @transient lazy val resourcePool = new KryoResourcePool(50)

  private[this] def acquireRelease[O](fn: SerializerInstance => O): O = {
    val kryo = resourcePool.borrow()
    try {
      fn(kryo)
    } finally {
      resourcePool.release(kryo)
    }
  }

  def serialize[T: ClassTag](o: T): Array[Byte] = {
    acquireRelease { k =>
      k.serialize(o).array()
    }
  }

  def deserialize[T: ClassTag](bytes: Array[Byte]): T =
    acquireRelease { k =>
      k.deserialize[T](ByteBuffer.wrap(bytes))
    }
}

private[matfast] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal) {
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

private[matfast] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: BigDecimal): Unit = {
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

Source File: utils.scala From spark-http-stream with BSD 2-Clause "Simplified" License

5 votes

package org.apache.spark.sql.execution.streaming.http

import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.SparkConf
import org.apache.commons.io.IOUtils
import org.apache.spark.serializer.KryoSerializer
import java.io.InputStream
import com.esotericsoftware.kryo.io.Input
import java.io.ByteArrayOutputStream

class WrongArgumentException(name: String, value: Any)
		extends RuntimeException(s"wrong argument: $name=$value") {
}

class MissingRequiredArgumentException(map: Map[String, String], paramName: String)
		extends RuntimeException(s"missing required argument: $paramName, all parameters=$map") {
}

class InvalidSerializerNameException(serializerName: String)
		extends RuntimeException(s"invalid serializer name: $serializerName") {
}

object SchemaUtils {
	def buildSchema(schema: StructType, includesTimestamp: Boolean, timestampColumnName: String = "_TIMESTAMP_"): StructType = {
		if (!includesTimestamp)
			schema;
		else
			StructType(schema.fields.toSeq :+ StructField(timestampColumnName, TimestampType, false));
	}
}

object Params {
	
	def deserialize(bytes: Array[Byte]): Any = {
		val kryo = kryoSerializer.newKryo();
		val input = new Input();
		input.setBuffer(bytes);
		kryo.readClassAndObject(input);
	}
}

Source File: SerializerFactory.scala From spark-http-stream with BSD 2-Clause "Simplified" License

5 votes

package org.apache.spark.sql.execution.streaming.http

import java.nio.ByteBuffer
import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.serializer.DeserializationStream
import org.apache.spark.serializer.SerializationStream
import java.io.OutputStream
import java.io.InputStream
import scala.reflect.ClassTag
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.serializer.KryoSerializer


object SerializerFactory {
	val DEFAULT = new SerializerFactory {
		override def getSerializerInstance(serializerName: String): SerializerInstance = {
			serializerName.toLowerCase() match {
				case "kryo" ⇒
					new KryoSerializer(new SparkConf()).newInstance();
				case "java" ⇒
					new JavaSerializer(new SparkConf()).newInstance();
				case _ ⇒ throw new InvalidSerializerNameException(serializerName);
			}
		}
	}
}

trait SerializerFactory {
	def getSerializerInstance(serializerName: String): SerializerInstance;
}

Source File: UtilsTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License

5 votes

import java.sql.Date

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.SparkSession
import org.junit.Assert
import org.junit.Test
import java.io.ByteArrayOutputStream
import java.io.InputStream
import org.apache.commons.io.IOUtils
import com.esotericsoftware.kryo.io.Input
import org.apache.spark.sql.execution.streaming.http.KryoSerializerUtils

class UtilsTest {
	@Test
	def testKryoSerDe() {
		val d1 = new Date(30000);
		val bytes = KryoSerializerUtils.serialize(d1);
		val d2 = KryoSerializerUtils.deserialize(bytes);
		Assert.assertEquals(d1, d2);

		val d3 = Map('x' -> Array("aaa", "bbb"), 'y' -> Array("ccc", "ddd"));
		println(d3);
		val bytes2 = KryoSerializerUtils.serialize(d3);
		val d4 = KryoSerializerUtils.deserialize(bytes2).asInstanceOf[Map[String, Any]];
		println(d4);
	}

	@Test
	def testEncoderSchema() {
		val spark = SparkSession.builder.master("local[4]")
			.getOrCreate();
		val sqlContext = spark.sqlContext;
		import sqlContext.implicits._
		import org.apache.spark.sql.catalyst.encoders.encoderFor
		val schema1 = encoderFor[String].schema;
		val schema2 = encoderFor[(String)].schema;
		val schema3 = encoderFor[((String))].schema;

		Assert.assertEquals(schema1, schema2);
		Assert.assertEquals(schema1, schema3);
	}

	@Test
	def testDateInTuple() {
		val spark = SparkSession.builder.master("local[4]")
			.getOrCreate();
		val sqlContext = spark.sqlContext;
		import sqlContext.implicits._

		val d1 = new Date(30000);
		val ds = sqlContext.createDataset(Seq[(Int, Date)]((1, d1)));
		val d2 = ds.collect()(0)._2;

		//NOTE: d1!=d2, maybe a bug
		println(d1.equals(d2));
	}
}

Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License

5 votes

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException


class HttpStreamServerClientTest {
	val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
		Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
		Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

	val ROWS2 = Array(Row("hello"),
		Row("world"),
		Row("bye"),
		Row("world"));

	@Test
	def testHttpStreamIO() {
		//starts a http server
		val kryoSerializer = new KryoSerializer(new SparkConf());
		val server = HttpStreamServer.start("/xxxx", 8080);

		val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
			.getOrCreate();
		spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

		val sqlContext = spark.sqlContext;
		import spark.implicits._
		//add a local message buffer to server, with 2 topics registered
		server.withBuffer()
			.addListener(new StreamPrinter())
			.createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
			.createTopic[String]("topic-2");

		val client = HttpStreamClient.connect("http://localhost:8080/xxxx");
		//tests schema of topics
		val schema1 = client.fetchSchema("topic-1");
		Assert.assertArrayEquals(Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
			schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		val schema2 = client.fetchSchema("topic-2");
		Assert.assertArrayEquals(Array[Object](StringType),
			schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

		//prepare to consume messages
		val sid1 = client.subscribe("topic-1")._1;
		val sid2 = client.subscribe("topic-2")._1;

		//produces some data
		client.sendRows("topic-1", 1, ROWS1);

		val sid4 = client.subscribe("topic-1")._1;
		val sid5 = client.subscribe("topic-2")._1;

		client.sendRows("topic-2", 1, ROWS2);

		//consumes data
		val fetched = client.fetchStream(sid1).map(_.originalRow);
		Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
		//it is empty now
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
		Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

		client.unsubscribe(sid4);
		try {
			client.fetchStream(sid4);
			//exception should be thrown, because subscriber id is invalidated
			Assert.assertTrue(false);
		}
		catch {
			case e: Throwable ⇒
				e.printStackTrace();
				Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
		}

		server.stop();
	}
}

Source File: Kryo.scala From spark-util with Apache License 2.0

5 votes

package org.hammerlab.spark.confs

import org.apache.spark.serializer.{ KryoRegistrator, KryoSerializer }
import org.hammerlab.kryo.Registrar
import org.hammerlab.spark.SparkConfBase

import scala.reflect.ClassTag

case class UserRegistrar(name: String)

object UserRegistrar {

  implicit def fromInstance[T <: KryoRegistrator](t: T): UserRegistrar =
    UserRegistrar(t.getClass.getName)

  implicit def romClass[T <: KryoRegistrator](cls: Class[T]): UserRegistrar =
    UserRegistrar(cls.getName)

  implicit def fromClassTag[T <: KryoRegistrator](implicit ct: ClassTag[T]): UserRegistrar =
    UserRegistrar(ct.runtimeClass.getName)
}

trait Kryo
  extends SparkConfBase
    with Registrar {

  def registrationRequired: Boolean = true
  def referenceTracking: Boolean = false

  def registrar(userRegistrar: UserRegistrar): Unit =
    sparkConf(
      "spark.kryo.registrator" → userRegistrar.name
    )

  def registrar[T <: KryoRegistrator](implicit ct: ClassTag[T]): Unit =
    registrar(UserRegistrar.fromClassTag(ct))

  sparkConf(
    "spark.serializer" → classOf[KryoSerializer].getName,
    "spark.kryo.referenceTracking" → referenceTracking.toString,
    "spark.kryo.registrationRequired" → registrationRequired.toString
  )
}

Source File: RawTextSender.scala From drizzle-spark with Apache License 2.0

5 votes

package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}

Source File: RawTextSender.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}

Source File: PythonBroadcastSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: SortShuffleSuite.scala From sparkoscope with Apache License 2.0

5 votes

package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
    Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}

Source File: PythonBroadcastSuite.scala From SparkCore with Apache License 2.0

5 votes

package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.{Matchers, FunSuite}

import org.apache.spark.{SharedSparkContext, SparkConf}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: TimeSeriesKryoRegistrator.scala From spark-timeseries with Apache License 2.0

5 votes

package com.cloudera.sparkts

import com.esotericsoftware.kryo.{Serializer, Kryo}
import com.esotericsoftware.kryo.io.{Output, Input}

import org.apache.spark.SparkConf
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import com.cloudera.sparkts.TimeSeriesUtils._

import java.time._

class TimeSeriesKryoRegistrator extends KryoRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[TimeSeries[_]])
    kryo.register(classOf[UniformDateTimeIndex])
    kryo.register(classOf[IrregularDateTimeIndex])
    kryo.register(classOf[BusinessDayFrequency])
    kryo.register(classOf[DayFrequency])
    kryo.register(classOf[ZonedDateTime], new DateTimeSerializer)
  }
}

class DateTimeSerializer extends Serializer[ZonedDateTime] {
  def write(kryo: Kryo, out: Output, dt: ZonedDateTime): Unit = {
    out.writeLong(zonedDateTimeToLong(dt), true)
  }

  def read(kryo: Kryo, in: Input, clazz: Class[ZonedDateTime]): ZonedDateTime = {
    longToZonedDateTime(in.readLong(true), ZoneId.systemDefault())
  }
}

object TimeSeriesKryoRegistrator {
  def registerKryoClasses(conf: SparkConf): Unit = {
    conf.set("spark.serializer", classOf[KryoSerializer].getName)
    conf.set("spark.kryo.registrator", classOf[TimeSeriesKryoRegistrator].getName)
  }
}

Source File: Analytics.scala From osmesa with Apache License 2.0

5 votes

package osmesa.analytics

import geotrellis.spark.io.kryo.KryoRegistrator
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql._
import org.locationtech.geomesa.spark.jts._

object Analytics {
  def sparkSession(appName: String): SparkSession = {
    val conf = new SparkConf()
      .setIfMissing("spark.master", "local[*]")
      .setAppName(s"OSMesa Analytics - ${appName}")
      .set("spark.sql.orc.impl", "native")
      .set("spark.sql.orc.filterPushdown", "true")
      .set("spark.sql.parquet.mergeSchema", "false")
      .set("spark.sql.parquet.filterPushdown", "true")
      .set("spark.sql.hive.metastorePartitionPruning", "true")
      .set("spark.ui.showConsoleProgress", "true")
      .set("spark.serializer", classOf[KryoSerializer].getName)
      .set("spark.kryo.registrator", classOf[KryoRegistrator].getName)

    SparkSession.builder
      .config(conf)
      .enableHiveSupport
      .getOrCreate
      .withJTS
  }
}

Source File: RawTextSender.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam


private[streaming]
object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}

Source File: PythonBroadcastSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}

Source File: SortShuffleSuite.scala From multi-tenancy-spark with Apache License 2.0

5 votes

package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
    Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert (!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}

org.apache.spark.serializer.KryoSerializer Scala Examples