org.apache.spark.serializer.KryoSerializer Scala Examples
The following examples show how to use org.apache.spark.serializer.KryoSerializer.
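Across these examples the recurring pattern is the same: build a KryoSerializer from a SparkConf, call newInstance() to obtain a SerializerInstance, and round-trip objects through serialize/deserialize (optionally registering classes first). The following is a minimal sketch of that pattern; the Point class and the values used here are illustrative only and do not come from any of the listed projects.

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer

object KryoRoundTrip {
  // Hypothetical payload type, used only for this sketch.
  case class Point(x: Double, y: Double)

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf(false)
      .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
      // Register the classes we serialize so registration-required mode could be enabled.
      .registerKryoClasses(Array(classOf[Point]))

    // newInstance() returns a (non-thread-safe) SerializerInstance.
    val ser = new KryoSerializer(conf).newInstance()

    val original = Point(1.0, 2.0)
    val bytes = ser.serialize(original)      // java.nio.ByteBuffer
    val copy = ser.deserialize[Point](bytes) // round-trip back to a Point

    assert(original == copy)
  }
}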
Example 1
Source File: SortShuffleSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    // Once 'spark.local.dir' is set, it is cached. Unless this is manually cleared
    // before/after a test, it could return the same directory even if this property
    // is configured.
    Utils.clearLocalRootDirs()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
      Utils.clearLocalRootDirs()
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
Example 2
Source File: RawTextSender.scala From iolap with Apache License 2.0
package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam

private[streaming] object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}
Example 3
Source File: PythonBroadcastSuite.scala From iolap with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 4
Source File: RawTextSender.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.{SparkConf, Logging}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam

private[streaming] object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}
Example 5
Source File: PythonBroadcastSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.api.python import scala.io.Source import java.io.{PrintWriter, File} import org.scalatest.Matchers import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.Utils // This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize // a PythonBroadcast: class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext { test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") { val tempDir = Utils.createTempDir() val broadcastedString = "Hello, world!" def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = { val source = Source.fromFile(broadcast.path) val contents = source.mkString source.close() contents should be (broadcastedString) } try { val broadcastDataFile: File = { val file = new File(tempDir, "broadcastData") val printWriter = new PrintWriter(file) printWriter.write(broadcastedString) printWriter.close() file } val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath) assertBroadcastIsValid(broadcast) val conf = new SparkConf().set("spark.kryo.registrationRequired", "true") val deserializedBroadcast = Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance()) assertBroadcastIsValid(deserializedBroadcast) } finally { Utils.deleteRecursively(tempDir) } } }
Example 6
Source File: LabeledPointSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.serializer.KryoSerializer

class LabeledPointSuite extends SparkFunSuite {

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0)))
    val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0)))

    Seq(labeled1, labeled2).foreach { l =>
      val l2 = ser.deserialize[LabeledPoint](ser.serialize(l))
      assert(l === l2)
    }
  }
}
Example 7
Source File: InstanceSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.serializer.KryoSerializer

class InstanceSuite extends SparkFunSuite {

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val instance1 = Instance(19.0, 2.0, Vectors.dense(1.0, 7.0))
    val instance2 = Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(instance1, instance2).foreach { i =>
      val i2 = ser.deserialize[Instance](ser.serialize(i))
      assert(i === i2)
    }

    val oInstance1 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0))
    val oInstance2 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(oInstance1, oInstance2).foreach { o =>
      val o2 = ser.deserialize[OffsetInstance](ser.serialize(o))
      assert(o === o2)
    }
  }
}
Example 8
Source File: TreePointSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer

class TreePointSuite extends SparkFunSuite {

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val point = new TreePoint(1.0, Array(1, 2, 3))
    val point2 = ser.deserialize[TreePoint](ser.serialize(point))
    assert(point.label === point2.label)
    assert(point.binnedFeatures === point2.binnedFeatures)
  }
}
Example 9
Source File: LabeledPointSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.regression import org.apache.spark.{SparkConf, SparkFunSuite} import org.apache.spark.ml.feature.{LabeledPoint => NewLabeledPoint} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.serializer.KryoSerializer class LabeledPointSuite extends SparkFunSuite { test("parse labeled points") { val points = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) points.foreach { p => assert(p === LabeledPoint.parse(p.toString)) } } test("parse labeled points with whitespaces") { val point = LabeledPoint.parse("(0.0, [1.0, 2.0])") assert(point === LabeledPoint(0.0, Vectors.dense(1.0, 2.0))) } test("parse labeled points with v0.9 format") { val point = LabeledPoint.parse("1.0,1.0 0.0 -2.0") assert(point === LabeledPoint(1.0, Vectors.dense(1.0, 0.0, -2.0))) } test("conversions between new ml LabeledPoint and mllib LabeledPoint") { val points: Seq[LabeledPoint] = Seq( LabeledPoint(1.0, Vectors.dense(1.0, 0.0)), LabeledPoint(0.0, Vectors.sparse(2, Array(1), Array(-1.0)))) val newPoints: Seq[NewLabeledPoint] = points.map(_.asML) points.zip(newPoints).foreach { case (p1, p2) => assert(p1 === LabeledPoint.fromML(p2)) } } test("Kryo class register") { val conf = new SparkConf(false) conf.set("spark.kryo.registrationRequired", "true") val ser = new KryoSerializer(conf).newInstance() val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0))) val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0))) Seq(labeled1, labeled2).foreach { l => val l2 = ser.deserialize[LabeledPoint](ser.serialize(l)) assert(l === l2) } } }
Example 10
Source File: RawTextSender.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.streaming.util import java.io.{ByteArrayOutputStream, IOException} import java.net.ServerSocket import java.nio.ByteBuffer import scala.io.Source import org.apache.spark.SparkConf import org.apache.spark.internal.Logging import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.IntParam private[streaming] object RawTextSender extends Logging { def main(args: Array[String]) { if (args.length != 4) { // scalastyle:off println System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>") // scalastyle:on println System.exit(1) } // Parse the arguments using a pattern match val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args // Repeat the input data multiple times to fill in a buffer val lines = Source.fromFile(file).getLines().toArray val bufferStream = new ByteArrayOutputStream(blockSize + 1000) val ser = new KryoSerializer(new SparkConf()).newInstance() val serStream = ser.serializeStream(bufferStream) var i = 0 while (bufferStream.size < blockSize) { serStream.writeObject(lines(i)) i = (i + 1) % lines.length } val array = bufferStream.toByteArray val countBuf = ByteBuffer.wrap(new Array[Byte](4)) countBuf.putInt(array.length) countBuf.flip() val serverSocket = new ServerSocket(port) logInfo("Listening on port " + port) while (true) { val socket = serverSocket.accept() logInfo("Got a new connection") val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec) try { while (true) { out.write(countBuf.array) out.write(array) } } catch { case e: IOException => logError("Client disconnected") } finally { socket.close() } } } }
Example 11
Source File: PythonBroadcastSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.api.python import java.io.{File, PrintWriter} import scala.io.Source import org.scalatest.Matchers import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.Utils // This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize // a PythonBroadcast: class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext { test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") { val tempDir = Utils.createTempDir() val broadcastedString = "Hello, world!" def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = { val source = Source.fromFile(broadcast.path) val contents = source.mkString source.close() contents should be (broadcastedString) } try { val broadcastDataFile: File = { val file = new File(tempDir, "broadcastData") val printWriter = new PrintWriter(file) printWriter.write(broadcastedString) printWriter.close() file } val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath) assertBroadcastIsValid(broadcast) val conf = new SparkConf().set("spark.kryo.registrationRequired", "true") val deserializedBroadcast = Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance()) assertBroadcastIsValid(deserializedBroadcast) } finally { Utils.deleteRecursively(tempDir) } } }
Example 12
Source File: StoragePerfTester.scala From iolap with Apache License 2.0
package org.apache.spark.tools import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.shuffle.hash.HashShuffleManager import org.apache.spark.util.Utils val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500) val recordLength = 1000 // ~1KB records val totalRecords = dataSizeMb * 1000 val recordsPerMap = totalRecords / numMaps val writeKey = "1" * (recordLength / 2) val writeValue = "1" * (recordLength / 2) val executor = Executors.newFixedThreadPool(numMaps) val conf = new SparkConf() .set("spark.shuffle.compress", "false") .set("spark.shuffle.sync", "true") .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") // This is only used to instantiate a BlockManager. All thread scheduling is done manually. val sc = new SparkContext("local[4]", "Write Tester", conf) val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager] def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = { val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits, new KryoSerializer(sc.conf), new ShuffleWriteMetrics()) val writers = shuffle.writers for (i <- 1 to recordsPerMap) { writers(i % numOutputSplits).write(writeKey, writeValue) } writers.map { w => w.commitAndClose() total.addAndGet(w.fileSegment().length) } shuffle.releaseWriters(true) } val start = System.currentTimeMillis() val latch = new CountDownLatch(numMaps) val totalBytes = new AtomicLong() for (task <- 1 to numMaps) { executor.submit(new Runnable() { override def run(): Unit = { try { writeOutputBytes(task, totalBytes) latch.countDown() } catch { case e: Exception => println("Exception in child thread: " + e + " " + e.getMessage) System.exit(1) } } }) } latch.await() val end = System.currentTimeMillis() val time = (end - start) / 1000.0 val bytesPerSecond = totalBytes.get() / time val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits)) System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile))) System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong))) executor.shutdown() sc.stop() } }
Example 13
Source File: SparkSqlSerializer.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.execution

import java.nio.ByteBuffer
import java.util.{HashMap => JavaHashMap}

import scala.reflect.ClassTag

import com.esotericsoftware.kryo.io.{Input, Output}
import com.esotericsoftware.kryo.{Kryo, Serializer}
import com.twitter.chill.ResourcePool

import org.apache.spark.serializer.{KryoSerializer, SerializerInstance}
import org.apache.spark.sql.types.{Decimal, StructField, StructType}
import org.apache.spark.util.MutablePair
import org.apache.spark.{SparkConf, SparkEnv}

//private[sql]
class SparkSqlSerializer(conf: SparkConf) extends KryoSerializer(conf) {
  override def newKryo(): Kryo = {
    val kryo = super.newKryo()
    kryo.setRegistrationRequired(false)
    kryo.register(classOf[MutablePair[_, _]])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow])
    kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericMutableRow])
    kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer)
    kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer)
    kryo.register(classOf[Decimal])
    kryo.register(classOf[JavaHashMap[_, _]])

    // APS
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])

    kryo.setReferences(false)
    kryo
  }
}

private[execution] class KryoResourcePool(size: Int)
  extends ResourcePool[SerializerInstance](size) {

  val ser: SparkSqlSerializer = {
    val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf())
    new SparkSqlSerializer(sparkConf)
  }

  def newInstance(): SerializerInstance = ser.newInstance()
}

//private[sql]
object SparkSqlSerializer {
  @transient lazy val resourcePool = new KryoResourcePool(30)

  private[this] def acquireRelease[O](fn: SerializerInstance => O): O = {
    val kryo = resourcePool.borrow
    try {
      fn(kryo)
    } finally {
      resourcePool.release(kryo)
    }
  }

  def serialize[T: ClassTag](o: T): Array[Byte] =
    acquireRelease { k =>
      k.serialize(o).array()
    }

  def deserialize[T: ClassTag](bytes: Array[Byte]): T =
    acquireRelease { k =>
      k.deserialize[T](ByteBuffer.wrap(bytes))
    }
}

private[sql] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal) {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}

private[sql] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] {
  def write(kryo: Kryo, output: Output, bd: BigDecimal) {
    // TODO: There are probably more efficient representations than strings...
    output.writeString(bd.toString)
  }

  def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = {
    new java.math.BigDecimal(input.readString())
  }
}
Example 14
Source File: StoragePerfTester.scala From BigDatalog with Apache License 2.0
package org.apache.spark.tools import java.util.concurrent.{CountDownLatch, Executors} import java.util.concurrent.atomic.AtomicLong import org.apache.spark.executor.ShuffleWriteMetrics import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.shuffle.hash.HashShuffleManager import org.apache.spark.util.Utils val numOutputSplits = sys.env.get("NUM_REDUCERS").map(_.toInt).getOrElse(500) val recordLength = 1000 // ~1KB records val totalRecords = dataSizeMb * 1000 val recordsPerMap = totalRecords / numMaps val writeKey = "1" * (recordLength / 2) val writeValue = "1" * (recordLength / 2) val executor = Executors.newFixedThreadPool(numMaps) val conf = new SparkConf() .set("spark.shuffle.compress", "false") .set("spark.shuffle.sync", "true") .set("spark.shuffle.manager", "org.apache.spark.shuffle.hash.HashShuffleManager") // This is only used to instantiate a BlockManager. All thread scheduling is done manually. val sc = new SparkContext("local[4]", "Write Tester", conf) val hashShuffleManager = sc.env.shuffleManager.asInstanceOf[HashShuffleManager] def writeOutputBytes(mapId: Int, total: AtomicLong): Unit = { val shuffle = hashShuffleManager.shuffleBlockResolver.forMapTask(1, mapId, numOutputSplits, new KryoSerializer(sc.conf), new ShuffleWriteMetrics()) val writers = shuffle.writers for (i <- 1 to recordsPerMap) { writers(i % numOutputSplits).write(writeKey, writeValue) } writers.map { w => w.commitAndClose() total.addAndGet(w.fileSegment().length) } shuffle.releaseWriters(true) } val start = System.currentTimeMillis() val latch = new CountDownLatch(numMaps) val totalBytes = new AtomicLong() for (task <- 1 to numMaps) { executor.submit(new Runnable() { override def run(): Unit = { try { writeOutputBytes(task, totalBytes) latch.countDown() } catch { case e: Exception => // scalastyle:off println println("Exception in child thread: " + e + " " + e.getMessage) // scalastyle:on println System.exit(1) } } }) } latch.await() val end = System.currentTimeMillis() val time = (end - start) / 1000.0 val bytesPerSecond = totalBytes.get() / time val bytesPerFile = (totalBytes.get() / (numOutputSplits * numMaps.toDouble)).toLong // scalastyle:off println System.err.println("files_total\t\t%s".format(numMaps * numOutputSplits)) System.err.println("bytes_per_file\t\t%s".format(Utils.bytesToString(bytesPerFile))) System.err.println("agg_throughput\t\t%s/s".format(Utils.bytesToString(bytesPerSecond.toLong))) // scalastyle:on println executor.shutdown() sc.stop() } }
Example 15
Source File: RawTextSender.scala From BigDatalog with Apache License 2.0
package org.apache.spark.streaming.util import java.io.{ByteArrayOutputStream, IOException} import java.net.ServerSocket import java.nio.ByteBuffer import scala.io.Source import org.apache.spark.{SparkConf, Logging} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.IntParam private[streaming] object RawTextSender extends Logging { def main(args: Array[String]) { if (args.length != 4) { // scalastyle:off println System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>") // scalastyle:on println System.exit(1) } // Parse the arguments using a pattern match val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args // Repeat the input data multiple times to fill in a buffer val lines = Source.fromFile(file).getLines().toArray val bufferStream = new ByteArrayOutputStream(blockSize + 1000) val ser = new KryoSerializer(new SparkConf()).newInstance() val serStream = ser.serializeStream(bufferStream) var i = 0 while (bufferStream.size < blockSize) { serStream.writeObject(lines(i)) i = (i + 1) % lines.length } val array = bufferStream.toByteArray val countBuf = ByteBuffer.wrap(new Array[Byte](4)) countBuf.putInt(array.length) countBuf.flip() val serverSocket = new ServerSocket(port) logInfo("Listening on port " + port) while (true) { val socket = serverSocket.accept() logInfo("Got a new connection") val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec) try { while (true) { out.write(countBuf.array) out.write(array) } } catch { case e: IOException => logError("Client disconnected") } finally { socket.close() } } } }
Example 16
Source File: PythonBroadcastSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.api.python import scala.io.Source import java.io.{PrintWriter, File} import org.scalatest.Matchers import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.Utils // This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize // a PythonBroadcast: class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext { test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") { val tempDir = Utils.createTempDir() val broadcastedString = "Hello, world!" def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = { val source = Source.fromFile(broadcast.path) val contents = source.mkString source.close() contents should be (broadcastedString) } try { val broadcastDataFile: File = { val file = new File(tempDir, "broadcastData") val printWriter = new PrintWriter(file) printWriter.write(broadcastedString) printWriter.close() file } val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath) assertBroadcastIsValid(broadcast) val conf = new SparkConf().set("spark.kryo.registrationRequired", "true") val deserializedBroadcast = Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance()) assertBroadcastIsValid(deserializedBroadcast) } finally { Utils.deleteRecursively(tempDir) } } }
Example 17
Source File: SortShuffleSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark import java.io.File import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.util.Utils class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with sort-based shuffle. private var tempDir: File = _ override def beforeAll() { conf.set("spark.shuffle.manager", "sort") } override def beforeEach(): Unit = { tempDir = Utils.createTempDir() conf.set("spark.local.dir", tempDir.getAbsolutePath) } override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) } finally { super.afterEach() } } test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the new serialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new KryoSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the old deserialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new JavaSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = { def getAllFiles: Set[File] = FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet val filesBeforeShuffle = getAllFiles // Force the shuffle to be performed shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { assert (!file.exists(), s"Shuffle file $file was not cleaned up") } } }
Example 18
Source File: ReadingWritingData.scala From Spark-RSVD with Apache License 2.0
package com.criteo.rsvd

import java.nio.ByteBuffer

import com.esotericsoftware.kryo.Kryo
import com.typesafe.scalalogging.slf4j.StrictLogging
import de.javakaffee.kryoserializers.UnmodifiableCollectionsSerializer
import org.apache.hadoop.fs.{FileSystem, Path}
import org.apache.hadoop.io.{BytesWritable, NullWritable}
import org.apache.spark.mllib.linalg.distributed.MatrixEntry
import org.apache.spark.rdd.RDD
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.{SparkConf, SparkContext}

import scala.reflect.ClassTag

object ReadingWritingData extends StrictLogging {

  def getInputDataSizeMB(inputPathPattern: String, sc: SparkContext): Int = {
    val fs = FileSystem.get(sc.hadoopConfiguration)
    val path = new Path(inputPathPattern)
    (fs.globStatus(path).map(f => f.getLen).sum / 1024 / 1024).toInt
  }

  def loadMatrixEntries(inputPath: String,
                        singlePartitionSizeMB: Int,
                        sc: SparkContext): RDD[MatrixEntry] = {
    logger.info(s"Input matrix path: $inputPath")
    val inputDataSizeMB = getInputDataSizeMB(inputPath + "

  def makeRddFromKryoFile[T: ClassTag](
      sc: SparkContext,
      path: String,
      minPartitionsOpt: Option[Int] = None): RDD[T] = {
    val minPartitions = minPartitionsOpt.getOrElse(sc.defaultMinPartitions)
    val serializer = new KryoSerializer(sc.getConf)
    sc.sequenceFile(path, classOf[NullWritable], classOf[BytesWritable], minPartitions)
      .mapPartitions { it =>
        val instance = serializer.newInstance()
        it.flatMap { case (_, v) =>
          instance.deserialize[Array[T]](ByteBuffer.wrap(v.getBytes))
        }
      }
  }

  object RandomizedSVDKryoRegistrator extends KryoRegistrator {
    def registerClasses(kryo: Kryo): Unit = {
      UnmodifiableCollectionsSerializer.registerSerializers(kryo)
      kryo.register(classOf[MatrixEntry])
      kryo.register(classOf[Array[MatrixEntry]])
    }
  }

  def appendBasicRegistratorToSparkConf(sparkConf: SparkConf): SparkConf =
    appendRegistratorToSparkConf(sparkConf, RandomizedSVDKryoRegistrator.getClass.getName)

  def appendRegistratorToSparkConf(sparkConf: SparkConf,
                                   registratorName: String): SparkConf = {
    val oldValue = sparkConf.get("spark.kryo.registrator", "")
    if (oldValue == "") {
      sparkConf.set("spark.kryo.registrator", registratorName)
    } else {
      sparkConf.set("spark.kryo.registrator", oldValue + "," + registratorName)
    }
  }
}
Example 19
Source File: LabeledPointSuite.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.feature

import org.apache.spark.linalg.Vectors
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer

class LabeledPointSuite extends SparkFunSuite {

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(
      Array(classOf[scala.collection.mutable.WrappedArray.ofRef[_]], classOf[LabeledPoint]))
    // conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val labeled1 = LabeledPoint(1.0, Vectors.dense(Array(1.0, 2.0)))
    val labeled2 = LabeledPoint(1.0, Vectors.sparse(10, Array(5, 7), Array(1.0, 2.0)))

    Seq(labeled1, labeled2).foreach { l =>
      val l2 = ser.deserialize[LabeledPoint](ser.serialize(l))
      assert(l === l2)
    }
  }
}
Example 20
Source File: InstanceSuite.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.feature

import org.apache.spark.linalg.Vectors
import org.apache.spark.{SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer

class InstanceSuite extends SparkFunSuite {

  test("Kryo class register") {
    val conf = new SparkConf(false)
    conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
    conf.registerKryoClasses(
      Array(classOf[scala.collection.mutable.WrappedArray.ofRef[_]], classOf[Instance]))
    // conf.set("spark.kryo.registrationRequired", "true")

    val ser = new KryoSerializer(conf).newInstance()

    val instance1 = Instance(19.0, 2.0, Vectors.dense(1.0, 7.0))
    val instance2 = Instance(17.0, 1.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(instance1, instance2).foreach { i =>
      val i2 = ser.deserialize[Instance](ser.serialize(i))
      assert(i === i2)
    }

    val oInstance1 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0))
    val oInstance2 = OffsetInstance(0.2, 1.0, 2.0, Vectors.dense(0.0, 5.0).toSparse)
    Seq(oInstance1, oInstance2).foreach { o =>
      val o2 = ser.deserialize[OffsetInstance](ser.serialize(o))
      assert(o === o2)
    }
  }
}
Example 21
Source File: ByteRangesTest.scala From spark-bam with Apache License 2.0
package org.hammerlab.args

import hammerlab.bytes._
import org.apache.spark.serializer.KryoSerializer
import org.hammerlab.spark.confs
import org.hammerlab.spark.test.serde.KryoSerialization.kryoBytes
import org.hammerlab.spark.test.suite.SparkSuite
import org.hammerlab.test.serde.JavaSerialization._

class ByteRangesTest extends SparkSuite with confs.Kryo {

  override def registrationRequired: Boolean = false

  test("kryo serialization") {
    val ks = new KryoSerializer(sc.getConf)
    implicit val kryo = ks.newKryo()
    val bytes = kryoBytes(ByteRanges(Seq(Endpoints(10.MB, 20.MB))))
    bytes.length should be(69)
  }

  test("java serde") {
    val byteRanges = ByteRanges(Seq(Endpoints(10.MB, 20.MB)))
    javaRead[ByteRanges](javaBytes(byteRanges)) should be(byteRanges)
  }
}
Example 22
Source File: ConfsTest.scala From spark-util with Apache License 2.0
package org.hammerlab.spark

import org.apache.spark.serializer.KryoSerializer

import scala.collection.mutable

class ConfsTest
  extends ContextSuite
    with confs.DynamicAllocation
    with confs.EventLog
    with confs.Speculation {

  val eventLogDir = tmpDir().toString

  sparkConf(
    "spark.eventLog.dir" → eventLogDir
  )

  register(
    classOf[Array[String]],
    classOf[mutable.WrappedArray.ofRef[_]],
    classOf[Foo]
  )

  test("make SparkContext") {
    conf.get("spark.serializer") should be(classOf[KryoSerializer].getCanonicalName)
    conf.get("spark.dynamicAllocation.enabled") should be("true")
    conf.get("spark.eventLog.enabled") should be("true")
    conf.get("spark.eventLog.dir") should be(eventLogDir)
    conf.get("spark.speculation") should be("true")

    val strings = Array("a", "b", "c", "d")
    val fooBroadcast = sc.broadcast(Foo("x"))
    val rdd = sc.parallelize(strings)
    rdd.count should be(4)
    rdd.map(_ + fooBroadcast.value.s).collect should be(Array("ax", "bx", "cx", "dx"))
  }
}

case class Foo(s: String)
Example 23
Source File: PythonBroadcastSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.api.python import java.io.{File, PrintWriter} import scala.io.Source import org.scalatest.Matchers import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite} import org.apache.spark.serializer.KryoSerializer import org.apache.spark.util.Utils // This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize // a PythonBroadcast: class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext { test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") { val tempDir = Utils.createTempDir() val broadcastedString = "Hello, world!" def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = { val source = Source.fromFile(broadcast.path) val contents = source.mkString source.close() contents should be (broadcastedString) } try { val broadcastDataFile: File = { val file = new File(tempDir, "broadcastData") val printWriter = new PrintWriter(file) printWriter.write(broadcastedString) printWriter.close() file } val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath) assertBroadcastIsValid(broadcast) val conf = new SparkConf().set("spark.kryo.registrationRequired", "true") val deserializedBroadcast = Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance()) assertBroadcastIsValid(deserializedBroadcast) } finally { Utils.deleteRecursively(tempDir) } } }
Example 24
Source File: SortShuffleSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark import java.io.File import scala.collection.JavaConverters._ import org.apache.commons.io.FileUtils import org.apache.commons.io.filefilter.TrueFileFilter import org.scalatest.BeforeAndAfterAll import org.apache.spark.rdd.ShuffledRDD import org.apache.spark.serializer.{JavaSerializer, KryoSerializer} import org.apache.spark.shuffle.sort.SortShuffleManager import org.apache.spark.util.Utils class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll { // This test suite should run all tests in ShuffleSuite with sort-based shuffle. private var tempDir: File = _ override def beforeAll() { super.beforeAll() conf.set("spark.shuffle.manager", "sort") } override def beforeEach(): Unit = { super.beforeEach() tempDir = Utils.createTempDir() conf.set("spark.local.dir", tempDir.getAbsolutePath) } override def afterEach(): Unit = { try { Utils.deleteRecursively(tempDir) } finally { super.afterEach() } } test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the new serialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new KryoSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") { sc = new SparkContext("local", "test", conf) // Create a shuffled RDD and verify that it actually uses the old deserialized map output path val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x)) val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4)) .setSerializer(new JavaSerializer(conf)) val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]] assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep)) ensureFilesAreCleanedUp(shuffledRdd) } private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = { def getAllFiles: Set[File] = FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet val filesBeforeShuffle = getAllFiles // Force the shuffle to be performed shuffledRdd.count() // Ensure that the shuffle actually created files that will need to be cleaned up val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle filesCreatedByShuffle.map(_.getName) should be Set("shuffle_0_0_0.data", "shuffle_0_0_0.index") // Check that the cleanup actually removes the files sc.env.blockManager.master.removeShuffle(0, blocking = true) for (file <- filesCreatedByShuffle) { assert (!file.exists(), s"Shuffle file $file was not cleaned up") } } }
Example 25
Source File: ShapeLuceneRDDKryoRegistrator.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd.spatial.shape

import com.twitter.algebird.TopK
import com.twitter.chill.Kryo
import org.apache.spark.SparkConf
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types._
import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc}
import org.zouzias.spark.lucenerdd.spatial.shape.partition.ShapeLuceneRDDPartition

class ShapeLuceneRDDKryoRegistrator extends KryoRegistrator {

  def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[ShapeLuceneRDD[_, _]])
    kryo.register(classOf[ShapeLuceneRDDPartition[_, _]])
    kryo.register(classOf[Number])
    kryo.register(classOf[java.lang.Double])
    kryo.register(classOf[java.lang.Float])
    kryo.register(classOf[java.lang.Integer])
    kryo.register(classOf[java.lang.Long])
    kryo.register(classOf[java.lang.Short])
    kryo.register(classOf[StructType])
    kryo.register(classOf[StructField])
    kryo.register(classOf[IntegerType])
    kryo.register(classOf[IntegerType$])
    kryo.register(classOf[DoubleType])
    kryo.register(classOf[DoubleType$])
    kryo.register(classOf[FloatType])
    kryo.register(classOf[StringType])
    kryo.register(classOf[StringType$])
    kryo.register(classOf[GenericRowWithSchema])
    kryo.register(classOf[Metadata])
    kryo.register(classOf[Object])
    kryo.register(classOf[Array[Object]])
    kryo.register(classOf[Array[Array[Byte]]])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofRef])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofFloat])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofDouble])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofInt])
    kryo.register(classOf[scala.collection.mutable.WrappedArray$ofLong])
    kryo.register(classOf[Array[String]])
    kryo.register(classOf[Array[Number]])
    kryo.register(classOf[Array[Float]])
    kryo.register(classOf[Array[Int]])
    kryo.register(classOf[Array[Long]])
    kryo.register(classOf[Array[Double]])
    kryo.register(classOf[Array[Boolean]])
    kryo.register(classOf[Array[SparkScoreDoc]])
    kryo.register(classOf[Array[StructType]])
    kryo.register(classOf[Array[StructField]])
    kryo.register(classOf[Range])
    kryo.register(classOf[scala.collection.immutable.Map[String, String]])
    kryo.register(classOf[scala.collection.immutable.Map[String, Number]])
    kryo.register(classOf[scala.collection.immutable.Map$EmptyMap$])
    kryo.register(classOf[scala.collection.immutable.Set$EmptySet$])
    kryo.register(classOf[scala.collection.immutable.Map[_, _]])
    kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]])
    kryo.register(classOf[SparkFacetResult])
    kryo.register(classOf[SparkScoreDoc])
    kryo.register(classOf[TopK[_]])
    ()
  }
}
Example 26
Source File: LuceneRDDKryoRegistrator.scala From spark-lucenerdd with Apache License 2.0
package org.zouzias.spark.lucenerdd import com.twitter.algebird.TopK import com.twitter.chill.Kryo import org.apache.spark.SparkConf import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer} import org.zouzias.spark.lucenerdd.facets.FacetedLuceneRDD import org.zouzias.spark.lucenerdd.models.{SparkFacetResult, SparkScoreDoc} import org.zouzias.spark.lucenerdd.partition.LuceneRDDPartition import org.zouzias.spark.lucenerdd.response.{LuceneRDDResponse, LuceneRDDResponsePartition} import org.zouzias.spark.lucenerdd.testing.{FavoriteCaseClass, Person} class LuceneRDDKryoRegistrator extends KryoRegistrator { def registerClasses(kryo: Kryo): Unit = { kryo.register(classOf[LuceneRDD[_]]) kryo.register(classOf[LuceneRDDPartition[_]]) kryo.register(classOf[FacetedLuceneRDD[_]]) kryo.register(classOf[Number]) kryo.register(classOf[java.lang.Double]) kryo.register(classOf[java.lang.Float]) kryo.register(classOf[java.lang.Integer]) kryo.register(classOf[java.lang.Long]) kryo.register(classOf[java.lang.Short]) kryo.register(classOf[scala.collection.mutable.WrappedArray$ofRef]) kryo.register(classOf[scala.collection.mutable.WrappedArray$ofFloat]) kryo.register(classOf[scala.collection.mutable.WrappedArray$ofDouble]) kryo.register(classOf[scala.collection.mutable.WrappedArray$ofInt]) kryo.register(classOf[scala.collection.mutable.WrappedArray$ofLong]) kryo.register(classOf[Array[String]]) kryo.register(classOf[Array[Number]]) kryo.register(classOf[Array[Float]]) kryo.register(classOf[Array[Int]]) kryo.register(classOf[Array[Long]]) kryo.register(classOf[Array[Double]]) kryo.register(classOf[Array[Boolean]]) kryo.register(classOf[Range]) kryo.register(classOf[scala.collection.immutable.Map[String, String]]) kryo.register(classOf[scala.collection.immutable.Map[String, Number]]) kryo.register(classOf[scala.collection.immutable.Map$EmptyMap$]) kryo.register(classOf[scala.collection.immutable.Set$EmptySet$]) kryo.register(classOf[scala.collection.immutable.Map[_, _]]) kryo.register(classOf[Array[scala.collection.immutable.Map[_, _]]]) kryo.register(classOf[SparkFacetResult]) kryo.register(classOf[SparkScoreDoc]) kryo.register(classOf[LuceneRDDResponse]) kryo.register(classOf[LuceneRDDResponsePartition]) kryo.register(classOf[TopK[_]]) kryo.register(classOf[FavoriteCaseClass]) } }
Example 27
Source File: MatfastSerializer.scala From MatRel with Apache License 2.0
package org.apache.spark.sql.matfast.util import java.math.BigDecimal import java.nio.ByteBuffer import java.util.{HashMap => JavaHashMap} import scala.reflect.ClassTag import com.esotericsoftware.kryo.{Kryo, Serializer} import com.esotericsoftware.kryo.io.{Input, Output} import com.twitter.chill.ResourcePool import org.apache.spark.{SparkConf, SparkEnv} import org.apache.spark.serializer.{KryoSerializer, SerializerInstance} import org.apache.spark.sql.matfast.matrix._ import org.apache.spark.sql.types.Decimal import org.apache.spark.util.MutablePair private[matfast] class MatfastSerializer(conf: SparkConf) extends KryoSerializer(conf) { override def newKryo(): Kryo = { val kryo = super.newKryo() kryo.setRegistrationRequired(false) kryo.register(classOf[MutablePair[_, _]]) kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericRow]) kryo.register(classOf[org.apache.spark.sql.catalyst.expressions.GenericInternalRow]) kryo.register(classOf[java.math.BigDecimal], new JavaBigDecimalSerializer) kryo.register(classOf[BigDecimal], new ScalaBigDecimalSerializer) kryo.register(classOf[Decimal]) kryo.register(classOf[JavaHashMap[_, _]]) kryo.register(classOf[DenseMatrix]) kryo.register(classOf[SparseMatrix]) kryo.setReferences(false) kryo } } private[matfast] class KryoResourcePool(size: Int) extends ResourcePool[SerializerInstance](size) { val ser: MatfastSerializer = { val sparkConf = Option(SparkEnv.get).map(_.conf).getOrElse(new SparkConf()) new MatfastSerializer(sparkConf) } def newInstance(): SerializerInstance = ser.newInstance() } private[matfast] object MatfastSerializer { @transient lazy val resourcePool = new KryoResourcePool(50) private[this] def acquireRelease[O](fn: SerializerInstance => O): O = { val kryo = resourcePool.borrow() try { fn(kryo) } finally { resourcePool.release(kryo) } } def serialize[T: ClassTag](o: T): Array[Byte] = { acquireRelease { k => k.serialize(o).array() } } def deserialize[T: ClassTag](bytes: Array[Byte]): T = acquireRelease { k => k.deserialize[T](ByteBuffer.wrap(bytes)) } } private[matfast] class JavaBigDecimalSerializer extends Serializer[java.math.BigDecimal] { def write(kryo: Kryo, output: Output, bd: java.math.BigDecimal) { output.writeString(bd.toString) } def read(kryo: Kryo, input: Input, tpe: Class[java.math.BigDecimal]): java.math.BigDecimal = { new java.math.BigDecimal(input.readString()) } } private[matfast] class ScalaBigDecimalSerializer extends Serializer[BigDecimal] { def write(kryo: Kryo, output: Output, bd: BigDecimal): Unit = { output.writeString(bd.toString) } def read(kryo: Kryo, input: Input, tpe: Class[BigDecimal]): BigDecimal = { new java.math.BigDecimal(input.readString()) } }
Example 28
Source File: utils.scala From spark-http-stream with BSD 2-Clause "Simplified" License
package org.apache.spark.sql.execution.streaming.http

import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.TimestampType
import org.apache.spark.SparkConf
import org.apache.commons.io.IOUtils
import org.apache.spark.serializer.KryoSerializer
import java.io.InputStream
import com.esotericsoftware.kryo.io.Input
import java.io.ByteArrayOutputStream

class WrongArgumentException(name: String, value: Any)
  extends RuntimeException(s"wrong argument: $name=$value") {
}

class MissingRequiredArgumentException(map: Map[String, String], paramName: String)
  extends RuntimeException(s"missing required argument: $paramName, all parameters=$map") {
}

class InvalidSerializerNameException(serializerName: String)
  extends RuntimeException(s"invalid serializer name: $serializerName") {
}

object SchemaUtils {
  def buildSchema(schema: StructType, includesTimestamp: Boolean,
                  timestampColumnName: String = "_TIMESTAMP_"): StructType = {
    if (!includesTimestamp)
      schema;
    else
      StructType(schema.fields.toSeq :+ StructField(timestampColumnName, TimestampType, false));
  }
}

object Params {
  def deserialize(bytes: Array[Byte]): Any = {
    val kryo = kryoSerializer.newKryo();
    val input = new Input();
    input.setBuffer(bytes);
    kryo.readClassAndObject(input);
  }
}
Example 29
Source File: SerializerFactory.scala From spark-http-stream with BSD 2-Clause "Simplified" License
package org.apache.spark.sql.execution.streaming.http

import java.nio.ByteBuffer

import org.apache.spark.serializer.SerializerInstance
import org.apache.spark.serializer.DeserializationStream
import org.apache.spark.serializer.SerializationStream
import java.io.OutputStream
import java.io.InputStream
import scala.reflect.ClassTag
import com.fasterxml.jackson.databind.ObjectMapper
import org.apache.spark.SparkConf
import org.apache.spark.serializer.JavaSerializer
import org.apache.spark.serializer.KryoSerializer

object SerializerFactory {
  val DEFAULT = new SerializerFactory {
    override def getSerializerInstance(serializerName: String): SerializerInstance = {
      serializerName.toLowerCase() match {
        case "kryo" ⇒
          new KryoSerializer(new SparkConf()).newInstance();
        case "java" ⇒
          new JavaSerializer(new SparkConf()).newInstance();
        case _ ⇒
          throw new InvalidSerializerNameException(serializerName);
      }
    }
  }
}

trait SerializerFactory {
  def getSerializerInstance(serializerName: String): SerializerInstance;
}
Example 30
Source File: UtilsTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License
import java.sql.Date import org.apache.spark.SparkConf import org.apache.spark.serializer.KryoSerializer import org.apache.spark.sql.SparkSession import org.junit.Assert import org.junit.Test import java.io.ByteArrayOutputStream import java.io.InputStream import org.apache.commons.io.IOUtils import com.esotericsoftware.kryo.io.Input import org.apache.spark.sql.execution.streaming.http.KryoSerializerUtils class UtilsTest { @Test def testKryoSerDe() { val d1 = new Date(30000); val bytes = KryoSerializerUtils.serialize(d1); val d2 = KryoSerializerUtils.deserialize(bytes); Assert.assertEquals(d1, d2); val d3 = Map('x' -> Array("aaa", "bbb"), 'y' -> Array("ccc", "ddd")); println(d3); val bytes2 = KryoSerializerUtils.serialize(d3); val d4 = KryoSerializerUtils.deserialize(bytes2).asInstanceOf[Map[String, Any]]; println(d4); } @Test def testEncoderSchema() { val spark = SparkSession.builder.master("local[4]") .getOrCreate(); val sqlContext = spark.sqlContext; import sqlContext.implicits._ import org.apache.spark.sql.catalyst.encoders.encoderFor val schema1 = encoderFor[String].schema; val schema2 = encoderFor[(String)].schema; val schema3 = encoderFor[((String))].schema; Assert.assertEquals(schema1, schema2); Assert.assertEquals(schema1, schema3); } @Test def testDateInTuple() { val spark = SparkSession.builder.master("local[4]") .getOrCreate(); val sqlContext = spark.sqlContext; import sqlContext.implicits._ val d1 = new Date(30000); val ds = sqlContext.createDataset(Seq[(Int, Date)]((1, d1))); val d2 = ds.collect()(0)._2; //NOTE: d1!=d2, maybe a bug println(d1.equals(d2)); } }
Example 31
Source File: HttpStreamServerClientTest.scala From spark-http-stream with BSD 2-Clause "Simplified" License
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql.Row
import org.apache.spark.sql.execution.streaming.http.HttpStreamClient
import org.junit.Assert
import org.junit.Test
import org.apache.spark.sql.types.LongType
import org.apache.spark.sql.types.IntegerType
import org.apache.spark.sql.types.DoubleType
import org.apache.spark.sql.types.BooleanType
import org.apache.spark.sql.types.FloatType
import org.apache.spark.sql.types.StringType
import org.apache.spark.sql.catalyst.expressions.GenericRowWithSchema
import org.apache.spark.sql.types.StructType
import org.apache.spark.sql.types.StructField
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.types.ByteType
import org.apache.spark.sql.execution.streaming.http.HttpStreamServer
import org.apache.spark.sql.execution.streaming.http.StreamPrinter
import org.apache.spark.sql.execution.streaming.http.HttpStreamServerSideException

class HttpStreamServerClientTest {
  val ROWS1 = Array(Row("hello1", 1, true, 0.1f, 0.1d, 1L, '1'.toByte),
    Row("hello2", 2, false, 0.2f, 0.2d, 2L, '2'.toByte),
    Row("hello3", 3, true, 0.3f, 0.3d, 3L, '3'.toByte));

  val ROWS2 = Array(Row("hello"), Row("world"), Row("bye"), Row("world"));

  @Test
  def testHttpStreamIO() {
    // starts an http server
    val kryoSerializer = new KryoSerializer(new SparkConf());
    val server = HttpStreamServer.start("/xxxx", 8080);

    val spark = SparkSession.builder.appName("testHttpTextSink").master("local[4]")
      .getOrCreate();
    spark.conf.set("spark.sql.streaming.checkpointLocation", "/tmp/");

    val sqlContext = spark.sqlContext;
    import spark.implicits._

    // adds a local message buffer to the server, with 2 topics registered
    server.withBuffer()
      .addListener(new StreamPrinter())
      .createTopic[(String, Int, Boolean, Float, Double, Long, Byte)]("topic-1")
      .createTopic[String]("topic-2");

    val client = HttpStreamClient.connect("http://localhost:8080/xxxx");

    // tests the schema of each topic
    val schema1 = client.fetchSchema("topic-1");
    Assert.assertArrayEquals(
      Array[Object](StringType, IntegerType, BooleanType, FloatType, DoubleType, LongType, ByteType),
      schema1.fields.map(_.dataType).asInstanceOf[Array[Object]]);

    val schema2 = client.fetchSchema("topic-2");
    Assert.assertArrayEquals(
      Array[Object](StringType),
      schema2.fields.map(_.dataType).asInstanceOf[Array[Object]]);

    // prepares to consume messages
    val sid1 = client.subscribe("topic-1")._1;
    val sid2 = client.subscribe("topic-2")._1;

    // produces some data
    client.sendRows("topic-1", 1, ROWS1);

    val sid4 = client.subscribe("topic-1")._1;
    val sid5 = client.subscribe("topic-2")._1;

    client.sendRows("topic-2", 1, ROWS2);

    // consumes data
    val fetched = client.fetchStream(sid1).map(_.originalRow);
    Assert.assertArrayEquals(ROWS1.asInstanceOf[Array[Object]], fetched.asInstanceOf[Array[Object]]);
    // the stream for sid1 is empty now
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid1).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid2).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid4).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(ROWS2.asInstanceOf[Array[Object]], client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);
    Assert.assertArrayEquals(Array[Object](), client.fetchStream(sid5).map(_.originalRow).asInstanceOf[Array[Object]]);

    client.unsubscribe(sid4);
    try {
      client.fetchStream(sid4);
      // an exception should be thrown, because the subscriber id has been invalidated
      Assert.assertTrue(false);
    }
    catch {
      case e: Throwable ⇒
        e.printStackTrace();
        Assert.assertEquals(classOf[HttpStreamServerSideException], e.getClass);
    }

    server.stop();
  }
}
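For orientation, here is a minimal sketch of the produce/consume round trip the test above exercises, using only the HttpStreamServer and HttpStreamClient calls it demonstrates. The endpoint path, port, topic name and object name are illustrative and are not part of the spark-http-stream project.

import org.apache.spark.sql.{Row, SparkSession}
import org.apache.spark.sql.execution.streaming.http.{HttpStreamClient, HttpStreamServer}

object HttpStreamRoundTripSketch {
  def main(args: Array[String]): Unit = {
    // a SparkSession is created, as in the test above, so spark.implicits._ is in scope
    val spark = SparkSession.builder.appName("httpStreamSketch").master("local[2]").getOrCreate()
    import spark.implicits._

    // illustrative path, port and topic name
    val server = HttpStreamServer.start("/demo", 8080)
    server.withBuffer().createTopic[String]("greetings")

    val client = HttpStreamClient.connect("http://localhost:8080/demo")
    val subscriberId = client.subscribe("greetings")._1

    client.sendRows("greetings", 1, Array(Row("hello"), Row("world")))
    // fetchStream returns the rows buffered since the subscription was created
    client.fetchStream(subscriberId).map(_.originalRow).foreach(println)

    client.unsubscribe(subscriberId)
    server.stop()
    spark.stop()
  }
}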
Example 32
Source File: Kryo.scala From spark-util with Apache License 2.0 | 5 votes |
package org.hammerlab.spark.confs

import org.apache.spark.serializer.{ KryoRegistrator, KryoSerializer }
import org.hammerlab.kryo.Registrar
import org.hammerlab.spark.SparkConfBase

import scala.reflect.ClassTag

case class UserRegistrar(name: String)

object UserRegistrar {

  implicit def fromInstance[T <: KryoRegistrator](t: T): UserRegistrar =
    UserRegistrar(t.getClass.getName)

  implicit def fromClass[T <: KryoRegistrator](cls: Class[T]): UserRegistrar =
    UserRegistrar(cls.getName)

  implicit def fromClassTag[T <: KryoRegistrator](implicit ct: ClassTag[T]): UserRegistrar =
    UserRegistrar(ct.runtimeClass.getName)
}

trait Kryo
  extends SparkConfBase
    with Registrar {

  def registrationRequired: Boolean = true
  def referenceTracking: Boolean = false

  def registrar(userRegistrar: UserRegistrar): Unit =
    sparkConf(
      "spark.kryo.registrator" → userRegistrar.name
    )

  def registrar[T <: KryoRegistrator](implicit ct: ClassTag[T]): Unit =
    registrar(UserRegistrar.fromClassTag(ct))

  sparkConf(
    "spark.serializer" → classOf[KryoSerializer].getName,
    "spark.kryo.referenceTracking" → referenceTracking.toString,
    "spark.kryo.registrationRequired" → registrationRequired.toString
  )
}
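As a hedged usage sketch, a hypothetical application config might mix in the Kryo trait above and point spark.kryo.registrator at its own registrator via the ClassTag-based registrar helper. MyRegistrator and MyAppConf are invented names, and the sketch assumes SparkConfBase and Registrar require no additional members to be implemented.

import com.esotericsoftware.kryo.Kryo
import org.apache.spark.serializer.KryoRegistrator
import org.hammerlab.spark.confs

// MyRegistrator is a hypothetical user-defined registrator
class MyRegistrator extends KryoRegistrator {
  override def registerClasses(kryo: Kryo): Unit =
    kryo.register(classOf[Array[String]])
}

// Mixes in the confs.Kryo trait shown above
object MyAppConf extends confs.Kryo {
  // loosen the default registration requirement for this sketch
  override def registrationRequired: Boolean = false

  // points spark.kryo.registrator at MyRegistrator via the ClassTag-based helper
  registrar[MyRegistrator]
}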
Example 33
Source File: RawTextSender.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam

private[streaming] object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}
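RawTextSender writes length-prefixed blocks of Kryo-serialized strings, so a matching reader has to read the 4-byte length, then decode the payload with the same serializer. A minimal receiving-side sketch follows; the host, port and object name are illustrative, and the handling of a partially flushed trailing record is deliberately loose.

import java.io.{ByteArrayInputStream, DataInputStream}
import java.net.Socket

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer

object RawTextReceiverSketch {
  def main(args: Array[String]): Unit = {
    // connect to a running RawTextSender (illustrative host/port)
    val socket = new Socket("localhost", 9999)
    val in = new DataInputStream(socket.getInputStream)

    // RawTextSender prefixes each block with its length as a big-endian 4-byte int
    val length = in.readInt()
    val payload = new Array[Byte](length)
    in.readFully(payload)

    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val deser = ser.deserializeStream(new ByteArrayInputStream(payload))
    try {
      // asIterator stops at end-of-stream; a partially flushed trailing record,
      // if any, would surface as an exception and is simply swallowed here
      deser.asIterator.foreach(println)
    } catch {
      case _: Exception => // end of usable data in this block
    } finally {
      deser.close()
      socket.close()
    }
  }
}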
Example 34
Source File: RawTextSender.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam

private[streaming] object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}
Example 35
Source File: PythonBroadcastSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
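The test relies on Spark's internal Utils.clone helper. Outside Spark's own test tree, the same Kryo round trip can be sketched with the public serializer API alone; MyRecord and KryoCloneSketch below are illustrative names, not part of the suite above.

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer

// an illustrative payload class
case class MyRecord(path: String, count: Int)

object KryoCloneSketch {
  def main(args: Array[String]): Unit = {
    // require explicit registration, as the test does, and register the payload class
    val conf = new SparkConf()
      .set("spark.kryo.registrationRequired", "true")
      .registerKryoClasses(Array(classOf[MyRecord]))

    val ser = new KryoSerializer(conf).newInstance()
    val original = MyRecord("/tmp/broadcastData", 1)

    // serialize to a ByteBuffer and back, i.e. a "clone" through Kryo
    val copy = ser.deserialize[MyRecord](ser.serialize(original))
    assert(copy == original)
  }
}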
Example 36
Source File: SortShuffleSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}
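Outside the test harness, the same per-shuffle serializer override looks like the sketch below: a ShuffledRDD built with a HashPartitioner is given a KryoSerializer explicitly, independent of the global spark.serializer setting. The app name, master and data are illustrative.

import org.apache.spark.{HashPartitioner, SparkConf, SparkContext}
import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.KryoSerializer

object PerShuffleKryoSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("per-shuffle-kryo").setMaster("local[2]")
    val sc = new SparkContext(conf)
    try {
      val pairs = sc.parallelize(1 to 10, 2).map(x => (x, x))
      // use Kryo only for this shuffle, as the tests above do
      val shuffled = new ShuffledRDD[Int, Int, Int](pairs, new HashPartitioner(4))
        .setSerializer(new KryoSerializer(conf))
      println(shuffled.count())
    } finally {
      sc.stop()
    }
  }
}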
Example 37
Source File: PythonBroadcastSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.{Matchers, FunSuite}

import org.apache.spark.{SharedSparkContext, SparkConf}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 38
Source File: TimeSeriesKryoRegistrator.scala From spark-timeseries with Apache License 2.0 | 5 votes |
package com.cloudera.sparkts

import com.esotericsoftware.kryo.{Serializer, Kryo}
import com.esotericsoftware.kryo.io.{Output, Input}
import org.apache.spark.SparkConf
import org.apache.spark.serializer.{KryoRegistrator, KryoSerializer}

import com.cloudera.sparkts.TimeSeriesUtils._

import java.time._

class TimeSeriesKryoRegistrator extends KryoRegistrator {
  def registerClasses(kryo: Kryo): Unit = {
    kryo.register(classOf[TimeSeries[_]])
    kryo.register(classOf[UniformDateTimeIndex])
    kryo.register(classOf[IrregularDateTimeIndex])
    kryo.register(classOf[BusinessDayFrequency])
    kryo.register(classOf[DayFrequency])
    kryo.register(classOf[ZonedDateTime], new DateTimeSerializer)
  }
}

class DateTimeSerializer extends Serializer[ZonedDateTime] {
  def write(kryo: Kryo, out: Output, dt: ZonedDateTime): Unit = {
    out.writeLong(zonedDateTimeToLong(dt), true)
  }

  def read(kryo: Kryo, in: Input, clazz: Class[ZonedDateTime]): ZonedDateTime = {
    longToZonedDateTime(in.readLong(true), ZoneId.systemDefault())
  }
}

object TimeSeriesKryoRegistrator {
  def registerKryoClasses(conf: SparkConf): Unit = {
    conf.set("spark.serializer", classOf[KryoSerializer].getName)
    conf.set("spark.kryo.registrator", classOf[TimeSeriesKryoRegistrator].getName)
  }
}
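Wiring the registrator into an application only requires passing a SparkConf through TimeSeriesKryoRegistrator.registerKryoClasses before the SparkContext is created. A minimal sketch follows; the app name and master are illustrative.

import org.apache.spark.{SparkConf, SparkContext}
import com.cloudera.sparkts.TimeSeriesKryoRegistrator

object SparkTsKryoSetupSketch {
  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("sparkts-kryo-demo").setMaster("local[2]")
    // sets spark.serializer and spark.kryo.registrator as shown above
    TimeSeriesKryoRegistrator.registerKryoClasses(conf)

    val sc = new SparkContext(conf)
    try {
      // time-series work (DateTimeIndex, TimeSeriesRDD, ...) would go here
    } finally {
      sc.stop()
    }
  }
}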
Example 39
Source File: Analytics.scala From osmesa with Apache License 2.0 | 5 votes |
package osmesa.analytics

import geotrellis.spark.io.kryo.KryoRegistrator
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.sql._
import org.locationtech.geomesa.spark.jts._

object Analytics {
  def sparkSession(appName: String): SparkSession = {
    val conf = new SparkConf()
      .setIfMissing("spark.master", "local[*]")
      .setAppName(s"OSMesa Analytics - ${appName}")
      .set("spark.sql.orc.impl", "native")
      .set("spark.sql.orc.filterPushdown", "true")
      .set("spark.sql.parquet.mergeSchema", "false")
      .set("spark.sql.parquet.filterPushdown", "true")
      .set("spark.sql.hive.metastorePartitionPruning", "true")
      .set("spark.ui.showConsoleProgress", "true")
      .set("spark.serializer", classOf[KryoSerializer].getName)
      .set("spark.kryo.registrator", classOf[KryoRegistrator].getName)

    SparkSession.builder
      .config(conf)
      .enableHiveSupport
      .getOrCreate
      .withJTS
  }
}
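A hedged usage sketch of the helper above: obtain the Kryo-enabled session and run ordinary DataFrame code. The application name and the toy DataFrame are illustrative, and the sketch assumes the Hive classes that enableHiveSupport expects are on the classpath.

import osmesa.analytics.Analytics

object AnalyticsSessionSketch {
  def main(args: Array[String]): Unit = {
    // the session comes back with KryoSerializer and the GeoTrellis registrator configured
    val spark = Analytics.sparkSession("kryo-demo")
    import spark.implicits._

    Seq(("node", 1L), ("way", 2L)).toDF("type", "id").show()

    spark.stop()
  }
}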
Example 40
Source File: RawTextSender.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.util

import java.io.{ByteArrayOutputStream, IOException}
import java.net.ServerSocket
import java.nio.ByteBuffer

import scala.io.Source

import org.apache.spark.SparkConf
import org.apache.spark.internal.Logging
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.IntParam

private[streaming] object RawTextSender extends Logging {
  def main(args: Array[String]) {
    if (args.length != 4) {
      // scalastyle:off println
      System.err.println("Usage: RawTextSender <port> <file> <blockSize> <bytesPerSec>")
      // scalastyle:on println
      System.exit(1)
    }
    // Parse the arguments using a pattern match
    val Array(IntParam(port), file, IntParam(blockSize), IntParam(bytesPerSec)) = args

    // Repeat the input data multiple times to fill in a buffer
    val lines = Source.fromFile(file).getLines().toArray
    val bufferStream = new ByteArrayOutputStream(blockSize + 1000)
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val serStream = ser.serializeStream(bufferStream)
    var i = 0
    while (bufferStream.size < blockSize) {
      serStream.writeObject(lines(i))
      i = (i + 1) % lines.length
    }
    val array = bufferStream.toByteArray

    val countBuf = ByteBuffer.wrap(new Array[Byte](4))
    countBuf.putInt(array.length)
    countBuf.flip()

    val serverSocket = new ServerSocket(port)
    logInfo("Listening on port " + port)

    while (true) {
      val socket = serverSocket.accept()
      logInfo("Got a new connection")
      val out = new RateLimitedOutputStream(socket.getOutputStream, bytesPerSec)
      try {
        while (true) {
          out.write(countBuf.array)
          out.write(array)
        }
      } catch {
        case e: IOException =>
          logError("Client disconnected")
      } finally {
        socket.close()
      }
    }
  }
}
Example 41
Source File: PythonBroadcastSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 42
Source File: SortShuffleSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark

import java.io.File

import scala.collection.JavaConverters._

import org.apache.commons.io.FileUtils
import org.apache.commons.io.filefilter.TrueFileFilter
import org.scalatest.BeforeAndAfterAll

import org.apache.spark.rdd.ShuffledRDD
import org.apache.spark.serializer.{JavaSerializer, KryoSerializer}
import org.apache.spark.shuffle.sort.SortShuffleManager
import org.apache.spark.util.Utils

class SortShuffleSuite extends ShuffleSuite with BeforeAndAfterAll {

  // This test suite should run all tests in ShuffleSuite with sort-based shuffle.

  private var tempDir: File = _

  override def beforeAll() {
    super.beforeAll()
    conf.set("spark.shuffle.manager", "sort")
  }

  override def beforeEach(): Unit = {
    super.beforeEach()
    tempDir = Utils.createTempDir()
    conf.set("spark.local.dir", tempDir.getAbsolutePath)
  }

  override def afterEach(): Unit = {
    try {
      Utils.deleteRecursively(tempDir)
    } finally {
      super.afterEach()
    }
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the serialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the new serialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new KryoSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  test("SortShuffleManager properly cleans up files for shuffles that use the deserialized path") {
    sc = new SparkContext("local", "test", conf)
    // Create a shuffled RDD and verify that it actually uses the old deserialized map output path
    val rdd = sc.parallelize(1 to 10, 1).map(x => (x, x))
    val shuffledRdd = new ShuffledRDD[Int, Int, Int](rdd, new HashPartitioner(4))
      .setSerializer(new JavaSerializer(conf))
    val shuffleDep = shuffledRdd.dependencies.head.asInstanceOf[ShuffleDependency[_, _, _]]
    assert(!SortShuffleManager.canUseSerializedShuffle(shuffleDep))
    ensureFilesAreCleanedUp(shuffledRdd)
  }

  private def ensureFilesAreCleanedUp(shuffledRdd: ShuffledRDD[_, _, _]): Unit = {
    def getAllFiles: Set[File] =
      FileUtils.listFiles(tempDir, TrueFileFilter.INSTANCE, TrueFileFilter.INSTANCE).asScala.toSet
    val filesBeforeShuffle = getAllFiles
    // Force the shuffle to be performed
    shuffledRdd.count()
    // Ensure that the shuffle actually created files that will need to be cleaned up
    val filesCreatedByShuffle = getAllFiles -- filesBeforeShuffle
    filesCreatedByShuffle.map(_.getName) should be
      Set("shuffle_0_0_0.data", "shuffle_0_0_0.index")
    // Check that the cleanup actually removes the files
    sc.env.blockManager.master.removeShuffle(0, blocking = true)
    for (file <- filesCreatedByShuffle) {
      assert(!file.exists(), s"Shuffle file $file was not cleaned up")
    }
  }
}