org.apache.spark.SharedSparkContext Scala Examples
The following examples show how to use org.apache.spark.SharedSparkContext.
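Before the individual examples, here is a minimal sketch of the pattern they all share: a ScalaTest suite mixes in SharedSparkContext, which starts one SparkContext for the whole suite and exposes it as sc (several examples below also tune the suite-wide conf before the context is first used). SharedSparkContext lives in Spark's own test sources, so it is typically pulled in via the spark-core test-jar; the package, class, and test names in this sketch are illustrative and not taken from any of the projects listed here.

package com.example.sharedsc // hypothetical package, for illustration only

import org.apache.spark.SharedSparkContext
import org.scalatest.FunSuite

// Minimal sketch: SharedSparkContext creates one SparkContext for the whole
// suite and exposes it as `sc`, so each test can build RDDs without starting
// and stopping its own context.
class WordCountSuite extends FunSuite with SharedSparkContext {

  test("counts words with the shared SparkContext") {
    val words = sc.parallelize(Seq("spark", "shared", "context", "spark"))
    val counts = words.map(w => (w, 1)).reduceByKey(_ + _).collectAsMap()
    assert(counts("spark") == 2)
  }
}

As the suites below illustrate, the point of the trait is to share a single context across many tests rather than creating one per test, which keeps suites fast and sidesteps the one-active-SparkContext-per-JVM restriction.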
Example 1
Source File: SerDeUtilSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 2
Source File: ZippedPartitionsSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]): Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 3
Source File: PythonBroadcastSuite.scala From iolap with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 4
Source File: SerDeUtilSuite.scala From iolap with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 5
Source File: ProactiveClosureSerializationSuite.scala From iolap with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 6
Source File: PartitionwiseSampledRDDSuite.scala From iolap with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 7
Source File: PartitionPruningRDDSuite.scala From iolap with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
Example 8
Source File: ZippedPartitionsSuite.scala From iolap with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]): Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 9
Source File: PythonBroadcastSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 10
Source File: SerDeUtilSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 11
Source File: GenericAvroSerializerSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Output, Input}
import org.apache.avro.{SchemaBuilder, Schema}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SparkFunSuite, SharedSparkContext}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema: Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
}
Example 12
Source File: ProactiveClosureSerializationSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 13
Source File: PartitionwiseSampledRDDSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 14
Source File: PartitionPruningRDDSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) { // Nil: empty list of parent dependencies
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    // Nil: empty list of parent dependencies
    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
Example 15
Source File: ZippedPartitionsSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]): Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 16
Source File: PythonBroadcastSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 17
Source File: PartitionPruningRDDSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
Example 18
Source File: GenericAvroSerializerSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema: Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
}
Example 19
Source File: ProactiveClosureSerializationSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 20
Source File: PartitionwiseSampledRDDSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 21
Source File: PartitionPruningRDDSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
Example 22
Source File: ZippedPartitionsSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]): Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 23
Source File: ChunkedByteBufferSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.io

import java.nio.ByteBuffer

import com.google.common.io.ByteStreams

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.internal.config
import org.apache.spark.network.util.ByteArrayWritableChannel
import org.apache.spark.util.io.ChunkedByteBuffer

class ChunkedByteBufferSuite extends SparkFunSuite with SharedSparkContext {

  test("no chunks") {
    val emptyChunkedByteBuffer = new ChunkedByteBuffer(Array.empty[ByteBuffer])
    assert(emptyChunkedByteBuffer.size === 0)
    assert(emptyChunkedByteBuffer.getChunks().isEmpty)
    assert(emptyChunkedByteBuffer.toArray === Array.empty)
    assert(emptyChunkedByteBuffer.toByteBuffer.capacity() === 0)
    assert(emptyChunkedByteBuffer.toNetty.capacity() === 0)
    emptyChunkedByteBuffer.toInputStream(dispose = false).close()
    emptyChunkedByteBuffer.toInputStream(dispose = true).close()
  }

  test("getChunks() duplicates chunks") {
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(8)))
    chunkedByteBuffer.getChunks().head.position(4)
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }

  test("copy() does not affect original buffer's position") {
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(8)))
    chunkedByteBuffer.copy(ByteBuffer.allocate)
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }

  test("writeFully() does not affect original buffer's position") {
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(8)))
    chunkedByteBuffer.writeFully(new ByteArrayWritableChannel(chunkedByteBuffer.size.toInt))
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }

  test("SPARK-24107: writeFully() write buffer which is larger than bufferWriteChunkSize") {
    try {
      sc.conf.set(config.BUFFER_WRITE_CHUNK_SIZE, 32L * 1024L * 1024L)
      val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(40 * 1024 * 1024)))
      val byteArrayWritableChannel = new ByteArrayWritableChannel(chunkedByteBuffer.size.toInt)
      chunkedByteBuffer.writeFully(byteArrayWritableChannel)
      assert(byteArrayWritableChannel.length() === chunkedByteBuffer.size)
    } finally {
      sc.conf.remove(config.BUFFER_WRITE_CHUNK_SIZE)
    }
  }

  test("toArray()") {
    val empty = ByteBuffer.wrap(Array.empty[Byte])
    val bytes = ByteBuffer.wrap(Array.tabulate(8)(_.toByte))
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(bytes, bytes, empty))
    assert(chunkedByteBuffer.toArray === bytes.array() ++ bytes.array())
  }

  test("toArray() throws UnsupportedOperationException if size exceeds 2GB") {
    val fourMegabyteBuffer = ByteBuffer.allocate(1024 * 1024 * 4)
    fourMegabyteBuffer.limit(fourMegabyteBuffer.capacity())
    val chunkedByteBuffer = new ChunkedByteBuffer(Array.fill(1024)(fourMegabyteBuffer))
    assert(chunkedByteBuffer.size === (1024L * 1024L * 1024L * 4L))
    intercept[UnsupportedOperationException] {
      chunkedByteBuffer.toArray
    }
  }

  test("toInputStream()") {
    val empty = ByteBuffer.wrap(Array.empty[Byte])
    val bytes1 = ByteBuffer.wrap(Array.tabulate(256)(_.toByte))
    val bytes2 = ByteBuffer.wrap(Array.tabulate(128)(_.toByte))
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(empty, bytes1, bytes2))
    assert(chunkedByteBuffer.size === bytes1.limit() + bytes2.limit())

    val inputStream = chunkedByteBuffer.toInputStream(dispose = false)
    val bytesFromStream = new Array[Byte](chunkedByteBuffer.size.toInt)
    ByteStreams.readFully(inputStream, bytesFromStream)
    assert(bytesFromStream === bytes1.array() ++ bytes2.array())
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }
}
Example 24
Source File: SQLContextSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SQLContextSuite extends SparkFunSuite with SharedSparkContext {

  test("getOrCreate instantiates SQLContext") {
    val sqlContext = SQLContext.getOrCreate(sc)
    assert(sqlContext != null, "SQLContext.getOrCreate returned null")
    assert(SQLContext.getOrCreate(sc).eq(sqlContext),
      "SQLContext created by SQLContext.getOrCreate not returned by SQLContext.getOrCreate")
  }

  test("getOrCreate return the original SQLContext") {
    val sqlContext = SQLContext.getOrCreate(sc)
    val newSession = sqlContext.newSession()
    assert(SQLContext.getOrCreate(sc).eq(sqlContext),
      "SQLContext.getOrCreate after explicitly created SQLContext did not return the context")
    SQLContext.setActive(newSession)
    assert(SQLContext.getOrCreate(sc).eq(newSession),
      "SQLContext.getOrCreate after explicitly setActive() did not return the active context")
  }

  test("Sessions of SQLContext") {
    val sqlContext = SQLContext.getOrCreate(sc)
    val session1 = sqlContext.newSession()
    val session2 = sqlContext.newSession()

    // all have the default configurations
    val key = SQLConf.SHUFFLE_PARTITIONS.key
    assert(session1.getConf(key) === session2.getConf(key))
    session1.setConf(key, "1")
    session2.setConf(key, "2")
    assert(session1.getConf(key) === "1")
    assert(session2.getConf(key) === "2")

    // temporary table should not be shared
    val df = session1.range(10)
    df.registerTempTable("test1")
    assert(session1.tableNames().contains("test1"))
    assert(!session2.tableNames().contains("test1"))

    // UDF should not be shared
    def myadd(a: Int, b: Int): Int = a + b
    session1.udf.register[Int, Int, Int]("myadd", myadd)
    session1.sql("select myadd(1, 2)").explain()
    intercept[AnalysisException] {
      session2.sql("select myadd(1, 2)").explain()
    }
  }

  test("SPARK-13390: createDataFrame(java.util.List[_],Class[_]) NotSerializableException") {
    val rows = new java.util.ArrayList[IntJavaBean]()
    rows.add(new IntJavaBean(1))
    val sqlContext = SQLContext.getOrCreate(sc)
    // Without the fix for SPARK-13390, this will throw NotSerializableException
    sqlContext.createDataFrame(rows, classOf[IntJavaBean]).groupBy("int").count().collect()
  }
}

class IntJavaBean(private var i: Int) extends Serializable {
  def getInt(): Int = i
  def setInt(i: Int): Unit = { this.i = i }
}
Example 25
Source File: PythonBroadcastSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 26
Source File: SerDeUtilSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 27
Source File: GenericAvroSerializerSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Output, Input}
import org.apache.avro.{SchemaBuilder, Schema}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SparkFunSuite, SharedSparkContext}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema: Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
}
Example 28
Source File: ProactiveClosureSerializationSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 29
Source File: PartitionwiseSampledRDDSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 30
Source File: PartitionPruningRDDSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
Example 31
Source File: ZippedPartitionsSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]): Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 32
Source File: PartitionwiseSampledRDDSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 33
Source File: SerDeUtilSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 34
Source File: GenericAvroSerializerSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema: Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
}
Example 35
Source File: ProactiveClosureSerializationSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 36
Source File: PartitionwiseSampledRDDSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 37
Source File: PartitionPruningRDDSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
Example 38
Source File: ZippedPartitionsSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 39
Source File: X2PSuite.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne

import org.apache.spark.SharedSparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.scalatest.{FunSuite, Matchers}

class X2PSuite extends FunSuite with SharedSparkContext with Matchers {

  test("Test X2P against tsne.jl implementation") {
    val input = new RowMatrix(
      sc.parallelize(Seq(1 to 3, 4 to 6, 7 to 9, 10 to 12))
        .map(x => Vectors.dense(x.map(_.toDouble).toArray))
    )
    val output = X2P(input, 1e-5, 2).toRowMatrix().rows.collect().map(_.toArray.toList)
    println(output.toList)
    //output shouldBe List(List(0, .5, .5), List(.5, 0, .5), List(.5, .5, .0))
  }
}
Example 40
Source File: VirtualScreeningTest.scala From MaRe with Apache License 2.0 | 5 votes |
package se.uu.it.mare

import java.io.File
import java.util.UUID

import scala.io.Source
import scala.util.Properties

import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import org.scalatest.junit.JUnitRunner

private object SDFUtils {
  def parseIDsAndScores(sdf: String): Array[(String, String)] = {
    sdf.split("\\n\\$\\$\\$\\$\\n").map { mol =>
      val lines = mol.split("\\n")
      (lines(0), lines.last)
    }
  }
}

@RunWith(classOf[JUnitRunner])
class VirtualScreeningTest extends FunSuite with SharedSparkContext {

  private val tmpDir = new File(Properties.envOrElse("TMPDIR", "/tmp"))

  test("Virtual Screening") {

    sc.hadoopConfiguration.set("textinputformat.record.delimiter", "\n$$$$\n")
    val mols = sc.textFile(getClass.getResource("sdf/molecules.sdf").getPath)

    // Parallel execution with MaRe
    val hitsParallel = new MaRe(mols)
      .map(
        inputMountPoint = TextFile("/input.sdf", "\n$$$$\n"),
        outputMountPoint = TextFile("/output.sdf", "\n$$$$\n"),
        imageName = "mcapuccini/oe:latest",
        command = "fred -receptor /var/openeye/hiv1_protease.oeb " +
          "-hitlist_size 0 " +
          "-conftest none " +
          "-dock_resolution Low " +
          "-dbase /input.sdf " +
          "-docked_molecule_file /output.sdf")
      .reduce(
        inputMountPoint = TextFile("/input.sdf", "\n$$$$\n"),
        outputMountPoint = TextFile("/output.sdf", "\n$$$$\n"),
        imageName = "mcapuccini/sdsorter:latest",
        command = "sdsorter -reversesort='FRED Chemgauss4 score' " +
          "-keep-tag='FRED Chemgauss4 score' " +
          "-nbest=30 " +
          "/input.sdf " +
          "/output.sdf")
      .rdd.collect.mkString("\n$$$$\n")

    // Serial execution
    val inputFile = new File(getClass.getResource("sdf/molecules.sdf").getPath)
    val dockedFile = new File(tmpDir, "mare_test_" + UUID.randomUUID.toString)
    dockedFile.createNewFile
    dockedFile.deleteOnExit
    val outputFile = new File(tmpDir, "mare_test_" + UUID.randomUUID.toString)
    outputFile.createNewFile
    outputFile.deleteOnExit
    DockerHelper.run(
      imageName = "mcapuccini/oe:latest",
      command = "fred -receptor /var/openeye/hiv1_protease.oeb " +
        "-hitlist_size 0 " +
        "-conftest none " +
        "-dock_resolution Low " +
        "-dbase /input.sdf " +
        "-docked_molecule_file /docked.sdf",
      bindFiles = Seq(inputFile, dockedFile),
      volumeFiles = Seq(new File("/input.sdf"), new File("/docked.sdf")),
      forcePull = false)
    DockerHelper.run(
      imageName = "mcapuccini/sdsorter:latest",
      command = "sdsorter -reversesort='FRED Chemgauss4 score' " +
        "-keep-tag='FRED Chemgauss4 score' " +
        "-nbest=30 " +
        "/docked.sdf " +
        "/output.sdf",
      bindFiles = Seq(dockedFile, outputFile),
      volumeFiles = Seq(new File("/docked.sdf"), new File("/output.sdf")),
      forcePull = false)
    val hitsSerial = Source.fromFile(outputFile).mkString

    // Test
    val parallel = SDFUtils.parseIDsAndScores(hitsParallel)
    val serial = SDFUtils.parseIDsAndScores(hitsSerial)
    assert(parallel.deep == serial.deep)
  }
}
Example 41
Source File: LogisticRegressionTest.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg

import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import se.uu.farmbio.cp.ICP
import se.uu.farmbio.cp.TestUtils
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class LogisticRegressionTest extends FunSuite with SharedSparkContext {

  test("test performance") {
    val trainData = TestUtils.generateBinaryData(100, 11)
    val testData = TestUtils.generateBinaryData(30, 22)
    val (calibration, properTrain) = ICP.calibrationSplit(sc.parallelize(trainData), 16)
    val lr = new LogisticRegression(properTrain, 30)
    val model = ICP.trainClassifier(lr, numClasses = 2, calibration)
    assert(TestUtils.testPerformance(model, sc.parallelize(testData)))
  }
}
Example 42
Source File: SVMTest.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg

import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import se.uu.farmbio.cp.ICP
import se.uu.farmbio.cp.TestUtils
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class SVMTest extends FunSuite with SharedSparkContext {

  test("test performance") {
    val trainData = TestUtils.generateBinaryData(100, 11)
    val testData = TestUtils.generateBinaryData(30, 22)
    val (calibration, properTrain) = ICP.calibrationSplit(sc.parallelize(trainData), 16)
    val svm = new SVM(properTrain, 30)
    val model = ICP.trainClassifier(svm, numClasses = 2, calibration)
    assert(TestUtils.testPerformance(model, sc.parallelize(testData)))
  }
}
Example 43
Source File: GBTTest.scala From spark-cp with Apache License 2.0 | 5 votes |
package se.uu.farmbio.cp.alg

import scala.util.Random

import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import se.uu.farmbio.cp.ICP
import se.uu.farmbio.cp.TestUtils
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class GBTTest extends FunSuite with SharedSparkContext {

  Random.setSeed(11)

  test("test performance") {
    val trainData = TestUtils.generateBinaryData(100, 11)
    val testData = TestUtils.generateBinaryData(30, 22)
    val (calibration, properTrain) = ICP.calibrationSplit(sc.parallelize(trainData), 16)
    val gbt = new GBT(properTrain, 30)
    val model = ICP.trainClassifier(gbt, numClasses = 2, calibration)
    assert(TestUtils.testPerformance(model, sc.parallelize(testData)))
  }
}
Example 44
Source File: PythonBroadcastSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 45
Source File: SerDeUtilSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 46
Source File: GenericAvroSerializerSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
}
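The fingerprint test above relies on SparkConf.registerAvroSchemas, which is also how an application opts into the smaller fingerprint encoding. A minimal configuration sketch, assuming the same testRecord schema:

import org.apache.avro.SchemaBuilder
import org.apache.spark.SparkConf

// Registering the schema up front lets GenericAvroSerializer send a
// fingerprint instead of the full compressed schema with every record.
val avroSchema = SchemaBuilder
  .record("testRecord").fields()
  .requiredString("data")
  .endRecord()
val sparkConf = new SparkConf()
  .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")
  .registerAvroSchemas(avroSchema)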
Example 47
Source File: ProactiveClosureSerializationSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString
  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))
  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))
  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))
  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))
  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 48
Source File: PythonBroadcastSuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 49
Source File: PartitionPruningRDDSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
}
Example 50
Source File: ZippedPartitionsSuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 51
Source File: PythonBroadcastSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.{Matchers, FunSuite}

import org.apache.spark.{SharedSparkContext, SparkConf}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 52
Source File: SerDeUtilSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import org.scalatest.FunSuite

import org.apache.spark.SharedSparkContext

class SerDeUtilSuite extends FunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 53
Source File: ProactiveClosureSerializationSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import org.scalatest.FunSuite

import org.apache.spark.{SharedSparkContext, SparkException}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T) = x.toString

  def pred[T](x: T) = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends FunSuite with SharedSparkContext {

  def fixture = (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))
  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))
  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))
  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))
  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 54
Source File: FlatmapIteratorSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.storage

import org.scalatest.FunSuite

import org.apache.spark.{SharedSparkContext, SparkConf, LocalSparkContext, SparkContext}

class FlatmapIteratorSuite extends FunSuite with LocalSparkContext {

  test("Flatmap Iterator to Disk") {
    val sconf = new SparkConf().setMaster("local").setAppName("iterator_to_disk_test")
    sc = new SparkContext(sconf)
    val expand_size = 100
    val data = sc.parallelize((1 to 5).toSeq).
      flatMap( x => Stream.range(0, expand_size))
    var persisted = data.persist(StorageLevel.DISK_ONLY)
    assert(persisted.count()===500)
    assert(persisted.filter(_==1).count()===5)
  }

  test("Flatmap Iterator to Memory") {
    val sconf = new SparkConf().setMaster("local").setAppName("iterator_to_disk_test")
    sc = new SparkContext(sconf)
    val expand_size = 100
    val data = sc.parallelize((1 to 5).toSeq).
      flatMap(x => Stream.range(0, expand_size))
    var persisted = data.persist(StorageLevel.MEMORY_ONLY)
    assert(persisted.count()===500)
    assert(persisted.filter(_==1).count()===5)
  }

  test("Serializer Reset") {
    val sconf = new SparkConf().setMaster("local").setAppName("serializer_reset_test")
      .set("spark.serializer.objectStreamReset", "10")
    sc = new SparkContext(sconf)
    val expand_size = 500
    val data = sc.parallelize(Seq(1, 2)).
      flatMap(x => Stream.range(1, expand_size).
        map(y => "%d: string test %d".format(y, x)))
    var persisted = data.persist(StorageLevel.MEMORY_ONLY_SER)
    assert(persisted.filter(_.startsWith("1:")).count()===2)
  }
}
Example 55
Source File: PartitionwiseSampledRDDSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.scalatest.FunSuite

import org.apache.spark.SharedSparkContext
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone = new MockSampler
}

class PartitionwiseSampledRDDSuite extends FunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 56
Source File: PartitionPruningRDDSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.scalatest.FunSuite

import org.apache.spark.{Partition, SharedSparkContext, TaskContext}

class PartitionPruningRDDSuite extends FunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }

  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)
    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)
    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index = i
  def testValue = this.value
}
Example 57
Source File: ZippedPartitionsSuite.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.SharedSparkContext
import org.scalatest.FunSuite

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends FunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
}
Example 58
Source File: PythonBroadcastSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
}
Example 59
Source File: SerDeUtilSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
}
Example 60
Source File: GenericAvroSerializerSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
}
Example 61
Source File: ProactiveClosureSerializationSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD

class UnserializableClass {
  def op[T](x: T): String = x.toString
  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each
  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))
  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))
  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))
  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))
  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
}
Example 62
Source File: PartitionwiseSampledRDDSuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}