org.apache.spark.SharedSparkContext Scala Examples

The following examples show how to use org.apache.spark.SharedSparkContext. Each example is taken from the test sources of an open-source Spark distribution or fork, which is named in the header above the code.
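
SharedSparkContext is a trait from Spark's own test sources: it creates a single SparkContext before a suite runs, exposes it through the sc field, and stops it after the suite finishes, so tests share one context instead of paying the startup cost per test. Below is a minimal sketch of a suite built on it; the package, suite name, and test body are illustrative rather than taken from any project listed here, and it assumes the spark-core test-jar is on the test classpath (SparkFunSuite is private[spark], so the suite must sit under the org.apache.spark package).

package org.apache.spark.examples

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

// Minimal sketch: SharedSparkContext supplies `sc`, created once before the
// suite starts and stopped after it completes.
class WordCountSketchSuite extends SparkFunSuite with SharedSparkContext {
  test("counts words across partitions") {
    val words = sc.parallelize(Seq("a", "b", "a"), 2)
    val counts = words.map((_, 1)).reduceByKey(_ + _).collectAsMap()
    assert(counts("a") === 2)
    assert(counts("b") === 1)
  }
}
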
Example 1
Source File: SerDeUtilSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 2
Source File: ZippedPartitionsSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
Example 3
Source File: PythonBroadcastSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
Example 4
Source File: SerDeUtilSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 5
Source File: ProactiveClosureSerializationSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))

} 
Example 6
Source File: PartitionwiseSampledRDDSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
} 
Example 7
Source File: PartitionPruningRDDSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 8
Source File: ZippedPartitionsSuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
Example 9
Source File: PythonBroadcastSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
Example 10
Source File: SerDeUtilSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 11
Source File: GenericAvroSerializerSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Output, Input}
import org.apache.avro.{SchemaBuilder, Schema}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SparkFunSuite, SharedSparkContext}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {//模式压缩与解压缩
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {//记录序列化和反序列化
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {//缓存之前模式
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 12
Source File: ProactiveClosureSerializationSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))

} 
Example 13
Source File: PartitionwiseSampledRDDSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {//分布式种子
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {//并发
    // SPARK-2251: zip with self computes each partition twice.
    //用自己计算每个分区的两倍
    // We want to make sure there are no concurrency issues.
    //我们要确保没有并发问题
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
} 
Example 14
Source File: PartitionPruningRDDSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {//修剪的分区设置的正确性

    val rdd = new RDD[Int](sc, Nil) {//列表结尾为Nil
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {//修剪分区可以联合
  //列表结尾为Nil
    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 15
Source File: ZippedPartitionsSuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {//打印大小
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
Example 16
Source File: PythonBroadcastSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
Example 17
Source File: PartitionPruningRDDSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 18
Source File: GenericAvroSerializerSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 19
Source File: ProactiveClosureSerializationSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))

} 
Example 20
Source File: PartitionwiseSampledRDDSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
} 
Example 21
Source File: PartitionPruningRDDSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 22
Source File: ZippedPartitionsSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
Example 23
Source File: ChunkedByteBufferSuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.io

import java.nio.ByteBuffer

import com.google.common.io.ByteStreams

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.internal.config
import org.apache.spark.network.util.ByteArrayWritableChannel
import org.apache.spark.util.io.ChunkedByteBuffer

class ChunkedByteBufferSuite extends SparkFunSuite with SharedSparkContext {

  test("no chunks") {
    val emptyChunkedByteBuffer = new ChunkedByteBuffer(Array.empty[ByteBuffer])
    assert(emptyChunkedByteBuffer.size === 0)
    assert(emptyChunkedByteBuffer.getChunks().isEmpty)
    assert(emptyChunkedByteBuffer.toArray === Array.empty)
    assert(emptyChunkedByteBuffer.toByteBuffer.capacity() === 0)
    assert(emptyChunkedByteBuffer.toNetty.capacity() === 0)
    emptyChunkedByteBuffer.toInputStream(dispose = false).close()
    emptyChunkedByteBuffer.toInputStream(dispose = true).close()
  }

  test("getChunks() duplicates chunks") {
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(8)))
    chunkedByteBuffer.getChunks().head.position(4)
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }

  test("copy() does not affect original buffer's position") {
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(8)))
    chunkedByteBuffer.copy(ByteBuffer.allocate)
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }

  test("writeFully() does not affect original buffer's position") {
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(8)))
    chunkedByteBuffer.writeFully(new ByteArrayWritableChannel(chunkedByteBuffer.size.toInt))
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }

  test("SPARK-24107: writeFully() write buffer which is larger than bufferWriteChunkSize") {
    try {
      sc.conf.set(config.BUFFER_WRITE_CHUNK_SIZE, 32L * 1024L * 1024L)
      val chunkedByteBuffer = new ChunkedByteBuffer(Array(ByteBuffer.allocate(40 * 1024 * 1024)))
      val byteArrayWritableChannel = new ByteArrayWritableChannel(chunkedByteBuffer.size.toInt)
      chunkedByteBuffer.writeFully(byteArrayWritableChannel)
      assert(byteArrayWritableChannel.length() === chunkedByteBuffer.size)
    } finally {
      sc.conf.remove(config.BUFFER_WRITE_CHUNK_SIZE)
    }
  }

  test("toArray()") {
    val empty = ByteBuffer.wrap(Array.empty[Byte])
    val bytes = ByteBuffer.wrap(Array.tabulate(8)(_.toByte))
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(bytes, bytes, empty))
    assert(chunkedByteBuffer.toArray === bytes.array() ++ bytes.array())
  }

  test("toArray() throws UnsupportedOperationException if size exceeds 2GB") {
    val fourMegabyteBuffer = ByteBuffer.allocate(1024 * 1024 * 4)
    fourMegabyteBuffer.limit(fourMegabyteBuffer.capacity())
    val chunkedByteBuffer = new ChunkedByteBuffer(Array.fill(1024)(fourMegabyteBuffer))
    assert(chunkedByteBuffer.size === (1024L * 1024L * 1024L * 4L))
    intercept[UnsupportedOperationException] {
      chunkedByteBuffer.toArray
    }
  }

  test("toInputStream()") {
    val empty = ByteBuffer.wrap(Array.empty[Byte])
    val bytes1 = ByteBuffer.wrap(Array.tabulate(256)(_.toByte))
    val bytes2 = ByteBuffer.wrap(Array.tabulate(128)(_.toByte))
    val chunkedByteBuffer = new ChunkedByteBuffer(Array(empty, bytes1, bytes2))
    assert(chunkedByteBuffer.size === bytes1.limit() + bytes2.limit())

    val inputStream = chunkedByteBuffer.toInputStream(dispose = false)
    val bytesFromStream = new Array[Byte](chunkedByteBuffer.size.toInt)
    ByteStreams.readFully(inputStream, bytesFromStream)
    assert(bytesFromStream === bytes1.array() ++ bytes2.array())
    assert(chunkedByteBuffer.getChunks().head.position() === 0)
  }
} 
Example 24
Source File: SQLContextSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.sql

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SQLContextSuite extends SparkFunSuite with SharedSparkContext {

  test("getOrCreate instantiates SQLContext") {
    val sqlContext = SQLContext.getOrCreate(sc)
    assert(sqlContext != null, "SQLContext.getOrCreate returned null")
    assert(SQLContext.getOrCreate(sc).eq(sqlContext),
      "SQLContext created by SQLContext.getOrCreate not returned by SQLContext.getOrCreate")
  }

  test("getOrCreate return the original SQLContext") {
    val sqlContext = SQLContext.getOrCreate(sc)
    val newSession = sqlContext.newSession()
    assert(SQLContext.getOrCreate(sc).eq(sqlContext),
      "SQLContext.getOrCreate after explicitly created SQLContext did not return the context")
    SQLContext.setActive(newSession)
    assert(SQLContext.getOrCreate(sc).eq(newSession),
      "SQLContext.getOrCreate after explicitly setActive() did not return the active context")
  }

  test("Sessions of SQLContext") {
    val sqlContext = SQLContext.getOrCreate(sc)
    val session1 = sqlContext.newSession()
    val session2 = sqlContext.newSession()

    // all have the default configurations
    val key = SQLConf.SHUFFLE_PARTITIONS.key
    assert(session1.getConf(key) === session2.getConf(key))
    session1.setConf(key, "1")
    session2.setConf(key, "2")
    assert(session1.getConf(key) === "1")
    assert(session2.getConf(key) === "2")

    // temporary table should not be shared
    val df = session1.range(10)
    df.registerTempTable("test1")
    assert(session1.tableNames().contains("test1"))
    assert(!session2.tableNames().contains("test1"))

    // UDF should not be shared
    def myadd(a: Int, b: Int): Int = a + b
    session1.udf.register[Int, Int, Int]("myadd", myadd)
    session1.sql("select myadd(1, 2)").explain()
    intercept[AnalysisException] {
      session2.sql("select myadd(1, 2)").explain()
    }
  }

  test("SPARK-13390: createDataFrame(java.util.List[_],Class[_]) NotSerializableException") {
    val rows = new java.util.ArrayList[IntJavaBean]()
    rows.add(new IntJavaBean(1))
    val sqlContext = SQLContext.getOrCreate(sc)
    // Without the fix for SPARK-13390, this will throw NotSerializableException
    sqlContext.createDataFrame(rows, classOf[IntJavaBean]).groupBy("int").count().collect()
  }
}

class IntJavaBean(private var i: Int) extends Serializable {

  def getInt(): Int = i

  def setInt(i: Int): Unit = {
    this.i = i
  }
} 
Example 25
Source File: PythonBroadcastSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
Example 26
Source File: SerDeUtilSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 27
Source File: GenericAvroSerializerSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Output, Input}
import org.apache.avro.{SchemaBuilder, Schema}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SparkFunSuite, SharedSparkContext}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 28
Source File: ProactiveClosureSerializationSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))

} 
Example 29
Source File: PartitionwiseSampledRDDSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
} 
Example 30
Source File: PartitionPruningRDDSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 31
Source File: ZippedPartitionsSuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
Example 32
Source File: PartitionwiseSampledRDDSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
} 
Example 33
Source File: SerDeUtilSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 34
Source File: GenericAvroSerializerSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
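The fingerprint test passes because conf.registerAvroSchemas pre-registers the schema, letting GenericAvroSerializer send a short fingerprint in place of the full schema JSON. A minimal sketch of the configuration side (the wiring around it is illustrative):

import org.apache.avro.SchemaBuilder
import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer

object AvroRegistrationSketch {
  def main(args: Array[String]): Unit = {
    val schema = SchemaBuilder.record("testRecord").fields()
      .requiredString("data")
      .endRecord()
    val conf = new SparkConf()
      .set("spark.serializer", classOf[KryoSerializer].getName)
      .registerAvroSchemas(schema) // records of this schema now ship as fingerprints
    println(conf.get("spark.serializer"))
  }
}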
Example 35
Source File: ProactiveClosureSerializationSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))

} 
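The usual fix for these proactive failures is to stop capturing an unserializable instance: either make the helper Serializable, or move the logic to a top-level object, which is resolved per-JVM rather than shipped with the closure. A minimal sketch under the same setup as the suite above (names are illustrative):

import org.apache.spark.{SparkConf, SparkContext}

// Option A: a serializable helper can be captured and shipped with the task.
class SerializableOps extends Serializable {
  def op[T](x: T): String = x.toString
}

// Option B: a top-level object is accessed statically on each executor,
// so closures calling StaticOps.op capture nothing.
object StaticOps {
  def op[T](x: T): String = x.toString
}

object ClosureFixSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("closure-fix"))
    val data = sc.parallelize(0 until 1000).map(_.toString)
    val ops = new SerializableOps
    assert(data.map(ops.op(_)).count() == 1000)       // ships ops with the task
    assert(data.map(StaticOps.op(_)).count() == 1000) // captures nothing
    sc.stop()
  }
}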
Example 36
Source File: PartitionwiseSampledRDDSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
} 
Example 37
Source File: PartitionPruningRDDSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
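PartitionPruningRDD.create (a DeveloperApi) takes a predicate over parent partition indices and yields an RDD containing only the partitions that pass; the pruned-away partitions are never computed, and locality preferences carry over as the first test checks. A minimal sketch against a plain parallelized RDD, with illustrative sizes:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.rdd.PartitionPruningRDD

object PruningSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("pruning"))
    val rdd = sc.parallelize(1 to 100, 10) // 10 partitions of 10 elements
    // Keep only even-indexed parent partitions; the others are never computed.
    val pruned = PartitionPruningRDD.create(rdd, idx => idx % 2 == 0)
    assert(pruned.partitions.length == 5)
    assert(pruned.count() == 50)
    sc.stop()
  }
}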
Example 38
Source File: ZippedPartitionsSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
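zipPartitions only requires matching partition counts, not matching element counts, and hands the function one iterator per RDD for each partition, which is why the test above can zip RDDs of 4, 6, and 2 elements. A minimal two-RDD sketch with illustrative data:

import org.apache.spark.{SparkConf, SparkContext}

object ZipPartitionsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("zip-partitions"))
    val nums  = sc.parallelize(1 to 6, 2)                  // partitions of 3 and 3
    val words = sc.parallelize(Seq("a", "b", "c", "d"), 2) // partitions of 2 and 2
    // Emit one (numCount, wordCount) pair per partition.
    val counts = nums.zipPartitions(words) { (ni, wi) => Iterator((ni.size, wi.size)) }
    assert(counts.collect().sameElements(Array((3, 2), (3, 2))))
    sc.stop()
  }
}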
Example 39
Source File: X2PSuite.scala    From spark-tsne   with Apache License 2.0
package com.github.saurfang.spark.tsne

import org.apache.spark.SharedSparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.scalatest.{FunSuite, Matchers}


class X2PSuite extends FunSuite with SharedSparkContext with Matchers {

  test("Test X2P against tsne.jl implementation") {
    val input = new RowMatrix(
      sc.parallelize(Seq(1 to 3, 4 to 6, 7 to 9, 10 to 12))
        .map(x => Vectors.dense(x.map(_.toDouble).toArray))
    )
    val output = X2P(input, 1e-5, 2).toRowMatrix().rows.collect().map(_.toArray.toList)
    println(output.toList)
    //output shouldBe List(List(0, .5, .5), List(.5, 0, .5), List(.5, .5, .0))
  }
} 
Example 40
Source File: VirtualScreeningTest.scala    From MaRe   with Apache License 2.0
package se.uu.it.mare

import java.io.File
import java.util.UUID

import scala.io.Source
import scala.util.Properties

import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import org.scalatest.junit.JUnitRunner

private object SDFUtils {
  def parseIDsAndScores(sdf: String): Array[(String, String)] = {
    sdf.split("\\n\\$\\$\\$\\$\\n").map { mol =>
      val lines = mol.split("\\n")
      (lines(0), lines.last)
    }
  }
}

@RunWith(classOf[JUnitRunner])
class VirtualScreeningTest extends FunSuite with SharedSparkContext {

  private val tmpDir = new File(Properties.envOrElse("TMPDIR", "/tmp"))

  test("Virtual Screening") {

    sc.hadoopConfiguration.set("textinputformat.record.delimiter", "\n$$$$\n")
    val mols = sc.textFile(getClass.getResource("sdf/molecules.sdf").getPath)

    // Parallel execution with MaRe
    val hitsParallel = new MaRe(mols)
      .map(
        inputMountPoint = TextFile("/input.sdf", "\n$$$$\n"),
        outputMountPoint = TextFile("/output.sdf", "\n$$$$\n"),
        imageName = "mcapuccini/oe:latest",
        command = "fred -receptor /var/openeye/hiv1_protease.oeb " +
          "-hitlist_size 0 " +
          "-conftest none " +
          "-dock_resolution Low " +
          "-dbase /input.sdf " +
          "-docked_molecule_file /output.sdf")
      .reduce(
        inputMountPoint = TextFile("/input.sdf", "\n$$$$\n"),
        outputMountPoint = TextFile("/output.sdf", "\n$$$$\n"),
        imageName = "mcapuccini/sdsorter:latest",
        command = "sdsorter -reversesort='FRED Chemgauss4 score' " +
          "-keep-tag='FRED Chemgauss4 score' " +
          "-nbest=30 " +
          "/input.sdf " +
          "/output.sdf")
      .rdd.collect.mkString("\n$$$$\n")

    // Serial execution
    val inputFile = new File(getClass.getResource("sdf/molecules.sdf").getPath)
    val dockedFile = new File(tmpDir, "mare_test_" + UUID.randomUUID.toString)
    dockedFile.createNewFile
    dockedFile.deleteOnExit
    val outputFile = new File(tmpDir, "mare_test_" + UUID.randomUUID.toString)
    outputFile.createNewFile
    outputFile.deleteOnExit
    DockerHelper.run(
      imageName = "mcapuccini/oe:latest",
      command = "fred -receptor /var/openeye/hiv1_protease.oeb " +
        "-hitlist_size 0 " +
        "-conftest none " +
        "-dock_resolution Low " +
        "-dbase /input.sdf " +
        "-docked_molecule_file /docked.sdf",
      bindFiles = Seq(inputFile, dockedFile),
      volumeFiles = Seq(new File("/input.sdf"), new File("/docked.sdf")),
      forcePull = false)
    DockerHelper.run(
      imageName = "mcapuccini/sdsorter:latest",
      command = "sdsorter -reversesort='FRED Chemgauss4 score' " +
        "-keep-tag='FRED Chemgauss4 score' " +
        "-nbest=30 " +
        "/docked.sdf " +
        "/output.sdf",
      bindFiles = Seq(dockedFile, outputFile),
      volumeFiles = Seq(new File("/docked.sdf"), new File("/output.sdf")),
      forcePull = false)
    val hitsSerial = Source.fromFile(outputFile).mkString

    // Test
    val parallel = SDFUtils.parseIDsAndScores(hitsParallel)
    val serial = SDFUtils.parseIDsAndScores(hitsSerial)
    assert(parallel.deep == serial.deep)

  }

} 
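The parallel path above works because Hadoop's text input format can split records on an arbitrary delimiter: setting textinputformat.record.delimiter to the SDF terminator makes each molecule one RDD element. A minimal sketch of just that mechanism (the file path is illustrative):

import org.apache.spark.{SparkConf, SparkContext}

object SdfRecordsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("sdf-records"))
    // Split input on the SDF molecule terminator instead of newlines,
    // so each RDD element is one whole molecule block.
    sc.hadoopConfiguration.set("textinputformat.record.delimiter", "\n$$$$\n")
    val mols = sc.textFile("/path/to/molecules.sdf")
    println(mols.count())
    sc.stop()
  }
}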
Example 41
Source File: LogisticRegressionTest.scala    From spark-cp   with Apache License 2.0
package se.uu.farmbio.cp.alg

import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import se.uu.farmbio.cp.ICP
import se.uu.farmbio.cp.TestUtils
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class LogisticRegressionTest extends FunSuite with SharedSparkContext {

  test("test performance") {
    val trainData = TestUtils.generateBinaryData(100, 11)
    val testData = TestUtils.generateBinaryData(30, 22)
    val (calibration, properTrain) = ICP.calibrationSplit(sc.parallelize(trainData), 16)  
    val lr = new LogisticRegression(properTrain, 30)
    val model = ICP.trainClassifier(lr, numClasses=2, calibration)
    assert(TestUtils.testPerformance(model, sc.parallelize(testData)))
  }

} 
Example 42
Source File: SVMTest.scala    From spark-cp   with Apache License 2.0
package se.uu.farmbio.cp.alg

import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import se.uu.farmbio.cp.ICP
import se.uu.farmbio.cp.TestUtils
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class SVMTest extends FunSuite with SharedSparkContext {

  test("test performance") {
    val trainData = TestUtils.generateBinaryData(100, 11)
    val testData = TestUtils.generateBinaryData(30, 22)
    val (calibration, properTrain) = ICP.calibrationSplit(sc.parallelize(trainData), 16)  
    val svm = new SVM(properTrain, 30)
    val model = ICP.trainClassifier(svm, numClasses=2, calibration)
    assert(TestUtils.testPerformance(model, sc.parallelize(testData)))
  }

} 
Example 43
Source File: GBTTest.scala    From spark-cp   with Apache License 2.0
package se.uu.farmbio.cp.alg

import scala.util.Random
import org.apache.spark.SharedSparkContext
import org.junit.runner.RunWith
import org.scalatest.FunSuite
import se.uu.farmbio.cp.ICP
import se.uu.farmbio.cp.TestUtils
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class GBTTest extends FunSuite with SharedSparkContext {
  
  Random.setSeed(11)

  test("test performance") {
    val trainData = TestUtils.generateBinaryData(100, 11)
    val testData = TestUtils.generateBinaryData(30, 22)
    val (calibration, properTrain) = ICP.calibrationSplit(sc.parallelize(trainData), 16)  
    val gbt = new GBT(properTrain, 30)
    val model = ICP.trainClassifier(gbt, numClasses=2, calibration)
    assert(TestUtils.testPerformance(model, sc.parallelize(testData)))
  }

} 
Example 44
Source File: PythonBroadcastSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
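Utils.clone is Spark-internal, but the round trip it performs is just serialize followed by deserialize on a KryoSerializer instance, and KryoSerializer itself is public. A minimal sketch with a plain String (registrationRequired is left off so no class registration is needed):

import org.apache.spark.SparkConf
import org.apache.spark.serializer.KryoSerializer

object KryoRoundTripSketch {
  def main(args: Array[String]): Unit = {
    // No SparkContext needed: a serializer instance round-trips values directly.
    val ser = new KryoSerializer(new SparkConf()).newInstance()
    val bytes = ser.serialize("Hello, world!")
    assert(ser.deserialize[String](bytes) == "Hello, world!")
  }
}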
Example 45
Source File: SerDeUtilSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 46
Source File: GenericAvroSerializerSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 47
Source File: ProactiveClosureSerializationSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))

} 
Example 48
Source File: PythonBroadcastSuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
Example 49
Source File: PartitionPruningRDDSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{Partition, SharedSparkContext, SparkFunSuite, TaskContext}

class PartitionPruningRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index: Int = i
  def testValue: Int = this.value
} 
Example 50
Source File: ZippedPartitionsSuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends SparkFunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
Example 51
Source File: PythonBroadcastSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.api.python

import scala.io.Source

import java.io.{PrintWriter, File}

import org.scalatest.{Matchers, FunSuite}

import org.apache.spark.{SharedSparkContext, SparkConf}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends FunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
Example 52
Source File: SerDeUtilSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.api.python

import org.scalatest.FunSuite

import org.apache.spark.SharedSparkContext

class SerDeUtilSuite extends FunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 53
Source File: ProactiveClosureSerializationSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.serializer

import org.scalatest.FunSuite

import org.apache.spark.{SharedSparkContext, SparkException}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T) = x.toString
  
  def pred[T](x: T) = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends FunSuite with SharedSparkContext {

  def fixture = (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <- 
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"), 
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))
  
} 
Example 54
Source File: FlatmapIteratorSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.storage

import org.scalatest.FunSuite
import org.apache.spark.{LocalSparkContext, SparkConf, SparkContext}


class FlatmapIteratorSuite extends FunSuite with LocalSparkContext {

  test("Flatmap Iterator to Disk") {
    val sconf = new SparkConf().setMaster("local").setAppName("iterator_to_disk_test")
    sc = new SparkContext(sconf)
    val expand_size = 100
    val data = sc.parallelize((1 to 5).toSeq)
      .flatMap(x => Stream.range(0, expand_size))
    val persisted = data.persist(StorageLevel.DISK_ONLY)
    assert(persisted.count() === 500)
    assert(persisted.filter(_ == 1).count() === 5)
  }

  test("Flatmap Iterator to Memory") {
    val sconf = new SparkConf().setMaster("local").setAppName("iterator_to_memory_test")
    sc = new SparkContext(sconf)
    val expand_size = 100
    val data = sc.parallelize((1 to 5).toSeq)
      .flatMap(x => Stream.range(0, expand_size))
    val persisted = data.persist(StorageLevel.MEMORY_ONLY)
    assert(persisted.count() === 500)
    assert(persisted.filter(_ == 1).count() === 5)
  }

  test("Serializer Reset") {
    val sconf = new SparkConf().setMaster("local").setAppName("serializer_reset_test")
      .set("spark.serializer.objectStreamReset", "10")
    sc = new SparkContext(sconf)
    val expand_size = 500
    val data = sc.parallelize(Seq(1, 2))
      .flatMap(x => Stream.range(1, expand_size)
        .map(y => "%d: string test %d".format(y, x)))
    val persisted = data.persist(StorageLevel.MEMORY_ONLY_SER)
    assert(persisted.filter(_.startsWith("1:")).count() === 2)
  }

}
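These tests depend on flatMap returning its expansion lazily (Stream.range), so DISK_ONLY and serialized storage can consume records as they are produced instead of materializing a whole partition in memory. An Iterator gives the same laziness without Stream's memoization; a minimal sketch with illustrative sizes:

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.storage.StorageLevel

object LazyExpandSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local").setAppName("lazy-expand"))
    // Iterator.range is lazy and does not memoize, so a large expansion
    // can spill to disk without ever living in memory all at once.
    val data = sc.parallelize(1 to 5)
      .flatMap(x => Iterator.range(0, 100))
      .persist(StorageLevel.DISK_ONLY)
    assert(data.count() == 500)
    sc.stop()
  }
}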
Example 55
Source File: PartitionwiseSampledRDDSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.rdd

import org.scalatest.FunSuite

import org.apache.spark.SharedSparkContext
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone = new MockSampler
}

class PartitionwiseSampledRDDSuite extends FunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
} 
Example 56
Source File: PartitionPruningRDDSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.rdd

import org.scalatest.FunSuite

import org.apache.spark.{Partition, SharedSparkContext, TaskContext}

class PartitionPruningRDDSuite extends FunSuite with SharedSparkContext {


  test("Pruned Partitions inherit locality prefs correctly") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 1),
          new TestPartition(1, 1),
          new TestPartition(2, 1))
      }

      def compute(split: Partition, context: TaskContext) = {
        Iterator()
      }
    }
    val prunedRDD = PartitionPruningRDD.create(rdd, _ == 2)
    assert(prunedRDD.partitions.length == 1)
    val p = prunedRDD.partitions(0)
    assert(p.index == 0)
    assert(p.asInstanceOf[PartitionPruningRDDPartition].parentSplit.index == 2)
  }


  test("Pruned Partitions can be unioned ") {

    val rdd = new RDD[Int](sc, Nil) {
      override protected def getPartitions = {
        Array[Partition](
          new TestPartition(0, 4),
          new TestPartition(1, 5),
          new TestPartition(2, 6))
      }

      def compute(split: Partition, context: TaskContext) = {
        List(split.asInstanceOf[TestPartition].testValue).iterator
      }
    }
    val prunedRDD1 = PartitionPruningRDD.create(rdd, _ == 0)


    val prunedRDD2 = PartitionPruningRDD.create(rdd, _ == 2)

    val merged = prunedRDD1 ++ prunedRDD2
    assert(merged.count() == 2)
    val take = merged.take(2)
    assert(take.apply(0) == 4)
    assert(take.apply(1) == 6)
  }
}

class TestPartition(i: Int, value: Int) extends Partition with Serializable {
  def index = i

  def testValue = this.value

} 
Example 57
Source File: ZippedPartitionsSuite.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.SharedSparkContext
import org.scalatest.FunSuite

object ZippedPartitionsSuite {
  def procZippedData(i: Iterator[Int], s: Iterator[String], d: Iterator[Double]) : Iterator[Int] = {
    Iterator(i.toArray.size, s.toArray.size, d.toArray.size)
  }
}

class ZippedPartitionsSuite extends FunSuite with SharedSparkContext {
  test("print sizes") {
    val data1 = sc.makeRDD(Array(1, 2, 3, 4), 2)
    val data2 = sc.makeRDD(Array("1", "2", "3", "4", "5", "6"), 2)
    val data3 = sc.makeRDD(Array(1.0, 2.0), 2)

    val zippedRDD = data1.zipPartitions(data2, data3)(ZippedPartitionsSuite.procZippedData)

    val obtainedSizes = zippedRDD.collect()
    val expectedSizes = Array(2, 3, 1, 2, 3, 1)
    assert(obtainedSizes.size == 6)
    assert(obtainedSizes.zip(expectedSizes).forall(x => x._1 == x._2))
  }
} 
Example 58
Source File: PythonBroadcastSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.api.python

import java.io.{File, PrintWriter}

import scala.io.Source

import org.scalatest.Matchers

import org.apache.spark.{SharedSparkContext, SparkConf, SparkFunSuite}
import org.apache.spark.serializer.KryoSerializer
import org.apache.spark.util.Utils

// This test suite uses SharedSparkContext because we need a SparkEnv in order to deserialize
// a PythonBroadcast:
class PythonBroadcastSuite extends SparkFunSuite with Matchers with SharedSparkContext {
  test("PythonBroadcast can be serialized with Kryo (SPARK-4882)") {
    val tempDir = Utils.createTempDir()
    val broadcastedString = "Hello, world!"
    def assertBroadcastIsValid(broadcast: PythonBroadcast): Unit = {
      val source = Source.fromFile(broadcast.path)
      val contents = source.mkString
      source.close()
      contents should be (broadcastedString)
    }
    try {
      val broadcastDataFile: File = {
        val file = new File(tempDir, "broadcastData")
        val printWriter = new PrintWriter(file)
        printWriter.write(broadcastedString)
        printWriter.close()
        file
      }
      val broadcast = new PythonBroadcast(broadcastDataFile.getAbsolutePath)
      assertBroadcastIsValid(broadcast)
      val conf = new SparkConf().set("spark.kryo.registrationRequired", "true")
      val deserializedBroadcast =
        Utils.clone[PythonBroadcast](broadcast, new KryoSerializer(conf).newInstance())
      assertBroadcastIsValid(deserializedBroadcast)
    } finally {
      Utils.deleteRecursively(tempDir)
    }
  }
} 
Example 59
Source File: SerDeUtilSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.api.python

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class SerDeUtilSuite extends SparkFunSuite with SharedSparkContext {

  test("Converting an empty pair RDD to python does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    SerDeUtil.pairRDDToPython(emptyRdd, 10)
  }

  test("Converting an empty python RDD to pair RDD does not throw an exception (SPARK-5441)") {
    val emptyRdd = sc.makeRDD(Seq[(Any, Any)]())
    val javaRdd = emptyRdd.toJavaRDD()
    val pythonRdd = SerDeUtil.javaToPython(javaRdd)
    SerDeUtil.pythonToPairRDD(pythonRdd, false)
  }
} 
Example 60
Source File: GenericAvroSerializerSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.serializer

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}
import java.nio.ByteBuffer

import com.esotericsoftware.kryo.io.{Input, Output}
import org.apache.avro.{Schema, SchemaBuilder}
import org.apache.avro.generic.GenericData.Record

import org.apache.spark.{SharedSparkContext, SparkFunSuite}

class GenericAvroSerializerSuite extends SparkFunSuite with SharedSparkContext {
  conf.set("spark.serializer", "org.apache.spark.serializer.KryoSerializer")

  val schema : Schema = SchemaBuilder
    .record("testRecord").fields()
    .requiredString("data")
    .endRecord()
  val record = new Record(schema)
  record.put("data", "test data")

  test("schema compression and decompression") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    assert(schema === genericSer.decompress(ByteBuffer.wrap(genericSer.compress(schema))))
  }

  test("record serialization and deserialization") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)

    val outputStream = new ByteArrayOutputStream()
    val output = new Output(outputStream)
    genericSer.serializeDatum(record, output)
    output.flush()
    output.close()

    val input = new Input(new ByteArrayInputStream(outputStream.toByteArray))
    assert(genericSer.deserializeDatum(input) === record)
  }

  test("uses schema fingerprint to decrease message size") {
    val genericSerFull = new GenericAvroSerializer(conf.getAvroSchema)

    val output = new Output(new ByteArrayOutputStream())

    val beginningNormalPosition = output.total()
    genericSerFull.serializeDatum(record, output)
    output.flush()
    val normalLength = output.total - beginningNormalPosition

    conf.registerAvroSchemas(schema)
    val genericSerFinger = new GenericAvroSerializer(conf.getAvroSchema)
    val beginningFingerprintPosition = output.total()
    genericSerFinger.serializeDatum(record, output)
    val fingerprintLength = output.total - beginningFingerprintPosition

    assert(fingerprintLength < normalLength)
  }

  test("caches previously seen schemas") {
    val genericSer = new GenericAvroSerializer(conf.getAvroSchema)
    val compressedSchema = genericSer.compress(schema)
    val decompressedSchema = genericSer.decompress(ByteBuffer.wrap(compressedSchema))

    assert(compressedSchema.eq(genericSer.compress(schema)))
    assert(decompressedSchema.eq(genericSer.decompress(ByteBuffer.wrap(compressedSchema))))
  }
} 
Example 61
Source File: ProactiveClosureSerializationSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.serializer

import org.apache.spark.{SharedSparkContext, SparkException, SparkFunSuite}
import org.apache.spark.rdd.RDD


class UnserializableClass {
  def op[T](x: T): String = x.toString

  def pred[T](x: T): Boolean = x.toString.length % 2 == 0
}

class ProactiveClosureSerializationSuite extends SparkFunSuite with SharedSparkContext {

  def fixture: (RDD[String], UnserializableClass) = {
    (sc.parallelize(0 until 1000).map(_.toString), new UnserializableClass)
  }

  test("throws expected serialization exceptions on actions") {
    val (data, uc) = fixture
    val ex = intercept[SparkException] {
      data.map(uc.op(_)).count()
    }
    assert(ex.getMessage.contains("Task not serializable"))
  }

  // There is probably a cleaner way to eliminate boilerplate here, but we're
  // iterating over a map from transformation names to functions that perform that
  // transformation on a given RDD, creating one test case for each

  for (transformation <-
      Map("map" -> xmap _,
          "flatMap" -> xflatMap _,
          "filter" -> xfilter _,
          "mapPartitions" -> xmapPartitions _,
          "mapPartitionsWithIndex" -> xmapPartitionsWithIndex _)) {
    val (name, xf) = transformation

    test(s"$name transformations throw proactive serialization exceptions") {
      val (data, uc) = fixture
      val ex = intercept[SparkException] {
        xf(data, uc)
      }
      assert(ex.getMessage.contains("Task not serializable"),
        s"RDD.$name doesn't proactively throw NotSerializableException")
    }
  }

  private def xmap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.map(y => uc.op(y))

  private def xflatMap(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.flatMap(y => Seq(uc.op(y)))

  private def xfilter(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.filter(y => uc.pred(y))

  private def xmapPartitions(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitions(_.map(y => uc.op(y)))

  private def xmapPartitionsWithIndex(x: RDD[String], uc: UnserializableClass): RDD[String] =
    x.mapPartitionsWithIndex((_, it) => it.map(y => uc.op(y)))

} 
Example 62
Source File: PartitionwiseSampledRDDSuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}


class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}