org.apache.spark.util.random.RandomSampler Scala Examples
The following examples show how to use org.apache.spark.util.random.RandomSampler.
You can go to the original project or source file by following the links above each example.
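Before the examples, a quick orientation: a RandomSampler[T, U] is a serializable, cloneable transformer that turns an Iterator[T] into a sampled Iterator[U]; built-in implementations such as BernoulliSampler and PoissonSampler appear throughout the examples below. A minimal standalone sketch of direct use (illustrative only, not taken from any of the projects below):

import org.apache.spark.util.random.BernoulliSampler

object RandomSamplerSketch {
  def main(args: Array[String]): Unit = {
    // Keep each item with probability 0.5; the exact count varies with the seed.
    val sampler = new BernoulliSampler[Int](0.5)
    sampler.setSeed(42L) // fix the seed for reproducibility
    val sampled = sampler.sample(Iterator.range(0, 100))
    println(sampled.size) // prints a value near 50
  }
}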
Example 1
Source File: PartitionwiseSampledRDDSuite.scala From iolap with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 2
Source File: WeightedRandomSampler.scala From sona with Apache License 2.0
package com.tencent.angel.sona.graph.utils

import java.util.Random

import org.apache.spark.util.SparkUtil
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

abstract class WeightedRandomSampler[T: ClassTag, U: ClassTag] extends RandomSampler[(T, Float), U] {

  protected var fraction = 0.0

  override def sample(items: Iterator[(T, Float)]): Iterator[U] = {
    items.filter(x => sample(x._2) > 0).asInstanceOf[Iterator[U]]
  }

  def sample(weight: Float): Int

  override def sample(): Int = ???

  def setFraction(fraction: Double): Unit = {
    require(
      fraction >= (0.0 - 1e-6) && fraction <= (1.0 + 1e-6),
      s"Sampling fraction ($fraction) must be on interval [0, 1]")
    this.fraction = fraction
  }

  override def clone: WeightedRandomSampler[T, U] = ???
}

class NaiveWeightedBernoulliSampler[T: ClassTag] extends WeightedRandomSampler[T, (T, Float)] {

  private val rng: Random = SparkUtil.getXORShiftRandom(System.nanoTime)

  override def setSeed(seed: Long): Unit = rng.setSeed(seed)

  def sample(weight: Float): Int = {
    if (fraction <= 0.0) {
      0
    } else if (fraction >= 1.0) {
      1
    } else {
      if (rng.nextDouble() <= fraction * weight) {
        1
      } else {
        0
      }
    }
  }

  override def clone: NaiveWeightedBernoulliSampler[T] = new NaiveWeightedBernoulliSampler[T]
}
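A short usage sketch for the weighted sampler above (hypothetical values; it assumes the two classes in this example are compiled on the classpath). Each element carries a Float weight, and the effective keep probability is fraction * weight, so heavier elements survive more often:

// Hypothetical driver snippet for NaiveWeightedBernoulliSampler (defined above).
val sampler = new NaiveWeightedBernoulliSampler[String]
sampler.setSeed(42L)
sampler.setFraction(0.5)

// "b" is kept with probability 0.5 * 0.9, "a" with probability 0.5 * 0.1.
val items = Iterator(("a", 0.1f), ("b", 0.9f), ("c", 0.5f))
val kept: Seq[(String, Float)] = sampler.sample(items).toSeq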
Example 3
Source File: PartitionwiseSampledRDD.scala From sona with Apache License 2.0
package com.tencent.angel.sona.ml.rdd

import java.util.Random

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.rdd.RDD
import org.apache.spark.util.random.RandomSampler

import scala.reflect.ClassTag

private[sona] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

/**
 * An RDD sampled from its parent RDD partition-wise. For each partition of the parent RDD,
 * a user-specified [[org.apache.spark.util.random.RandomSampler]] instance is used to obtain
 * a random sample of the records in the partition. The random seeds assigned to the samplers
 * are guaranteed to have different values.
 *
 * @param prev RDD to be sampled
 * @param sampler a random sampler
 * @param preservesPartitioning whether the sampler preserves the partitioner of the parent RDD
 * @param seed random seed
 * @tparam T input RDD item type
 * @tparam U sampled RDD item type
 */
private[sona] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = (new Random).nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
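The scaladoc above captures the core mechanism: getPartitions draws one seed per parent partition from a root seed, and compute clones the sampler before seeding it, so concurrent tasks never share sampler state. A sketch of constructing such an RDD (hypothetical driver code; the class is private[sona], so outside that package you would use RDD.sample or Spark's own PartitionwiseSampledRDD instead):

import org.apache.spark.SparkContext
import org.apache.spark.util.random.BernoulliSampler

val sc = new SparkContext("local[2]", "partitionwise-sampling-sketch")
val data = sc.parallelize(1 to 1000, numSlices = 4)

// Each of the 4 partitions gets its own cloned BernoulliSampler with a distinct seed.
val sampled = new PartitionwiseSampledRDD[Int, Int](
  data, new BernoulliSampler[Int](0.1), preservesPartitioning = false, seed = 7L)

println(sampled.count()) // roughly 100 of the 1000 elements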
Example 4
Source File: PartitionwiseSampledRDDSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 5
Source File: PartitionwiseSampledRDD.scala From BigDatalog with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 6
Source File: PartitionwiseSampledRDDSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 7
Source File: PartitionwiseSampledRDD.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.RandomSampler

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 8
Source File: PartitionwiseSampledRDDSuite.scala From spark1.52 with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 9
Source File: PartitionwiseSampledRDD.scala From spark1.52 with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 10
Source File: PartitionwiseSampledRDD.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 11
Source File: PartitionwiseSampledRDD.scala From iolap with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 12
Source File: PartitionwiseSampledRDDSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 13
Source File: PartitionwiseSampledRDD.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 14
Source File: PartitionwiseSampledRDDSuite.scala From SparkCore with Apache License 2.0
package org.apache.spark.rdd

import org.scalatest.FunSuite

import org.apache.spark.SharedSparkContext
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone = new MockSampler
}

class PartitionwiseSampledRDDSuite extends FunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 15
Source File: PartitionwiseSampledRDD.scala From SparkCore with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    @transient preservesPartitioning: Boolean,
    @transient seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 16
Source File: PartitionwiseSampledRDDSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}
Example 17
Source File: PartitionwiseSampledRDD.scala From sparkoscope with Apache License 2.0
package org.apache.spark.rdd

import java.util.Random

import scala.reflect.ClassTag

import org.apache.spark.{Partition, TaskContext}
import org.apache.spark.util.random.RandomSampler
import org.apache.spark.util.Utils

private[spark] class PartitionwiseSampledRDDPartition(val prev: Partition, val seed: Long)
  extends Partition with Serializable {
  override val index: Int = prev.index
}

private[spark] class PartitionwiseSampledRDD[T: ClassTag, U: ClassTag](
    prev: RDD[T],
    sampler: RandomSampler[T, U],
    preservesPartitioning: Boolean,
    @transient private val seed: Long = Utils.random.nextLong)
  extends RDD[U](prev) {

  @transient override val partitioner = if (preservesPartitioning) prev.partitioner else None

  override def getPartitions: Array[Partition] = {
    val random = new Random(seed)
    firstParent[T].partitions.map(x => new PartitionwiseSampledRDDPartition(x, random.nextLong()))
  }

  override def getPreferredLocations(split: Partition): Seq[String] =
    firstParent[T].preferredLocations(split.asInstanceOf[PartitionwiseSampledRDDPartition].prev)

  override def compute(splitIn: Partition, context: TaskContext): Iterator[U] = {
    val split = splitIn.asInstanceOf[PartitionwiseSampledRDDPartition]
    val thisSampler = sampler.clone
    thisSampler.setSeed(split.seed)
    thisSampler.sample(firstParent[T].iterator(split.prev, context))
  }
}
Example 18
Source File: PartitionwiseSampledRDDSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.rdd

import org.apache.spark.{SharedSparkContext, SparkFunSuite}
import org.apache.spark.util.random.{BernoulliSampler, PoissonSampler, RandomSampler}

class MockSampler extends RandomSampler[Long, Long] {

  private var s: Long = _

  override def setSeed(seed: Long) {
    s = seed
  }

  override def sample(): Int = 1

  override def sample(items: Iterator[Long]): Iterator[Long] = {
    Iterator(s)
  }

  override def clone: MockSampler = new MockSampler
}

class PartitionwiseSampledRDDSuite extends SparkFunSuite with SharedSparkContext {

  test("seed distribution") {
    val rdd = sc.makeRDD(Array(1L, 2L, 3L, 4L), 2)
    val sampler = new MockSampler
    val sample = new PartitionwiseSampledRDD[Long, Long](rdd, sampler, false, 0L)
    assert(sample.distinct().count == 2, "Seeds must be different.")
  }

  test("concurrency") {
    // SPARK-2251: zip with self computes each partition twice.
    // We want to make sure there are no concurrency issues.
    val rdd = sc.parallelize(0 until 111, 10)
    for (sampler <- Seq(new BernoulliSampler[Int](0.5), new PoissonSampler[Int](0.5))) {
      val sampled = new PartitionwiseSampledRDD[Int, Int](rdd, sampler, true)
      sampled.zip(sampled).count()
    }
  }
}