org.apache.spark.util.random.XORShiftRandom Scala Examples
The following examples show how to use org.apache.spark.util.random.XORShiftRandom.
Each example notes the source file and the project it comes from.
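Before the project examples, here is a minimal self-contained sketch of the XORShiftRandom API itself. It is not taken from any of the projects below; the object name XORShiftRandomDemo and the seed values are made up for illustration. XORShiftRandom extends java.util.Random, and because the class is package-private to Spark, code that uses it directly must live under an org.apache.spark package, as all of the examples below do.

package org.apache.spark.util.random

// Hypothetical demo object; not part of Spark or of any project listed below.
object XORShiftRandomDemo {
  def main(args: Array[String]): Unit = {
    val rng = new XORShiftRandom(42L)   // fixed seed for reproducibility
    val u = rng.nextDouble()            // uniform draw in [0, 1)
    val g = rng.nextGaussian()          // draw from the standard normal distribution
    rng.setSeed(1234L)                  // reseed, as BaggedPoint does once per partition
    println(s"uniform=$u gaussian=$g int=${rng.nextInt(10)}")
  }
}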
Example 1
Source File: BaggedPoint.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.commons.math3.distribution.PoissonDistribution

import org.apache.spark.rdd.RDD
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

// The BaggedPoint class declaration (fields: datum, subsampleWeights) is omitted from
// this excerpt; the methods below belong to its companion object.
private[spark] object BaggedPoint {

  def convertToBaggedRDD[Datum] (
      input: RDD[Datum],
      subsamplingRate: Double,
      numSubsamples: Int,
      withReplacement: Boolean,
      seed: Long = Utils.random.nextLong()): RDD[BaggedPoint[Datum]] = {
    if (withReplacement) {
      convertToBaggedRDDSamplingWithReplacement(input, subsamplingRate, numSubsamples, seed)
    } else {
      if (numSubsamples == 1 && subsamplingRate == 1.0) {
        convertToBaggedRDDWithoutSampling(input)
      } else {
        convertToBaggedRDDSamplingWithoutReplacement(input, subsamplingRate, numSubsamples, seed)
      }
    }
  }

  private def convertToBaggedRDDSamplingWithoutReplacement[Datum] (
      input: RDD[Datum],
      subsamplingRate: Double,
      numSubsamples: Int,
      seed: Long): RDD[BaggedPoint[Datum]] = {
    input.mapPartitionsWithIndex { (partitionIndex, instances) =>
      // Use random seed = seed + partitionIndex + 1 to make generation reproducible.
      val rng = new XORShiftRandom
      rng.setSeed(seed + partitionIndex + 1)
      instances.map { instance =>
        val subsampleWeights = new Array[Double](numSubsamples)
        var subsampleIndex = 0
        while (subsampleIndex < numSubsamples) {
          val x = rng.nextDouble()
          subsampleWeights(subsampleIndex) = {
            if (x < subsamplingRate) 1.0 else 0.0
          }
          subsampleIndex += 1
        }
        new BaggedPoint(instance, subsampleWeights)
      }
    }
  }

  private def convertToBaggedRDDSamplingWithReplacement[Datum] (
      input: RDD[Datum],
      subsample: Double,
      numSubsamples: Int,
      seed: Long): RDD[BaggedPoint[Datum]] = {
    input.mapPartitionsWithIndex { (partitionIndex, instances) =>
      // Use random seed = seed + partitionIndex + 1 to make generation reproducible.
      val poisson = new PoissonDistribution(subsample)
      poisson.reseedRandomGenerator(seed + partitionIndex + 1)
      instances.map { instance =>
        val subsampleWeights = new Array[Double](numSubsamples)
        var subsampleIndex = 0
        while (subsampleIndex < numSubsamples) {
          subsampleWeights(subsampleIndex) = poisson.sample()
          subsampleIndex += 1
        }
        new BaggedPoint(instance, subsampleWeights)
      }
    }
  }

  private def convertToBaggedRDDWithoutSampling[Datum] (
      input: RDD[Datum]): RDD[BaggedPoint[Datum]] = {
    input.map(datum => new BaggedPoint(datum, Array(1.0)))
  }
}
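For context, a hypothetical driver for the method above might look like the following. It is not part of drizzle-spark; the object name, the SparkSession settings, and the parameter values are illustrative, and it assumes compilation inside the org.apache.spark.ml.tree.impl package, since BaggedPoint is package-private. The datum and subsampleWeights fields come from the BaggedPoint class declaration that the excerpt omits.

package org.apache.spark.ml.tree.impl

import org.apache.spark.sql.SparkSession

// Hypothetical driver, for illustration only.
object BaggedPointDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("bagging").getOrCreate()
    val data = spark.sparkContext.parallelize(1 to 100)

    // Three bootstrap subsamples drawn with replacement: each weight is a Poisson(1.0) draw.
    val bagged = BaggedPoint.convertToBaggedRDD(
      data, subsamplingRate = 1.0, numSubsamples = 3, withReplacement = true, seed = 42L)

    bagged.take(5).foreach(p => println(p.datum + " -> " + p.subsampleWeights.mkString(", ")))
    spark.stop()
  }
}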
Example 2
Source File: randomExpressions.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

@ExpressionDescription(
  usage = "_FUNC_(a) - Returns a random column with i.i.d. gaussian random distribution.")
case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to randn must be an integer literal.")
  })

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.copy(code = s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""", isNull = "false")
  }
}
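The codegen above is what backs the SQL randn function: each partition gets its own XORShiftRandom seeded with seed + partitionId, so the column is reproducible for a fixed seed and partitioning. A minimal sketch of reaching this expression through the public API follows (standard Spark SQL, nothing specific to drizzle-spark; the object name is made up):

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.randn

// Hypothetical demo; uses only the public DataFrame/SQL API.
object RandnDemo {
  def main(args: Array[String]): Unit = {
    val spark = SparkSession.builder().master("local[2]").appName("randn").getOrCreate()

    // DataFrame API: a gaussian column generated by the Randn expression above.
    spark.range(0, 4).select(randn(7L).as("g")).show()

    // SQL: the same expression, reached through the randn(seed) function.
    spark.range(0, 4).createOrReplaceTempView("t")
    spark.sql("SELECT randn(7) AS g FROM t").show()

    spark.stop()
  }
}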
Example 3
Source File: TestLPSolver.scala From spark-lp with Apache License 2.0
// Package and import statements are omitted from this excerpt.
object TestLPSolver {
  def main(args: Array[String]) {
    val rnd = new Random(12345)
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLPSolver")
    val sc = new SparkContext(sparkConf)

    val n = 1000          // Transpose constraint matrix row count.
    val m = 100           // Transpose constraint matrix column count.
    val numPartitions = 2

    // Generate the starting vector from uniform distribution U(3.0, 5.0).
    println("generate x")
    val x0 = RandomRDDs.uniformRDD(sc, n, numPartitions)
      .map(v => 3.0 + 2.0 * v).glom.map(new DenseVector(_))

    // Generate the transpose constraint matrix 'B' using sparse uniformly generated values.
    println("generate B")
    val B = new RandomVectorRDD(sc, n, m, numPartitions,
      new SparseStandardNormalGenerator(0.1), rnd.nextLong)

    // Generate the cost vector 'c' using uniformly generated values.
    println("generate c")
    val c = RandomRDDs.uniformRDD(sc, n, numPartitions, rnd.nextLong).glom.map(new DenseVector(_))

    // Compute 'b' using the starting 'x' vector.
    println("generate b")
    val b = (new LinopMatrixAdjoint(B))(x0)

    // Solve the linear program using LP.solve, finding the optimal x vector 'optimalX'.
    println("Start solving ...")
    val (optimalVal, _) = LP.solve(c, B, b, sc = sc)
    println("optimalVal: " + optimalVal)
    //println("optimalX: " + optimalX.collectElements.mkString(", "))

    sc.stop()
  }
}
Example 4
Source File: TestLinearProgram.scala From spark-tfocs with Apache License 2.0
package org.apache.spark.mllib.optimization.tfocs.examples

import scala.util.Random

import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.SolverSLP
import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint
import org.apache.spark.mllib.random.{ RandomDataGenerator, RandomRDDs }
import org.apache.spark.mllib.rdd.RandomVectorRDD
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.util.random.XORShiftRandom

object TestLinearProgram {
  def main(args: Array[String]) {
    val rnd = new Random(34324)
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLinearProgram")
    val sc = new SparkContext(sparkConf)

    val n = 5000    // Transpose constraint matrix row count.
    val m = n / 2   // Transpose constraint matrix column count.

    // Generate a starting 'x' vector, using normally generated values.
    val x = RandomRDDs.normalRDD(sc, n).map(_ + 10).glom.map(new DenseVector(_))

    // Generate the transpose constraint matrix 'A' using sparse normally generated values.
    val A = new RandomVectorRDD(sc, n, m, sc.defaultMinPartitions,
      new SparseStandardNormalGenerator(0.01), rnd.nextLong)

    // Generate the cost vector 'c' using normally generated values.
    val c = RandomRDDs.normalRDD(sc, n, 0, rnd.nextLong).glom.map(new DenseVector(_))

    // Compute 'b' using the starting 'x' vector.
    val b = new LinopMatrixAdjoint(A)(x)

    val mu = 1e-2

    // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'.
    val (optimalX, _) = SolverSLP.run(c, A, b, mu)
    println("optimalX: " + optimalX.collectElements.mkString(", "))

    sc.stop()
  }
}
Example 5
Source File: randomExpressions.scala From XSQL with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, CodeGenerator, ExprCode, FalseLiteral}
import org.apache.spark.sql.catalyst.expressions.codegen.Block._
import org.apache.spark.sql.types._
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

// scalastyle:off line.size.limit
@ExpressionDescription(
  usage = """_FUNC_([seed]) - Returns a random value with independent and identically distributed (i.i.d.) values drawn from the standard normal distribution.""",
  examples = """
    Examples:
      > SELECT _FUNC_();
       -0.3254147983080288
      > SELECT _FUNC_(0);
       1.1164209726833079
      > SELECT _FUNC_(null);
       1.1164209726833079
  """,
  note = "The function is non-deterministic in general case.")
// scalastyle:on line.size.limit
case class Randn(child: Expression) extends RDG with ExpressionWithRandomSeed {

  def this() = this(Literal(Utils.random.nextLong(), LongType))

  override def withNewSeed(seed: Long): Randn = Randn(Literal(seed, LongType))

  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = {
    val className = classOf[XORShiftRandom].getName
    val rngTerm = ctx.addMutableState(className, "rng")
    ctx.addPartitionInitializationStatement(
      s"$rngTerm = new $className(${seed}L + partitionIndex);")
    ev.copy(code = code"""
      final ${CodeGenerator.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();""",
      isNull = FalseLiteral)
  }

  override def freshCopy(): Randn = Randn(child)
}

object Randn {
  def apply(seed: Long): Randn = Randn(Literal(seed, LongType))
}
Example 6
Source File: Vector.scala From SparkCore with Apache License 2.0
package org.apache.spark.util

import scala.language.implicitConversions
import scala.util.Random

import org.apache.spark.util.random.XORShiftRandom

@deprecated("Use Vectors.dense from Spark's mllib.linalg package instead.", "1.0.0")
class Vector(val elements: Array[Double]) extends Serializable {
  def length: Int = elements.length

  def apply(index: Int): Double = elements(index)

  def + (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) + other(i))
  }

  def add(other: Vector): Vector = this + other

  def - (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) - other(i))
  }

  def subtract(other: Vector): Vector = this - other

  def dot(other: Vector): Double = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    var ans = 0.0
    var i = 0
    while (i < length) {
      ans += this(i) * other(i)
      i += 1
    }
    ans
  }

  def random(length: Int, random: Random = new XORShiftRandom()): Vector =
    Vector(length, _ => random.nextDouble())

  class Multiplier(num: Double) {
    def * (vec: Vector): Vector = vec * num
  }

  implicit def doubleToMultiplier(num: Double): Multiplier = new Multiplier(num)

  implicit object VectorAccumParam extends org.apache.spark.AccumulatorParam[Vector] {
    def addInPlace(t1: Vector, t2: Vector): Vector = t1 + t2
    def zero(initialValue: Vector): Vector = Vector.zeros(initialValue.length)
  }
}
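A small usage sketch for the class above (hypothetical, not from SparkCore). Note that in the full Spark source the helpers at the end of the excerpt (random, the Multiplier implicits, the accumulator param) live on a companion object Vector, together with the apply and zeros factories the code refers to; this excerpt truncates that object. Assuming the full file, and a made-up package under org.apache.spark so that the package-private XORShiftRandom is accessible:

package org.apache.spark.examples.sketch  // hypothetical package, chosen to sit under org.apache.spark

import org.apache.spark.util.Vector
import org.apache.spark.util.random.XORShiftRandom

object VectorDemo {
  def main(args: Array[String]): Unit = {
    val rng = new XORShiftRandom(42L)
    val a = Vector.random(5, rng)     // five uniform draws from the XORShiftRandom
    val b = Vector.random(5, rng)
    println("a dot b = " + a.dot(b))  // inner product, as defined in the excerpt
    println("a + b   = " + (a + b))   // element-wise sum
  }
}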
Example 7
Source File: Vector.scala From iolap with Apache License 2.0
package org.apache.spark.util

import scala.language.implicitConversions
import scala.util.Random

import org.apache.spark.util.random.XORShiftRandom

@deprecated("Use Vectors.dense from Spark's mllib.linalg package instead.", "1.0.0")
class Vector(val elements: Array[Double]) extends Serializable {
  def length: Int = elements.length

  def apply(index: Int): Double = elements(index)

  def + (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) + other(i))
  }

  def add(other: Vector): Vector = this + other

  def - (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) - other(i))
  }

  def subtract(other: Vector): Vector = this - other

  def dot(other: Vector): Double = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    var ans = 0.0
    var i = 0
    while (i < length) {
      ans += this(i) * other(i)
      i += 1
    }
    ans
  }

  def random(length: Int, random: Random = new XORShiftRandom()): Vector =
    Vector(length, _ => random.nextDouble())

  class Multiplier(num: Double) {
    def * (vec: Vector): Vector = vec * num
  }

  implicit def doubleToMultiplier(num: Double): Multiplier = new Multiplier(num)

  implicit object VectorAccumParam extends org.apache.spark.AccumulatorParam[Vector] {
    def addInPlace(t1: Vector, t2: Vector): Vector = t1 + t2
    def zero(initialValue: Vector): Vector = Vector.zeros(initialValue.length)
  }
}
Example 8
Source File: randomExpressions.scala From spark1.52 with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
  })

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.primitive} = $rngTerm.nextGaussian();
    """
  }
}
Example 9
Source File: Vector.scala From spark1.52 with Apache License 2.0
package org.apache.spark.util

import scala.language.implicitConversions
import scala.util.Random

import org.apache.spark.util.random.XORShiftRandom

@deprecated("Use Vectors.dense from Spark's mllib.linalg package instead.", "1.0.0")
class Vector(val elements: Array[Double]) extends Serializable {
  def length: Int = elements.length

  def apply(index: Int): Double = elements(index)

  def + (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) + other(i))
  }

  def add(other: Vector): Vector = this + other

  def - (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) - other(i))
  }

  def subtract(other: Vector): Vector = this - other

  def dot(other: Vector): Double = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    var ans = 0.0
    var i = 0
    while (i < length) {
      ans += this(i) * other(i)
      i += 1
    }
    ans
  }

  def random(length: Int, random: Random = new XORShiftRandom()): Vector =
    Vector(length, _ => random.nextDouble())

  class Multiplier(num: Double) {
    def * (vec: Vector): Vector = vec * num
  }

  implicit def doubleToMultiplier(num: Double): Multiplier = new Multiplier(num)

  implicit object VectorAccumParam extends org.apache.spark.AccumulatorParam[Vector] {
    def addInPlace(t1: Vector, t2: Vector): Vector = t1 + t2
    def zero(initialValue: Vector): Vector = Vector.zeros(initialValue.length)
  }
}
Example 10
Source File: randomExpressions.scala From BigDatalog with Apache License 2.0
package org.apache.spark.sql.catalyst.expressions

import org.apache.spark.TaskContext
import org.apache.spark.sql.AnalysisException
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.codegen.{CodeGenContext, GeneratedExpressionCode}
import org.apache.spark.sql.types.{DataType, DoubleType}
import org.apache.spark.util.Utils
import org.apache.spark.util.random.XORShiftRandom

case class Randn(seed: Long) extends RDG {
  override protected def evalInternal(input: InternalRow): Double = rng.nextGaussian()

  def this() = this(Utils.random.nextLong())

  def this(seed: Expression) = this(seed match {
    case IntegerLiteral(s) => s
    case _ => throw new AnalysisException("Input argument to rand must be an integer literal.")
  })

  override def genCode(ctx: CodeGenContext, ev: GeneratedExpressionCode): String = {
    val rngTerm = ctx.freshName("rng")
    val className = classOf[XORShiftRandom].getName
    ctx.addMutableState(className, rngTerm,
      s"$rngTerm = new $className(${seed}L + org.apache.spark.TaskContext.getPartitionId());")
    ev.isNull = "false"
    s"""
      final ${ctx.javaType(dataType)} ${ev.value} = $rngTerm.nextGaussian();
    """
  }
}
Example 11
Source File: Vector.scala From BigDatalog with Apache License 2.0
package org.apache.spark.util

import scala.language.implicitConversions
import scala.util.Random

import org.apache.spark.util.random.XORShiftRandom

@deprecated("Use Vectors.dense from Spark's mllib.linalg package instead.", "1.0.0")
class Vector(val elements: Array[Double]) extends Serializable {
  def length: Int = elements.length

  def apply(index: Int): Double = elements(index)

  def + (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) + other(i))
  }

  def add(other: Vector): Vector = this + other

  def - (other: Vector): Vector = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    Vector(length, i => this(i) - other(i))
  }

  def subtract(other: Vector): Vector = this - other

  def dot(other: Vector): Double = {
    if (length != other.length) {
      throw new IllegalArgumentException("Vectors of different length")
    }
    var ans = 0.0
    var i = 0
    while (i < length) {
      ans += this(i) * other(i)
      i += 1
    }
    ans
  }

  def random(length: Int, random: Random = new XORShiftRandom()): Vector =
    Vector(length, _ => random.nextDouble())

  class Multiplier(num: Double) {
    def * (vec: Vector): Vector = vec * num
  }

  implicit def doubleToMultiplier(num: Double): Multiplier = new Multiplier(num)

  implicit object VectorAccumParam extends org.apache.spark.AccumulatorParam[Vector] {
    def addInPlace(t1: Vector, t2: Vector): Vector = t1 + t2
    def zero(initialValue: Vector): Vector = Vector.zeros(initialValue.length)
  }
}