org.apache.spark.mllib.random.RandomRDDs Scala Examples
The following examples show how to use org.apache.spark.mllib.random.RandomRDDs.
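Before diving into the examples, here is a brief orientation sketch (not taken from any of the projects below) showing the most commonly used RandomRDDs factory methods. Each method takes an optional partition count and seed so that the generated data is reproducible; an existing SparkContext named sc is assumed.

import org.apache.spark.mllib.random.RandomRDDs

// Assumes an existing SparkContext `sc`.
// 1 million doubles drawn from the standard normal distribution, in 10 partitions, seed 42.
val normals = RandomRDDs.normalRDD(sc, 1000000L, 10, 42L)

// 1 million doubles drawn uniformly from [0, 1].
val uniforms = RandomRDDs.uniformRDD(sc, 1000000L, 10, 42L)

// 1 million doubles drawn from a Poisson distribution with mean 5.0.
val poissons = RandomRDDs.poissonRDD(sc, 5.0, 1000000L, 10, 42L)

// 100,000 rows of length-3 vectors with standard normal entries.
val normalVectors = RandomRDDs.normalVectorRDD(sc, 100000L, 3, 10, 42L)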
Example 1
Source File: RandomRDDGeneration.scala From drizzle-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD

object RandomRDDGeneration {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(s"RandomRDDGeneration")
    val sc = new SparkContext(conf)

    val numExamples = 10000 // number of examples to generate
    val fraction = 0.1 // fraction of data to sample

    // Example: RandomRDDs.normalRDD
    val normalRDD: RDD[Double] = RandomRDDs.normalRDD(sc, numExamples)
    println(s"Generated RDD of ${normalRDD.count()}" +
      " examples sampled from the standard normal distribution")
    println("  First 5 samples:")
    normalRDD.take(5).foreach( x => println(s"    $x") )

    // Example: RandomRDDs.normalVectorRDD
    val normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
    println(s"Generated RDD of ${normalVectorRDD.count()} examples of length-2 vectors.")
    println("  First 5 samples:")
    normalVectorRDD.take(5).foreach( x => println(s"    $x") )

    println()

    sc.stop()
  }
}
// scalastyle:on println
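A common follow-on step, not shown in the example above, is to shift and scale the standard normal samples to obtain a different mean and standard deviation. A minimal sketch, reusing the sc and numExamples values from the example:

// Transform N(0, 1) samples into samples from N(1, 4), i.e. mean 1.0 and standard deviation 2.0.
val shiftedRDD: RDD[Double] = RandomRDDs.normalRDD(sc, numExamples).map(x => 1.0 + 2.0 * x)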
Example 2
Source File: TestLPSolver.scala From spark-lp with Apache License 2.0
object TestLPSolver {
  def main(args: Array[String]) {

    val rnd = new Random(12345)
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLPSolver")
    val sc = new SparkContext(sparkConf)

    val n = 1000 // Transpose constraint matrix row count.
    val m = 100  // Transpose constraint matrix column count.
    val numPartitions = 2

    // Generate the starting vector from uniform distribution U(3.0, 5.0)
    println("generate x")
    val x0 = RandomRDDs.uniformRDD(sc, n, numPartitions).map(v => 3.0 + 2.0 * v).glom.map(new DenseVector(_))

    // Generate the transpose constraint matrix 'B' using sparse uniformly generated values.
    println("generate B")
    val B = new RandomVectorRDD(sc, n, m, numPartitions,
      new SparseStandardNormalGenerator(0.1), rnd.nextLong)

    // Generate the cost vector 'c' using uniformly generated values.
    println("generate c")
    val c = RandomRDDs.uniformRDD(sc, n, numPartitions, rnd.nextLong).glom.map(new DenseVector(_))

    // Compute 'b' using the starting 'x' vector.
    println("generate b")
    val b = (new LinopMatrixAdjoint(B))(x0)

    // Solve the linear program using LP.solve, finding the optimal x vector 'optimalX'.
    println("Start solving ...")
    val (optimalVal, _) = LP.solve(c, B, b, sc=sc)
    println("optimalVal: " + optimalVal)
    //println("optimalX: " + optimalX.collectElements.mkString(", "))

    sc.stop()
  }
}
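Examples 2 through 4 repeatedly use the glom.map(new DenseVector(_)) idiom to turn a scalar RDD produced by RandomRDDs into an RDD holding one dense vector per partition, which is the layout the spark-lp and spark-tfocs solvers operate on. A minimal standalone sketch of that idiom, assuming only Spark and MLlib:

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD

// Generate 1000 uniform doubles in 4 partitions, then collapse each partition into one DenseVector.
def partitionedVectors(sc: SparkContext): RDD[DenseVector] =
  RandomRDDs.uniformRDD(sc, 1000L, 4)
    .glom()                  // RDD[Double] -> RDD[Array[Double]], one array per partition
    .map(new DenseVector(_)) // wrap each partition's array as a dense vector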
Example 3
Source File: TestLASSO.scala From spark-tfocs with Apache License 2.0
package org.apache.spark.mllib.optimization.tfocs.examples

import scala.util.Random

import org.apache.spark.mllib.linalg.{ BLAS, DenseVector, Vectors }
import org.apache.spark.mllib.optimization.tfocs.SolverL1RLS
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.{ SparkConf, SparkContext }

object TestLASSO {
  def main(args: Array[String]) {

    val rnd = new Random(34324)
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLASSO")
    val sc = new SparkContext(sparkConf)

    val n = 1024 // Design matrix column count.
    val m = n / 2 // Design matrix row count.
    val k = m / 5 // Count of nonzero weights.

    // Generate the design matrix using random normal values, then normalize the columns.
    val unnormalizedA = RandomRDDs.normalVectorRDD(sc, m, n, 0, rnd.nextLong)
    val AColumnNormSq = unnormalizedA.treeAggregate(Vectors.zeros(n).toDense)(
      seqOp = (sum, rowA) => {
        val rowASq = Vectors.dense(rowA.toArray.map(rowA_i => rowA_i * rowA_i))
        BLAS.axpy(1.0, rowASq, sum)
        sum
      },
      combOp = (sum1, sum2) => {
        BLAS.axpy(1.0, sum2, sum1)
        sum1
      })
    val A = unnormalizedA.map(rowA =>
      Vectors.dense(rowA.toArray.zip(AColumnNormSq.toArray).map {
        case (rowA_i, normsq_i) => rowA_i / math.sqrt(normsq_i)
      }))

    // Generate the actual 'x' vector, including 'k' nonzero values.
    val x = Vectors.zeros(n).toDense
    for (i <- rnd.shuffle(0 to n - 1).take(k)) {
      x.values(i) = rnd.nextGaussian
    }

    // Generate the 'b' vector using the design matrix and weights, adding gaussian noise.
    val bOriginal = new DenseVector(A.map(rowA => BLAS.dot(rowA, x)).collect)
    val snr = 30 // SNR in dB
    val sigma =
      math.pow(10, ((10 * math.log10(math.pow(Vectors.norm(bOriginal, 2), 2) / n) - snr) / 20))
    val b = sc.parallelize(bOriginal.values.map(_ + sigma * rnd.nextGaussian))
      .glom
      .map(new DenseVector(_))

    // Set 'lambda' using the noise standard deviation.
    val lambda = 2 * sigma * math.sqrt(2 * math.log(n))

    // Solve the lasso problem using SolverL1RLS, finding the estimated x vector 'estimatedX'.
    val (estimatedX, _) = SolverL1RLS.run(A, b, lambda)
    println("estimatedX: " + estimatedX.values.mkString(", "))

    sc.stop()
  }
}
Example 4
Source File: TestLinearProgram.scala From spark-tfocs with Apache License 2.0
package org.apache.spark.mllib.optimization.tfocs.examples

import scala.util.Random

import org.apache.spark.mllib.linalg.DenseVector
import org.apache.spark.mllib.optimization.tfocs.DVectorFunctions._
import org.apache.spark.mllib.optimization.tfocs.SolverSLP
import org.apache.spark.mllib.optimization.tfocs.fs.dvector.vector.LinopMatrixAdjoint
import org.apache.spark.mllib.random.{ RandomDataGenerator, RandomRDDs }
import org.apache.spark.mllib.rdd.RandomVectorRDD
import org.apache.spark.{ SparkConf, SparkContext }
import org.apache.spark.util.random.XORShiftRandom

object TestLinearProgram {
  def main(args: Array[String]) {

    val rnd = new Random(34324)
    val sparkConf = new SparkConf().setMaster("local[2]").setAppName("TestLinearProgram")
    val sc = new SparkContext(sparkConf)

    val n = 5000 // Transpose constraint matrix row count.
    val m = n / 2 // Transpose constraint matrix column count.

    // Generate a starting 'x' vector, using normally generated values.
    val x = RandomRDDs.normalRDD(sc, n).map(_ + 10).glom.map(new DenseVector(_))

    // Generate the transpose constraint matrix 'A' using sparse normally generated values.
    val A = new RandomVectorRDD(sc,
      n,
      m,
      sc.defaultMinPartitions,
      new SparseStandardNormalGenerator(0.01),
      rnd.nextLong)

    // Generate the cost vector 'c' using normally generated values.
    val c = RandomRDDs.normalRDD(sc, n, 0, rnd.nextLong).glom.map(new DenseVector(_))

    // Compute 'b' using the starting 'x' vector.
    val b = new LinopMatrixAdjoint(A)(x)

    val mu = 1e-2

    // Solve the linear program using SolverSLP, finding the optimal x vector 'optimalX'.
    val (optimalX, _) = SolverSLP.run(c, A, b, mu)
    println("optimalX: " + optimalX.collectElements.mkString(", "))

    sc.stop()
  }
}
Example 5
Source File: WeightedLabeledPoint.scala From flint with Apache License 2.0
package com.twosigma.flint.math.stats.regression

import breeze.linalg.DenseVector
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext

case class WeightedLabeledPoint(label: Double, weight: Double, features: DenseVector[Double]) {

  def generateSampleData(
    sc: SparkContext,
    weights: DenseVector[Double],
    intercept: Double,
    numRows: Long = 100L,
    numPartitions: Int = 4,
    errorScalar: Double = 1.0,
    seed: Long = 1L
  ): RDD[WeightedLabeledPoint] = {
    val len = weights.length + 2
    // The last entry will serve as the weight of the point and the second last entry will serve
    // as the noise added to the label.
    val data = RandomRDDs.normalVectorRDD(sc, numRows, len, numPartitions, seed)
    data.map { d =>
      val fw = d.toArray
      val x = new DenseVector(fw.dropRight(2))
      WeightedLabeledPoint(
        weights.dot(x) + intercept + errorScalar * fw(len - 2),
        Math.abs(fw(len - 1)) + 0.5,
        x
      )
    }
  }
}
Example 6
Source File: VLBFGS1.scala From spark-vl-bfgs with Apache License 2.0
package org.apache.spark.ml.optim

import java.util.Random

import scala.language.implicitConversions

import org.apache.hadoop.fs.{FileSystem, Path}

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.ml.optim.VectorFreeLBFGS.{Oracle, VectorSpace}
import org.apache.spark.ml.optim.VectorRDDFunctions._
import org.apache.spark.mllib.linalg.{BLAS, Vector, Vectors}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.{RDD, UnionRDD}
import org.apache.spark.storage.StorageLevel

object VLBFGS1 {

  // Other members of this object, such as the solve method called from main, are not shown in this excerpt.

  private def gradient(data: RDD[Array[LabeledPoint]], dx: RDD[Vector]): RDD[Vector] = {
    data.cartesian(dx).map { case (points, x) =>
      val g = Vectors.zeros(x.size)
      points.foreach { case LabeledPoint(b, a) =>
        val err = BLAS.dot(a, x) - b
        BLAS.axpy(err, a, g)
      }
      g
    }.treeSum()
  }

  def main(args: Array[String]): Unit = {
    val conf = new SparkConf().setAppName("VLBFGS").setMaster("local[*]")
    val sc = new SparkContext(conf)
    sc.setCheckpointDir("/tmp/checkpoint")
    val n = 1000
    val p = 100
    val random = new Random(0L)
    val xExact = Vectors.dense(Array.fill(p)(random.nextDouble()))
    val data = RandomRDDs.normalVectorRDD(sc, n, p, 4, 11L).mapPartitionsWithIndex { (idx, part) =>
      val random = new Random(100 + idx)
      part.map { v =>
        val target = BLAS.dot(v, xExact) + 0.1 * random.nextGaussian()
        LabeledPoint(target, v)
      }
    }.glom()
      .cache()

    val x = solve(data).first()

    println(s"x_exact = $xExact")
    println(s"x_vlbfgs = $x")

    sc.stop()
  }
}
Example 7
Source File: RandomRDDGeneration.scala From sparkoscope with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD

object RandomRDDGeneration {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(s"RandomRDDGeneration")
    val sc = new SparkContext(conf)

    val numExamples = 10000 // number of examples to generate
    val fraction = 0.1 // fraction of data to sample

    // Example: RandomRDDs.normalRDD
    val normalRDD: RDD[Double] = RandomRDDs.normalRDD(sc, numExamples)
    println(s"Generated RDD of ${normalRDD.count()}" +
      " examples sampled from the standard normal distribution")
    println("  First 5 samples:")
    normalRDD.take(5).foreach( x => println(s"    $x") )

    // Example: RandomRDDs.normalVectorRDD
    val normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
    println(s"Generated RDD of ${normalVectorRDD.count()} examples of length-2 vectors.")
    println("  First 5 samples:")
    normalVectorRDD.take(5).foreach( x => println(s"    $x") )

    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: RandomRDDGeneration.scala From multi-tenancy-spark with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD

object RandomRDDGeneration {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(s"RandomRDDGeneration")
    val sc = new SparkContext(conf)

    val numExamples = 10000 // number of examples to generate
    val fraction = 0.1 // fraction of data to sample

    // Example: RandomRDDs.normalRDD
    val normalRDD: RDD[Double] = RandomRDDs.normalRDD(sc, numExamples)
    println(s"Generated RDD of ${normalRDD.count()}" +
      " examples sampled from the standard normal distribution")
    println("  First 5 samples:")
    normalRDD.take(5).foreach( x => println(s"    $x") )

    // Example: RandomRDDs.normalVectorRDD
    val normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
    println(s"Generated RDD of ${normalVectorRDD.count()} examples of length-2 vectors.")
    println("  First 5 samples:")
    normalVectorRDD.take(5).foreach( x => println(s"    $x") )

    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 9
Source File: DataSplitterTest.scala From TransmogrifAI with BSD 3-Clause "New" or "Revised" License
package com.salesforce.op.stages.impl.tuning

import com.salesforce.op.test.TestSparkContext
import org.apache.spark.ml.linalg.Vectors
import org.apache.spark.mllib.random.RandomRDDs
import org.junit.runner.RunWith
import org.scalatest.FlatSpec
import org.scalatest.junit.JUnitRunner

@RunWith(classOf[JUnitRunner])
class DataSplitterTest extends FlatSpec with TestSparkContext with SplitterSummaryAsserts {
  import spark.implicits._

  val seed = 1234L
  val dataCount = 1000
  val trainingLimitDefault = 1E6.toLong

  val data =
    RandomRDDs.normalVectorRDD(sc, 1000, 3, seed = seed)
      .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDF()

  val dataSplitter = DataSplitter(seed = seed)

  Spec[DataSplitter] should "split the data in the appropriate proportion - 0.0" in {
    val (train, test) = dataSplitter.setReserveTestFraction(0.0).split(data)
    test.count() shouldBe 0
    train.count() shouldBe dataCount
  }

  it should "down-sample when the data count is above the default training limit" in {
    val numRows = trainingLimitDefault * 2
    val data =
      RandomRDDs.normalVectorRDD(sc, numRows, 3, seed = seed)
        .map(v => (1.0, Vectors.dense(v.toArray), "A")).toDF()
    dataSplitter.preValidationPrepare(data)

    val dataBalanced = dataSplitter.validationPrepare(data)
    // validationPrepare calls the data sample method that samples the data to a target ratio but there is an epsilon
    // to how precise this function is which is why we need to check around that epsilon
    val samplingErrorEpsilon = (0.1 * trainingLimitDefault).toLong

    dataBalanced.count() shouldBe trainingLimitDefault +- samplingErrorEpsilon
  }

  it should "set and get all data splitter params" in {
    val maxRows = dataCount / 2
    val downSampleFraction = maxRows / dataCount.toDouble

    val dataSplitter = DataSplitter()
      .setReserveTestFraction(0.0)
      .setSeed(seed)
      .setMaxTrainingSample(maxRows)
      .setDownSampleFraction(downSampleFraction)

    dataSplitter.getReserveTestFraction shouldBe 0.0
    dataSplitter.getDownSampleFraction shouldBe downSampleFraction
    dataSplitter.getSeed shouldBe seed
    dataSplitter.getMaxTrainingSample shouldBe maxRows
  }

  it should "split the data in the appropriate proportion - 0.2" in {
    val (train, test) = dataSplitter.setReserveTestFraction(0.2).split(data)
    math.abs(test.count() - 200) < 30 shouldBe true
    math.abs(train.count() - 800) < 30 shouldBe true
  }

  it should "split the data in the appropriate proportion - 0.6" in {
    val (train, test) = dataSplitter.setReserveTestFraction(0.6).split(data)
    math.abs(test.count() - 600) < 30 shouldBe true
    math.abs(train.count() - 400) < 30 shouldBe true
  }

  it should "keep the data unchanged when prepare is called" in {
    val dataCount = data.count()
    val summary = dataSplitter.preValidationPrepare(data)
    val train = dataSplitter.validationPrepare(data)
    val sampleF = trainingLimitDefault / dataCount.toDouble
    val downSampleFraction = math.min(sampleF, 1.0)
    train.collect().zip(data.collect()).foreach { case (a, b) => a shouldBe b }
    assertDataSplitterSummary(summary.summaryOpt) { s =>
      s shouldBe DataSplitterSummary(dataCount, downSampleFraction)
    }
  }
}
Example 10
Source File: TestData.scala From sparkxgboost with Apache License 2.0
package rotationsymmetry.sxgboost

import org.apache.spark.SparkContext
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

trait TestData {

  val simpleData = Seq(
    LabeledPoint(0.1, Vectors.dense(0, 0)),
    LabeledPoint(0.2, Vectors.dense(0, 1)),
    LabeledPoint(0.3, Vectors.dense(0, 2)),
    LabeledPoint(0.4, Vectors.dense(1, 0)),
    LabeledPoint(0.5, Vectors.dense(1, 1)),
    LabeledPoint(0.6, Vectors.dense(1, 2))
  )

  val simpleBinnedData = Seq(
    TreePoint(0.1, Array(0, 0)),
    TreePoint(0.2, Array(0, 1)),
    TreePoint(0.3, Array(0, 2)),
    TreePoint(0.4, Array(1, 0)),
    TreePoint(0.5, Array(1, 1)),
    TreePoint(0.6, Array(1, 2))
  )

  val simpleMetaData = new MetaData(2, Array(3, 4))

  def randomLabelPointRDD(
      sc: SparkContext,
      numRows: Long,
      numCols: Int,
      numPartitions: Int,
      seed: Long): RDD[LabeledPoint] = {
    val featuresBundle = RandomRDDs.normalVectorRDD(sc, numRows, numCols, numPartitions, seed)
    val labels = RandomRDDs.normalRDD(sc, numRows, numPartitions, seed + 999)
    (labels zip featuresBundle).map { case (label, features) => LabeledPoint(label, features) }
  }
}
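A hypothetical usage of the randomLabelPointRDD helper above (the object name and SparkContext setup are illustrative and not part of the sparkxgboost project; mixing in TestData also requires the project's TreePoint and MetaData classes on the classpath):

import org.apache.spark.{SparkConf, SparkContext}

object TestDataDemo extends TestData {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[2]").setAppName("TestDataDemo"))
    // 500 labeled points with 5 standard-normal features each, split across 2 partitions.
    val rdd = randomLabelPointRDD(sc, numRows = 500L, numCols = 5, numPartitions = 2, seed = 7L)
    rdd.take(3).foreach(println)
    sc.stop()
  }
}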
Example 11
Source File: RandomRDDGeneration.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples.mllib

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD

object RandomRDDGeneration {

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName(s"RandomRDDGeneration")
    val sc = new SparkContext(conf)

    val numExamples = 10000 // number of examples to generate
    val fraction = 0.1 // fraction of data to sample

    // Example: RandomRDDs.normalRDD
    val normalRDD: RDD[Double] = RandomRDDs.normalRDD(sc, numExamples)
    println(s"Generated RDD of ${normalRDD.count()}" +
      " examples sampled from the standard normal distribution")
    println("  First 5 samples:")
    normalRDD.take(5).foreach( x => println(s"    $x") )

    // Example: RandomRDDs.normalVectorRDD
    val normalVectorRDD = RandomRDDs.normalVectorRDD(sc, numRows = numExamples, numCols = 2)
    println(s"Generated RDD of ${normalVectorRDD.count()} examples of length-2 vectors.")
    println("  First 5 samples:")
    normalVectorRDD.take(5).foreach( x => println(s"    $x") )

    println()

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: SVDDataGenerator.scala From Swallow with Apache License 2.0
package com.intel.hibench.sparkbench.ml

import com.intel.hibench.sparkbench.common.IOCommon

import org.apache.spark.{SparkConf, SparkContext}
import org.apache.spark.mllib.linalg.{Vectors, Vector}
import org.apache.spark.rdd.RDD
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.mllib.random.RandomRDDs

object SVDDataGenerator {

  def generateDistributedRowMatrix(
      sc: SparkContext,
      m: Long,
      n: Int,
      numPartitions: Int,
      seed: Long = System.currentTimeMillis()): RDD[Vector] = {
    val data: RDD[Vector] = RandomRDDs.normalVectorRDD(sc, m, n, numPartitions, seed)
    data
  }

  def main(args: Array[String]) {
    val conf = new SparkConf().setAppName("SVDDataGenerator")
    val sc = new SparkContext(conf)

    var outputPath = ""
    var numExamples: Int = 200000
    var numFeatures: Int = 20
    val parallel = sc.getConf.getInt("spark.default.parallelism", sc.defaultParallelism)
    val numPartitions = IOCommon.getProperty("hibench.default.shuffle.parallelism")
      .getOrElse((parallel / 2).toString).toInt

    if (args.length == 3) {
      outputPath = args(0)
      numExamples = args(1).toInt
      numFeatures = args(2).toInt
      println(s"Output Path: $outputPath")
      println(s"Num of Examples: $numExamples")
      println(s"Num of Features: $numFeatures")
    } else {
      System.err.println(
        s"Usage: $SVDDataGenerator <OUTPUT_PATH> <NUM_EXAMPLES> <NUM_FEATURES>"
      )
      System.exit(1)
    }

    val data = generateDistributedRowMatrix(sc, numExamples, numFeatures, numPartitions)

    data.saveAsObjectFile(outputPath)

    sc.stop()
  }
}
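The generator above only writes the random vectors out with saveAsObjectFile. As a rough sketch of how such data might be consumed afterwards (this is not part of the original benchmark code, and the path is illustrative), the RDD[Vector] can be wrapped in a RowMatrix and decomposed with MLlib's computeSVD:

import org.apache.spark.mllib.linalg.Vector
import org.apache.spark.mllib.linalg.distributed.RowMatrix
import org.apache.spark.rdd.RDD

// Assumes an existing SparkContext `sc` and the output path written by the generator above.
val rows: RDD[Vector] = sc.objectFile[Vector]("hdfs:///tmp/svd-input") // illustrative path
val mat = new RowMatrix(rows)
// Compute the top 10 singular values and the corresponding singular vectors.
val svd = mat.computeSVD(10, computeU = true)
println("Singular values: " + svd.s)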