breeze.linalg.Vector Scala Examples
The following examples show how to use breeze.linalg.Vector.
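Before the examples, here is a minimal, self-contained sketch (not taken from any of the projects listed below; the object name VectorBasics is purely illustrative) of the basic breeze.linalg.Vector operations these examples rely on: constructing a DenseVector, element-wise arithmetic, dot products, and squaredDistance.

import breeze.linalg.{DenseVector, Vector, squaredDistance}

object VectorBasics {
  def main(args: Array[String]): Unit = {
    // Construct dense vectors; DenseVector is the concrete Vector type used throughout the examples.
    val v: Vector[Double] = DenseVector(1.0, 2.0, 3.0)
    val w: Vector[Double] = DenseVector(0.5, -1.0, 2.0)

    // Element-wise arithmetic and the dot product, as used in the gradient computations below.
    val sum = v + w          // DenseVector(1.5, 1.0, 5.0)
    val scaled = v * 2.0     // DenseVector(2.0, 4.0, 6.0)
    val dot = v.dot(w)       // 1.0*0.5 + 2.0*(-1.0) + 3.0*2.0 = 4.5

    // Squared Euclidean distance, as used in the k-means examples.
    val dist = squaredDistance(v, w)

    println(s"sum=$sum scaled=$scaled dot=$dot dist=$dist")
  }
}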
Example 1
Source File: SparkHdfsLR.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (BFGS is a rank-two quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {
    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR").setMaster("local[2]")
    val inputPath = "D:\\spark\\spark-1.5.0-hadoop2.6\\data\\mllib\\lr_data.txt" // args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache() // cache the parsed points
    val ITERATIONS = 6 // args(1).toInt, number of iterations

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        // p is a DataPoint; p.x is its feature Vector
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}
// scalastyle:on println
Example 2
Source File: SparkLR.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println(s"Final w: $w") spark.stop() } } // scalastyle:on println
Example 3
Source File: LocalFileLR.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val fileSrc = scala.io.Source.fromFile(args(0)) val lines = fileSrc.getLines().toArray val points = lines.map(parsePoint) val ITERATIONS = args(1).toInt // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } fileSrc.close() println(s"Final w: $w") } } // scalastyle:on println
Example 4
Source File: SparkKMeans.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{squaredDistance, DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkKMeans") .getOrCreate() val lines = spark.read.textFile(args(0)).rdd val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println(s"Finished iteration (delta = $tempDist)") } println("Final centers:") kPoints.foreach(println) spark.stop() } } // scalastyle:on println
Example 5
Source File: LocalLR.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println(s"Final w: $w") } } // scalastyle:on println
Example 6
Source File: SparkHdfsLR.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") val y = tok.nextToken.toDouble val x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd lines.cache() val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println(s"Final w: $w") spark.stop() } } // scalastyle:on println
Example 7
Source File: SparkLR.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10     // Number of dimensions
  val R = 0.7    // Scaling factor
  val ITERATIONS = 5
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (BFGS is a rank-two quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
    // String.stripMargin strips the leading whitespace of each line up to and including the first vertical bar |
  }

  def main(args: Array[String]) {
    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR").setMaster("local")
    val sc = new SparkContext(sparkConf)

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}
// scalastyle:on println
Example 8
Source File: LocalFileLR.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  // Parse a line of input data into a DataPoint
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (BFGS is a rank-two quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {
    showWarning()

    // Read the file with fromFile and convert it to an Array[String]
    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    // Parse each line with parsePoint
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println
Example 9
Source File: SparkKMeans.scala From spark1.52 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42).toArray var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } } // scalastyle:on println
Example 10
Source File: LocalKMeans.scala From Spark-2.3.1 with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers(i) val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData val points = new HashSet[Vector[Double]] val kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println(s"Initial centers: $kPoints") while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val mappings = closest.groupBy[Int] (x => x._1) val pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints(mapping._1), mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println(s"Final centers: $kPoints") } } // scalastyle:on println
Example 11
Source File: SparkTachyonHdfsLR.scala From spark1.52 with Apache License 2.0
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel

object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (BFGS is a rank-two quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {
    showWarning()

    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}
// scalastyle:on println
Example 12
Source File: Standard.scala From banditsbook-scala with MIT License
package com.github.everpeace.banditsbook.algorithm.epsilon_greedy

import breeze.linalg.argmax
import breeze.stats.distributions.{Bernoulli, Rand, RandBasis}
import breeze.storage.Zero
import com.github.everpeace.banditsbook.algorithm.Algorithm
import com.github.everpeace.banditsbook.arm.Arm

import scala.collection.immutable.Seq
import scala.reflect.ClassTag

object Standard {

  import breeze.linalg.Vector
  import Vector._

  case class State(ε: Double, counts: Vector[Int], expectations: Vector[Double])

  def Algorithm(ε: Double)(implicit zeroDouble: Zero[Double], zeroInt: Zero[Int], tag: ClassTag[Double], rand: RandBasis = Rand) =
    new Algorithm[Double, State] {

      override def initialState(arms: Seq[Arm[Double]]): State =
        State(ε, zeros[Int](arms.size), zeros[Double](arms.size))

      override def selectArm(arms: Seq[Arm[Double]], state: State): Int =
        Bernoulli.distribution(state.ε).draw() match {
          case true =>
            // Exploit
            argmax(state.expectations)
          case false =>
            // Explore
            Rand.randInt(state.expectations.size).draw()
        }

      override def updateState(arms: Seq[Arm[Double]], state: State, chosen: Int, reward: Double): State = {
        val counts = state.counts
        val expectations = state.expectations

        val count = counts(chosen) + 1
        counts.update(chosen, count)

        val expectation = (((count - 1) / count.toDouble) * expectations(chosen)) + ((1 / count.toDouble) * reward)
        expectations.update(chosen, expectation)

        state.copy(counts = counts, expectations = expectations)
      }
    }
}
Example 13
Source File: TracedAlgorithmDriver.scala From banditsbook-scala with MIT License
package com.github.everpeace.banditsbook.algorithm import breeze.linalg.Vector import breeze.linalg.Vector._ import breeze.storage.Zero import cats.data.{State => CState} import com.github.everpeace.banditsbook.arm._ import scala.reflect.ClassTag object TracedAlgorithmDriver { // Note: breeze.linalg.Vector is mutable. final case class Trace[Reward: Zero](chosenArms: Vector[Int], counts: Vector[Int], rewards: Vector[Reward]) final case class State[Reward, AlgorithmState](arms: Seq[Arm[Reward]], step: Int, horizon: Int, algState: AlgorithmState, trace: Trace[Reward]) } case class TracedAlgorithmDriver[Reward: Zero: ClassTag, AlgorithmState](algo: Algorithm[Reward, AlgorithmState])(implicit zeroInt: Zero[Int]) { import CState._ import TracedAlgorithmDriver._ private val incrementStep = modify[State[Reward, AlgorithmState]] { s => s.copy(step = s.step + 1) } private def setAlgState(s: AlgorithmState) = modify[State[Reward, AlgorithmState]] { _.copy(algState = s) } private def updateTrace(a: Arm[Reward], r: Reward) = modify[State[Reward, AlgorithmState]] { s => val step = s.step val chosen = s.arms.indexOf(a) val count = s.trace.counts(chosen) s.trace.chosenArms.update(step, chosen) s.trace.counts.update(chosen, count + 1) s.trace.rewards.update(step, r) s.copy() } // drive 'step' once. private val driveStep: CState[State[Reward, AlgorithmState], Unit] = for { state <- get[State[Reward, AlgorithmState]] chosenArm = algo.selectArm.runA((state.arms, state.algState)).value reward = chosenArm.draw() newState = algo.updateState(chosenArm, reward).runA((state.arms, state.algState)).value _ <- setAlgState(newState) _ <- updateTrace(chosenArm, reward) _ <- incrementStep } yield () // drive 'step' $n times private def driveSteps(n: Int): CState[State[Reward, AlgorithmState], Unit] = n match { case 0 => pure( () ) // nop case _ => for { _ <- driveSteps(n - 1) _ <- driveStep } yield () } final def runFrom(state: State[Reward, AlgorithmState], steps: Int): State[Reward, AlgorithmState] = { if ((state.horizon - state.step) <= steps) driveSteps(state.horizon - state.step).runS(state).value else driveSteps(steps).runS(state).value } }
Example 14
Source File: NearestNeighbors.scala From SparkSMOTE with MIT License
package utils import org.apache.spark.SparkContext import breeze.linalg._ import breeze.linalg.{DenseVector,Vector,SparseVector} import com.github.fommil.netlib.BLAS import scala.util.Random import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.collection.mutable.ArrayBuffer object NearestNeighbors { def runNearestNeighbors(data: RDD[Array[(LabeledPoint,Int,Int)]], kNN: Int, sampleData: Array[(LabeledPoint,Int,Int)]): Array[(String,Array[((Int,Int),Double)])] = { val globalNearestNeighborsByIndex = data.mapPartitionsWithIndex(localNearestNeighbors(_,_,kNN,sampleData)).groupByKey().map(x => (x._1,x._2.toArray.sortBy(r => r._2).take(kNN))).collect() globalNearestNeighborsByIndex } private def localNearestNeighbors(partitionIndex: Long, iter: Iterator[Array[(LabeledPoint,Int,Int)]], kNN: Int, sampleData: Array[(LabeledPoint,Int,Int)]): Iterator[(String,((Int,Int),Double))] = { var result = List[(String,((Int,Int),Double))]() val dataArr = iter.next val nLocal = dataArr.size - 1 val sampleDataSize = sampleData.size - 1 val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize+1)(null) for { i1 <- 0 to sampleDataSize } kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt, DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN)) for (i <- 0 to nLocal) { val currentPoint = dataArr(i) val features = currentPoint._1.features val rowId = currentPoint._3.toInt for (j <- 0 to sampleDataSize) { val samplePartitionId = sampleData(j)._2 val sampleRowId = sampleData(j)._3 val sampleFeatures = sampleData(j)._1.features if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) { val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features))) if (distance < max(kLocalNeighbors(j).distanceVector)) { val indexToReplace = argmax(kLocalNeighbors(j).distanceVector) kLocalNeighbors(j).distanceVector(indexToReplace) = distance kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId } } } } for (m <- 0 to sampleDataSize){ for (l <-0 to kNN-1) { val key = kLocalNeighbors(m).partitionId.toString+","+kLocalNeighbors(m).sampleRowId.toString val tup = (partitionIndex.toInt,kLocalNeighbors(m).neighborRowId(l)) result.::=(key,(tup,kLocalNeighbors(m).distanceVector(l))) } } result.iterator } }
Example 15
Source File: loadData.scala From SparkSMOTE with MIT License
package utils

import org.apache.spark.SparkContext
import breeze.linalg._
import breeze.linalg.{DenseVector, Vector, SparseVector}
import org.apache.spark.rdd.RDD
import org.apache.spark.broadcast.Broadcast

object loadData {

  def readDelimitedData(sc: SparkContext, path: String, numFeatures: Int, delimiter: String, numPartitions: Int): RDD[(LabeledPoint, Int, Int)] = {
    val data = sc.textFile(path)
      .filter { x => x.split(delimiter)(0).toDouble == 1.0 }
      .repartition(numPartitions)
      .mapPartitions { x => Iterator(x.toArray) }

    val formatData = data.mapPartitionsWithIndex { (partitionId, iter) =>
      var result = List[(LabeledPoint, Int, Int)]()
      val dataArray = iter.next
      val dataArraySize = dataArray.size - 1
      var rowCount = dataArraySize
      for (i <- 0 to dataArraySize) {
        val parts = dataArray(i).split(delimiter)
        result.::=((LabeledPoint(parts(0).toDouble, DenseVector(parts.slice(1, numFeatures + 1)).map(_.toDouble)), partitionId.toInt, rowCount))
        rowCount = rowCount - 1
      }
      result.iterator
    }

    formatData
  }
}
Example 16
Source File: SMOTE.scala From SparkSMOTE with MIT License
package SMOTE import org.apache.spark.SparkContext import breeze.linalg._ import breeze.linalg.{DenseVector,Vector,SparseVector} import com.github.fommil.netlib.BLAS import scala.util.Random import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.collection.mutable.ArrayBuffer import utils._ object SMOTE { def runSMOTE(sc: SparkContext, inPath: String, outPath: String, numFeatures: Int, oversamplingPctg: Double, kNN: Int, delimiter: String, numPartitions: Int): Unit = { val rand = new Random() val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions) val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache() val numObs = dataArray.map(x => x.size).reduce(_+_) println("Number of Filtered Observations "+numObs.toString) val roundPctg = oversamplingPctg val sampleData = dataArray.flatMap(x => x).sample(withReplacement = false, fraction = roundPctg, seed = 1L).collect().sortBy(r => (r._2,r._3)) //without Replacement println("Sample Data Count "+sampleData.size.toString) val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData) var randomNearestNeighbor = globalNearestNeighbors.map(x => (x._1.split(",")(0).toInt,x._1.split(",")(1).toInt,x._2(rand.nextInt(kNN)))).sortBy(r => (r._1,r._2)) var sampleDataNearestNeighbors = randomNearestNeighbor.zip(sampleData).map(x => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1)) val syntheticData = dataArray.mapPartitionsWithIndex(createSyntheticData(_,_,sampleDataNearestNeighbors,delimiter)).persist() println("Synthetic Data Count "+syntheticData.count.toString) val newData = syntheticData.union(sc.textFile(inPath)) println("New Line Count "+newData.count.toString) newData.saveAsTextFile(outPath) } private def createSyntheticData(partitionIndex: Long, iter: Iterator[Array[(LabeledPoint,Int,Int)]], sampleDataNN: Array[(Int,Int,Int,LabeledPoint)], delimiter: String): Iterator[String] = { var result = List[String]() val dataArr = iter.next val nLocal = dataArr.size - 1 val sampleDataNNSize = sampleDataNN.size - 1 val rand = new Random() for (j <- 0 to sampleDataNNSize){ val partitionId = sampleDataNN(j)._1 val neighborId = sampleDataNN(j)._3 val sampleFeatures = sampleDataNN(j)._4.features if (partitionId == partitionIndex.toInt){ val currentPoint = dataArr(neighborId) val features = currentPoint._1.features sampleFeatures += (sampleFeatures - features) * rand.nextDouble result.::=("1.0"+delimiter+sampleFeatures.toArray.mkString(delimiter)) } } result.iterator } }
Example 17
Source File: LocalKMeans.scala From iolap with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 18
Source File: LocalKMeans.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 19
Source File: PointObj.scala From Clustering4Ever with Apache License 2.0
package org.clustering4ever.spark.streamclustering

final case class Prototype(
  var protoPartNum: Vector[Double],
  var idsDataAssigned: Set[Int],
  val id: Int
) extends Serializable {

  override def toString: String =
    "node: " + id + " -> " + protoPartNum.toArray.deep.mkString(", ")

  def toStringIds: String = {
    "node: " + id + " (" + idsDataAssigned.size + " data-points)" + " -> " + idsDataAssigned.toArray.deep.mkString(", ")
  }

  def toStringProto: String = {
    protoPartNum.toArray.deep.mkString(", ")
  }

  def toStringCard: String = {
    idsDataAssigned.size.toString()
  }

  def toStringAss: String = {
    idsDataAssigned.toArray.deep.mkString(", ")
  }

  def toStringId: String = {
    id.toString()
  }
}
Example 20
Source File: LocalKMeans.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData = { def generatePoint(i: Int) = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 21
Source File: SparkLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData = { def generatePoint(i: Int) = { val y = if(i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 22
Source File: LocalFileLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 23
Source File: SparkKMeans.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42).toArray var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } }
Example 24
Source File: LocalLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData = { def generatePoint(i: Int) = { val y = if(i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 25
Source File: SparkHdfsLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo object SparkHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkHdfsLR") val inputPath = args(0) val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 26
Source File: SparkTachyonHdfsLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 27
Source File: SparkLR.scala From iolap with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 28
Source File: SparkLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 29
Source File: LocalFileLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 30
Source File: SparkKMeans.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42).toArray var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } } // scalastyle:on println
Example 31
Source File: LocalLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 32
Source File: SparkHdfsLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo object SparkHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkHdfsLR") val inputPath = args(0) val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 33
Source File: SparkTachyonHdfsLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 34
Source File: OneHotEncoder.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.preprocessing import breeze.linalg.{*, Axis, DenseMatrix, Vector, convert, max} import cats.syntax.option._ import io.picnicml.doddlemodel.data.Feature.FeatureIndex import io.picnicml.doddlemodel.data.Features import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Transformer case class OneHotEncoder private (private val numBinaryColumns: Option[Vector[Int]], private val featureIndex: FeatureIndex) object OneHotEncoder { def apply(featureIndex: FeatureIndex): OneHotEncoder = OneHotEncoder(none, featureIndex) @SerialVersionUID(0L) implicit lazy val ev: Transformer[OneHotEncoder] = new Transformer[OneHotEncoder] { @inline override def isFitted(model: OneHotEncoder): Boolean = model.numBinaryColumns.isDefined override def fit(model: OneHotEncoder, x: Features): OneHotEncoder = { val numBinaryColumns = convert(max(x(::, model.featureIndex.categorical.columnIndices).apply(::, *)).t, Int) + 1 model.copy(numBinaryColumns = numBinaryColumns.some) } override protected def transformSafe(model: OneHotEncoder, x: Features): Features = { val xTransformed = model.featureIndex.categorical.columnIndices.zipWithIndex.foldLeft(x) { case (xTransformedCurrent, (colIndex, statisticIndex)) => appendEncodedColumns(xTransformedCurrent, colIndex, model.numBinaryColumns.getOrBreak(statisticIndex)) } xTransformed.delete(model.featureIndex.categorical.columnIndices, Axis._1) } private def appendEncodedColumns(x: Features, columnIndex: Int, numEncodedColumns: Int): Features = { val encoded = DenseMatrix.zeros[Float](x.rows, numEncodedColumns) convert(x(::, columnIndex), Int).iterator.foreach { case (rowIndex, colIndex) => // if value is larger than the maximum value encountered during training it is ignored, // i.e. no value is set in the binary encoded matrix if (colIndex < numEncodedColumns) encoded(rowIndex, colIndex) = 1.0f } DenseMatrix.horzcat(x, encoded) } } }
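fit records, per categorical column, the number of binary columns as the maximum observed code plus one, and transformSafe expands each column into that many 0/1 columns. A hand-rolled sketch of the same expansion for a single column of codes (plain Breeze, not the doddle-model API; the values 0..2 are made up):

import breeze.linalg.{convert, max, DenseMatrix, DenseVector}

object OneHotSketch {
  def main(args: Array[String]): Unit = {
    val column = DenseVector(0.0f, 2.0f, 1.0f, 2.0f)   // categorical codes for one feature
    val numEncodedColumns = max(column).toInt + 1      // 3 binary columns
    val encoded = DenseMatrix.zeros[Float](column.length, numEncodedColumns)
    convert(column, Int).iterator.foreach { case (rowIndex, code) =>
      if (code < numEncodedColumns) encoded(rowIndex, code) = 1.0f   // unseen codes are ignored
    }
    println(encoded)   // one 1 per row, in the column given by the code
  }
}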
Example 35
Source File: LocalKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
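closestPoint above scans the center map and keeps the index of the smallest squared distance (note the map is keyed from 1, not 0). A quick self-contained check of that logic, iterating the map entries directly; the three centers are made up for the example:

import breeze.linalg.{squaredDistance, DenseVector, Vector}
import scala.collection.mutable.HashMap

object ClosestPointSketch {
  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity
    for ((i, center) <- centers) {
      val tempDist = squaredDistance(p, center)
      if (tempDist < closest) { closest = tempDist; bestIndex = i }
    }
    bestIndex
  }

  def main(args: Array[String]): Unit = {
    val centers = HashMap[Int, Vector[Double]](
      1 -> DenseVector(0.0, 0.0), 2 -> DenseVector(5.0, 5.0), 3 -> DenseVector(10.0, 0.0))
    println(closestPoint(DenseVector(4.0, 4.5), centers))   // prints 2
  }
}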
Example 36
Source File: LocalLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 37
Source File: SparkHdfsLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 38
Source File: Kinship.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.stat import org.apache.spark.rdd.RDD import org.dizhang.seqspark.ds._ import breeze.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.SparkContext import scala.collection.mutable.ArrayBuffer def removeNums(size: Int, nums: IndexedSeq[Int]): IndexedSeq[Int] = { var j: Int = 0 var i: Int = 0 val res = ArrayBuffer[Int]() while (i < size) { if (j >= nums.length) { res.+=(i) } else if (i == nums(j)) { j += 1 } else { res.+=(i) } i += 1 } res.toIndexedSeq } }
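removeNums returns the indices 0 until size with the (sorted, duplicate-free) indices in nums skipped, which is how excluded samples are dropped. An equivalent one-liner, shown only to illustrate the behaviour (the filterNot version is an assumption of this sketch, not the seqspark code):

object RemoveNumsSketch {
  def removeNums(size: Int, nums: IndexedSeq[Int]): IndexedSeq[Int] =
    (0 until size).filterNot(nums.toSet)

  def main(args: Array[String]): Unit = {
    println(removeNums(6, IndexedSeq(1, 4)))   // Vector(0, 2, 3, 5)
  }
}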
Example 39
Source File: LogisticRegressionModel.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg.Vector import org.apache.spark.mllib.classification.{LogisticRegressionModel => MLlibLRM} import org.apache.spark.mllib.linalg.{Vector => MLlibVector} import org.apache.spark.mllib.optimization.{SquaredL2Updater, LogisticGradient, LBFGS} import org.apache.spark.mllib.regression.{GeneralizedLinearAlgorithm, LabeledPoint} import org.apache.spark.mllib.util.DataValidators import org.apache.spark.rdd.RDD import keystoneml.utils.MLlibUtils.breezeVectorToMLlib import keystoneml.workflow.{LabelEstimator, Transformer} import scala.reflect.ClassTag private[this] class LogisticRegressionWithLBFGS(numClasses: Int, numFeaturesValue: Int) extends GeneralizedLinearAlgorithm[MLlibLRM] with Serializable { this.numFeatures = numFeaturesValue override val optimizer = new LBFGS(new LogisticGradient, new SquaredL2Updater) override protected val validators = List(multiLabelValidator) require(numClasses > 1) numOfLinearPredictor = numClasses - 1 if (numClasses > 2) { optimizer.setGradient(new LogisticGradient(numClasses)) } private def multiLabelValidator: RDD[LabeledPoint] => Boolean = { data => if (numOfLinearPredictor > 1) { DataValidators.multiLabelValidator(numOfLinearPredictor + 1)(data) } else { DataValidators.binaryLabelValidator(data) } } override protected def createModel(weights: MLlibVector, intercept: Double) = { if (numOfLinearPredictor == 1) { new MLlibLRM(weights, intercept) } else { new MLlibLRM(weights, intercept, numFeatures, numOfLinearPredictor + 1) } } } override def fit(in: RDD[T], labels: RDD[Int]): LogisticRegressionModel[T] = { val labeledPoints = labels.zip(in).map(x => LabeledPoint(x._1, breezeVectorToMLlib(x._2))) val trainer = new LogisticRegressionWithLBFGS(numClasses, numFeatures) trainer.setValidateData(false).optimizer.setNumIterations(numIters).setRegParam(regParam) val model = trainer.run(labeledPoints) new LogisticRegressionModel(model) } }
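fit converts each Breeze vector to an MLlib vector (via the keystoneml helper breezeVectorToMLlib) before handing the labeled points to LogisticRegressionWithLBFGS. A sketch of the usual manual conversion, assuming only spark-mllib on the classpath (the helper itself is not reproduced here):

import breeze.linalg.{DenseVector => BDV, SparseVector => BSV}
import org.apache.spark.mllib.linalg.{Vectors, Vector => MLlibVector}

object VectorConversionSketch {
  def toMLlib(v: breeze.linalg.Vector[Double]): MLlibVector = v match {
    case d: BDV[Double] => Vectors.dense(d.toArray)
    case s: BSV[Double] => Vectors.sparse(s.length, s.activeIterator.toSeq)
    case other          => Vectors.dense(other.toArray)   // fallback: materialise densely
  }

  def main(args: Array[String]): Unit = {
    println(toMLlib(BDV(1.0, 0.0, 3.0)))
  }
}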
Example 40
Source File: RowBlockIterator.scala From glint with MIT License | 5 votes |
package glint.iterators import akka.util.Timeout import breeze.linalg.Vector import glint.models.client.BigMatrix import scala.concurrent.{ExecutionContext, Future} class RowBlockIterator[V](val matrix: BigMatrix[V], val blockSize: Int)(implicit ec: ExecutionContext) extends PipelineIterator[Array[Vector[V]]] { if (matrix.cols == 0 || matrix.rows == 0) { total = 0 } else { val inc = if (matrix.rows % blockSize == 0) { 0 } else { 1 } total = inc + (matrix.rows / blockSize).toInt } override protected def fetchNextFuture(): Future[Array[Vector[V]]] = { val nextRows = (index.toLong * blockSize until Math.min(matrix.rows, (index + 1) * blockSize)).toArray matrix.pull(nextRows) } }
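The constructor above just computes a ceiling division: the number of row blocks is rows / blockSize, plus one if there is a remainder. The same arithmetic in isolation:

object BlockCountSketch {
  def numBlocks(rows: Long, blockSize: Int): Int =
    if (rows == 0) 0 else ((rows + blockSize - 1) / blockSize).toInt

  def main(args: Array[String]): Unit = {
    println(numBlocks(1000, 100))   // 10
    println(numBlocks(1001, 100))   // 11
  }
}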
Example 41
Source File: RowIterator.scala From glint with MIT License | 5 votes |
package glint.iterators import akka.util.Timeout import breeze.linalg.Vector import glint.models.client.BigMatrix import scala.concurrent.ExecutionContext class RowIterator[V](matrix: BigMatrix[V], blockSize: Int = 100)(implicit val ec: ExecutionContext) extends Iterator[Vector[V]] { // Row progress var index: Long = 0 val rows: Long = if (matrix.rows == 0 || matrix.cols == 0) { 0L } else { matrix.rows } // The underlying block iterator val blockIterator = new RowBlockIterator[V](matrix, blockSize) // The local block progress var localIndex: Int = 0 var localSize: Int = 0 var block = new Array[Vector[V]](0) override def hasNext: Boolean = index < rows override def next(): Vector[V] = { if (localIndex >= localSize) { block = blockIterator.next() localIndex = 0 localSize = block.length } localIndex += 1 index += 1 block(localIndex - 1) } }
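RowIterator walks the rows of the block returned by the underlying RowBlockIterator before asking for the next block, so callers see a flat stream of rows. A plain-Scala sketch of that flattening (the real class prefetches blocks asynchronously, which this one-liner does not attempt):

object FlattenBlocksSketch {
  def main(args: Array[String]): Unit = {
    val blocks = Iterator(Array(1, 2, 3), Array(4, 5), Array(6))
    println(blocks.flatten.mkString(","))   // 1,2,3,4,5,6
  }
}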
Example 42
Source File: GranularBigMatrix.scala From glint with MIT License | 5 votes |
package glint.models.client.granular import scala.collection.mutable.ArrayBuffer import scala.concurrent.{ExecutionContext, Future} import scala.reflect.ClassTag import breeze.linalg.Vector import glint.models.client.BigMatrix override def pull(rows: Array[Long], cols: Array[Int])(implicit ec: ExecutionContext): Future[Array[V]] = { if (rows.length <= maximumMessageSize) { underlying.pull(rows, cols) } else { var i = 0 val ab = new ArrayBuffer[Future[Array[V]]](rows.length / maximumMessageSize) while (i < rows.length) { val end = Math.min(rows.length, i + maximumMessageSize) val future = underlying.pull(rows.slice(i, end), cols.slice(i, end)) ab.append(future) i += maximumMessageSize } Future.sequence(ab.toIterator).map { case arrayOfValues => val finalValues = new ArrayBuffer[V](rows.length) arrayOfValues.foreach(x => finalValues.appendAll(x)) finalValues.toArray } } } }
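pull above splits an oversized request into slices of at most maximumMessageSize rows, issues one future per slice, and stitches the results back together in order with Future.sequence. A stand-alone sketch of that pattern, with a dummy pullChunk standing in for underlying.pull:

import scala.concurrent.{Await, Future}
import scala.concurrent.ExecutionContext.Implicits.global
import scala.concurrent.duration._

object ChunkedFuturesSketch {
  def pullChunk(rows: Array[Long]): Future[Array[Double]] =
    Future(rows.map(_.toDouble * 2))   // pretend remote fetch

  def main(args: Array[String]): Unit = {
    val rows = (0L until 25L).toArray
    val maximumMessageSize = 10
    val futures = rows.grouped(maximumMessageSize).map(pullChunk).toSeq
    val all = Future.sequence(futures).map(_.flatten.toArray)   // slice order is preserved
    println(Await.result(all, 5.seconds).length)   // 25
  }
}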
Example 43
Source File: SparkKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println package com.bigchange.basic import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- centers.indices) { // squared distance to this center val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans").setMaster("local") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector).cache() // initial value of K val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } } // scalastyle:on println
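Each iteration maps every point to (closest center, (point, 1)), reduces by key to a running (sum, count), and divides to get the new center. The same reduction on a local collection (the three assigned points are made up):

import breeze.linalg.{DenseVector, Vector}

object CentroidUpdateSketch {
  def main(args: Array[String]): Unit = {
    val assigned: Seq[(Int, Vector[Double])] = Seq(
      0 -> DenseVector(1.0, 1.0), 0 -> DenseVector(3.0, 5.0), 1 -> DenseVector(10.0, 0.0))
    val newCenters = assigned.groupBy(_._1).map { case (k, pts) =>
      val total = pts.map(_._2).reduce(_ + _)
      k -> (total * (1.0 / pts.size))   // mean of the points assigned to center k
    }
    println(newCenters)   // e.g. Map(0 -> DenseVector(2.0, 3.0), 1 -> DenseVector(10.0, 0.0))
  }
}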
Example 44
Source File: SparkKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{squaredDistance, DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkKMeans") .getOrCreate() val lines = spark.read.textFile(args(0)).rdd val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) spark.stop() } } // scalastyle:on println
Example 45
Source File: SparkLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 46
Source File: LocalFileLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
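parsePoint expects the label in the first whitespace-separated column and the next D values as features. A quick check on one hand-written line, with D reduced to 3 for readability:

import breeze.linalg.{DenseVector, Vector}

object ParsePointSketch {
  val D = 3
  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def main(args: Array[String]): Unit = {
    println(parsePoint("1 0.5 -2.0 3.25"))   // DataPoint(DenseVector(0.5, -2.0, 3.25),1.0)
  }
}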
Example 47
Source File: SparkKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{squaredDistance, DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkKMeans") .getOrCreate() val lines = spark.read.textFile(args(0)).rdd val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) spark.stop() } } // scalastyle:on println
Example 48
Source File: LocalLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 49
Source File: SparkHdfsLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 50
Source File: Sampler.scala From glintlda with MIT License | 5 votes |
package glintlda.naive import breeze.linalg.{DenseVector, Vector} import breeze.stats.distributions.Multinomial import glintlda.LDAConfig import glintlda.util.FastRNG def sampleFeature(feature: Int, oldTopic: Int): Int = { var i = 0 val p = DenseVector.zeros[Double](config.topics) var sum = 0.0 while (i < config.topics) { p(i) = (documentCounts(i) + α) * ((wordCounts(i) + β) / (globalCounts(i) + βSum)) sum += p(i) i += 1 } p /= sum Multinomial(p).draw() } }
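sampleFeature builds an unnormalised score per topic, normalises it, and draws one index from the resulting categorical distribution. A self-contained sketch of just the draw, with a made-up score vector; RandBasis.withSeed is used here for repeatability (on older Breeze versions the default implicit basis can be relied on instead):

import breeze.linalg.{sum, DenseVector}
import breeze.stats.distributions.{Multinomial, RandBasis}

object MultinomialSketch {
  def main(args: Array[String]): Unit = {
    implicit val basis: RandBasis = RandBasis.withSeed(42)
    val p = DenseVector(0.2, 3.0, 1.8, 0.5)   // unnormalised topic scores
    val topic = Multinomial(p / sum(p)).draw()
    println(topic)   // an index in 0..3, most likely 1 (the heaviest entry)
  }
}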
Example 51
Source File: AliasTable.scala From glintlda with MIT License | 5 votes |
package glintlda.mh import breeze.linalg.{Vector, sum} import glintlda.util.FastRNG def draw(random: FastRNG): Int = { count += 1 val i = random.nextPositiveInt() % alias.length if (random.nextDouble() < prob(i)) { i } else { alias(i) } } }
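draw picks a bucket uniformly and then either keeps it (with probability prob(i)) or falls back to its alias, which is what makes each sample O(1). A sketch with tables built by hand for the distribution (0.5, 0.3, 0.2); the prob and alias values were derived offline and are an assumption of this sketch:

import scala.util.Random

object AliasDrawSketch {
  val prob  = Array(1.0, 0.9, 0.6)   // acceptance probability per bucket
  val alias = Array(0, 0, 0)         // fallback bucket when the acceptance test fails

  def draw(random: Random): Int = {
    val i = random.nextInt(alias.length)
    if (random.nextDouble() < prob(i)) i else alias(i)
  }

  def main(args: Array[String]): Unit = {
    val random = new Random(42)
    val counts = Array.fill(3)(0)
    (1 to 100000).foreach(_ => counts(draw(random)) += 1)
    println(counts.mkString(","))   // roughly 50000,30000,20000
  }
}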
Example 52
Source File: SparkKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{squaredDistance, DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkKMeans") .getOrCreate() val lines = spark.read.textFile(args(0)).rdd val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) spark.stop() } } // scalastyle:on println
Example 53
Source File: LocalFileLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 54
Source File: SparkKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42).toArray var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } }
Example 55
Source File: LocalLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 56
Source File: SparkHdfsLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkHdfsLR") val inputPath = args(0) val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 57
Source File: SparkTachyonHdfsLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 58
Source File: LocalKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 59
Source File: SparkLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 60
Source File: LocalFileLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 61
Source File: Sampler.scala From glintlda with MIT License | 5 votes |
package glintlda.mh import breeze.linalg.Vector import glintlda.LDAConfig import glintlda.util.FastRNG def sampleFeature(feature: Int, oldTopic: Int): Int = { var s: Int = oldTopic var mh: Int = 0 // Each Metropolis-Hastings step alternates between a word proposal and doc proposal while(mh < mhSteps) { // Set t var t: Int = s // Word Proposal t = aliasTable.draw(random) if (t != s) { var docS = documentCounts(s) + α var docT = documentCounts(t) + α var wordS = wordCounts(s) + β var wordT = wordCounts(t) + β var globalS = globalCounts(s) + βSum var globalT = globalCounts(t) + βSum val proposalS = wordS / globalS val proposalT = wordT / globalT if (s == oldTopic) { docS -= 1 wordS -= infer globalS -= infer } if (t == oldTopic) { docT -= 1 wordT -= infer globalT -= infer } val pi = (docT * wordT * globalS * proposalS) / (docS * wordS * globalT * proposalT) if (random.nextDouble() < pi) { s = t } } // Document proposal val pickOrExplore = random.nextDouble() * (documentSize + αSum) if (pickOrExplore < documentSize) { t = documentTopicAssignments(pickOrExplore.toInt) } else { t = random.nextPositiveInt() % config.topics } if (t != s) { var docS = documentCounts(s) + α var docT = documentCounts(t) + α var wordS = wordCounts(s) + β var wordT = wordCounts(t) + β var globalS = globalCounts(s) + βSum var globalT = globalCounts(t) + βSum val proposalS = docS val proposalT = docT if (s == oldTopic) { docS -= 1 wordS -= infer globalS -= infer } if (t == oldTopic) { docT -= 1 wordT -= infer globalT -= infer } val pi = (docT * wordT * globalS * proposalS) / (docS * wordS * globalT * proposalT) if (random.nextDouble() < pi) { s = t } } mh += 1 } s } }
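Both proposals end with the same Metropolis-Hastings test: compute the ratio pi of the target and proposal densities for the candidate topic t versus the current topic s, then accept t with probability min(1, pi). A compact sketch of just that test, with made-up counts:

import scala.util.Random

object MHAcceptSketch {
  // Mirrors the word-proposal ratio above: target(t) * proposal(s) over target(s) * proposal(t)
  def acceptanceRatio(docS: Double, docT: Double, wordS: Double, wordT: Double,
                      globalS: Double, globalT: Double,
                      proposalS: Double, proposalT: Double): Double =
    (docT * wordT * globalS * proposalS) / (docS * wordS * globalT * proposalT)

  def main(args: Array[String]): Unit = {
    val random = new Random(42)
    val pi = acceptanceRatio(docS = 3.1, docT = 1.1, wordS = 2.5, wordT = 4.5,
      globalS = 210.0, globalT = 180.0, proposalS = 2.5 / 210.0, proposalT = 4.5 / 180.0)
    val accepted = random.nextDouble() < pi   // a ratio above 1 always accepts
    println("pi = " + pi + ", accepted = " + accepted)
  }
}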
Example 62
Source File: LocalLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 63
Source File: SparkHdfsLR.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 64
Source File: LinearRegressionModel.scala From hail with MIT License | 5 votes |
package is.hail.stats import breeze.linalg.{Matrix, Vector} import is.hail.annotations.Annotation import is.hail.types.virtual.{TFloat64, TStruct} import net.sourceforge.jdistlib.T object LinearRegressionModel { def schema = TStruct( ("beta", TFloat64), ("se", TFloat64), ("t_stat", TFloat64), ("p_value", TFloat64)) def fit(x: Vector[Double], y: Vector[Double], yyp: Double, qt: Matrix[Double], qty: Vector[Double], d: Int): Annotation = { val qtx = qt * x val xxp = (x dot x) - (qtx dot qtx) val xyp = (x dot y) - (qtx dot qty) val b = xyp / xxp val se = math.sqrt((yyp / xxp - b * b) / d) val t = b / se val p = 2 * T.cumulative(-math.abs(t), d, true, false) Annotation(b, se, t, p) } }
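fit projects x and y off the covariate space (via qt), then computes the slope, its standard error and a t statistic. A minimal numeric sketch of the same algebra without the projection step, i.e. a regression through the origin on four points; the p-value is omitted because it needs jdistlib's T distribution:

import breeze.linalg.DenseVector

object SimpleRegressionSketch {
  def main(args: Array[String]): Unit = {
    val x = DenseVector(1.0, 2.0, 3.0, 4.0)
    val y = DenseVector(2.1, 3.9, 6.2, 7.8)
    val d = x.length - 1                                    // residual degrees of freedom
    val b = (x dot y) / (x dot x)                           // beta
    val se = math.sqrt(((y dot y) / (x dot x) - b * b) / d)
    val t = b / se
    println("beta = " + b + ", se = " + se + ", t = " + t)
  }
}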
Example 65
Source File: package.scala From hail with MIT License | 5 votes |
package is.hail import is.hail.stats._ import breeze.linalg.{Vector, DenseVector, max, sum} import breeze.numerics._ import is.hail.utils._ package object experimental { def findMaxAC(af: Double, an: Int, ci: Double = .95): Int = { if (af == 0) 0 else { val quantile_limit = ci // ci for one-sided, 1-(1-ci)/2 for two-sided val max_ac = qpois(quantile_limit, an * af) max_ac } } def calcFilterAlleleFreq(ac: Int, an: Int, ci: Double = .95, lower: Double = 1e-10, upper: Double = 2, tol: Double = 1e-7, precision: Double = 1e-6): Double = { if (ac <= 1 || an == 0) // FAF should not be calculated on singletons 0.0 else { var f = (af: Double) => ac.toDouble - 1 - qpois(ci, an.toDouble * af) val root = uniroot(f, lower, upper, tol) val rounder = 1d / (precision / 100d) var max_af = math.round(root.getOrElse(0.0) * rounder) / rounder while (findMaxAC(max_af, an, ci) < ac) { max_af += precision } max_af - precision } } def calcFilterAlleleFreq(ac: Int, an: Int, ci: Double): Double = calcFilterAlleleFreq(ac, an, ci, lower = 1e-10, upper = 2, tol = 1e-7, precision = 1e-6) def haplotypeFreqEM(gtCounts : IndexedSeq[Int]) : IndexedSeq[Double] = { assert(gtCounts.size == 9, "haplotypeFreqEM requires genotype counts for the 9 possible genotype combinations.") val _gtCounts = new DenseVector(gtCounts.toArray) val nSamples = sum(_gtCounts) //Needs some non-ref samples to compute if(_gtCounts(0) >= nSamples){ return FastIndexedSeq(_gtCounts(0),0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0)} val nHaplotypes = 2.0*nSamples.toDouble val const_counts = new DenseVector(Array[Double]( 2.0*_gtCounts(0) + _gtCounts(1) + _gtCounts(3), //n.AB 2.0*_gtCounts(6) + _gtCounts(3) + _gtCounts(7), //n.Ab 2.0*_gtCounts(2) + _gtCounts(1) + _gtCounts(5), //n.aB 2.0*_gtCounts(8) + _gtCounts(5) + _gtCounts(7) //n.ab )) //Initial estimate with AaBb contributing equally to each haplotype var p_next = (const_counts +:+ new DenseVector(Array.fill[Double](4)(_gtCounts(4)/2.0))) /:/ nHaplotypes var p_cur = p_next +:+ 1.0 //EM while(max(abs(p_next -:- p_cur)) > 1e-7){ p_cur = p_next p_next = (const_counts +:+ (new DenseVector(Array[Double]( p_cur(0)*p_cur(3), //n.AB p_cur(1)*p_cur(2), //n.Ab p_cur(1)*p_cur(2), //n.aB p_cur(0)*p_cur(3) //n.ab )) * (_gtCounts(4) / ((p_cur(0)*p_cur(3))+(p_cur(1)*p_cur(2))))) ) / nHaplotypes } return (p_next *:* nHaplotypes).toArray.toFastIndexedSeq } }
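findMaxAC treats the allele count as Poisson with mean an * af and takes an upper quantile of that distribution. A plain-Scala stand-in for the qpois call (hail's own qpois is assumed above, not reproduced): walk the Poisson CDF until it reaches the requested quantile. This is fine for moderate means; math.exp(-lambda) underflows for very large lambda.

object PoissonQuantileSketch {
  def qpois(quantile: Double, lambda: Double): Int = {
    var k = 0
    var pmf = math.exp(-lambda)   // P(X = 0)
    var cdf = pmf
    while (cdf < quantile) {
      k += 1
      pmf *= lambda / k           // P(X = k) = P(X = k - 1) * lambda / k
      cdf += pmf
    }
    k                             // smallest k with P(X <= k) >= quantile
  }

  def main(args: Array[String]): Unit = {
    // An allele frequency of 1% over 10,000 alleles gives lambda = 100.
    println(qpois(0.95, 100.0))   // about 117: the 0.95 bound on the allele count
  }
}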
Example 66
Source File: LocalKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 67
Source File: SparkLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 68
Source File: LocalFileLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println