breeze.linalg.DenseVector Scala Examples
The following examples show how to use breeze.linalg.DenseVector.
Each example is taken from an open-source project; the source file, project, and license are noted above it.
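Before the project examples, a minimal, self-contained sketch of common DenseVector operations may help for orientation. It is illustrative only (not taken from any of the projects below); it assumes Breeze is on the classpath, and the object name DenseVectorBasics is hypothetical.

import breeze.linalg.DenseVector

object DenseVectorBasics {
  def main(args: Array[String]): Unit = {
    val v = DenseVector(1.0, 2.0, 3.0)        // construct from literal values
    val w = DenseVector.zeros[Double](3)      // zero vector of length 3
    w(0) = 4.0                                // update a single element
    val sum = v + w                           // element-wise addition
    val scaled = v * 2.0                      // multiply every element by a scalar
    val d = v dot w                           // inner product, returns a Double
    println(s"sum = $sum, scaled = $scaled, dot = $d")
  }
}

The project examples below combine these basic operations with Spark RDDs, statistical shape models, regression code, and other application logic.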
Example 1
Source File: SparkHdfsLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration
import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

object SparkHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {
    if (args.length < 2) {
      System.err.println("Usage: SparkHdfsLR <file> <iters>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR")
    val inputPath = args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache()
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
Example 2
Source File: Integrator.scala From scalismo with Apache License 2.0
package scalismo.numerics

import breeze.linalg.DenseVector
import scalismo.common.{Scalar, VectorField}
import scalismo.geometry._
import scalismo.image.ScalarImage

case class Integrator[D: NDSpace](sampler: Sampler[D]) {

  def integrateScalar[A: Scalar](img: ScalarImage[D, A]): A = {
    integrateScalar(img.liftValues)
  }

  def integrateScalar[A: Scalar](f: Function1[Point[D], Option[A]]): A = {
    val scalar = Scalar[A]
    val zero = scalar.fromInt(0)
    val samples = sampler.sample
    val sum = samples.par.map {
      case (pt, p) => scalar.toDouble(f(pt).getOrElse(zero)) * (1.0 / p.toFloat)
    }.sum
    scalar.fromDouble(sum / samples.size)
  }

  def integrateVector[DO: NDSpace](img: VectorField[D, DO]): EuclideanVector[DO] = {
    integrateVector(img.liftValues)
  }

  def integrateVector[DO: NDSpace](f: Function1[Point[D], Option[EuclideanVector[DO]]]): EuclideanVector[DO] = {
    val samples = sampler.sample
    val zeroVector = EuclideanVector.zeros[DO]
    val sum = samples.par
      .map { case (pt, p) => f(pt).getOrElse(zeroVector) * (1.0 / p) }
      .foldLeft(zeroVector)((a, b) => { a + b })
    sum * (1f / (sampler.numberOfPoints - 1))
  }

  def integrateVector(f: Function1[Point[D], Option[DenseVector[Double]]], dimensionality: Int): DenseVector[Double] = {
    val samples = sampler.sample
    val zeroVector = DenseVector.zeros[Double](dimensionality)
    val sum = samples.par
      .map { case (pt, p) => f(pt).getOrElse(zeroVector) * (1.0 / p) }
      .foldLeft(zeroVector)((a, b) => { a + b })
    sum * (1.0 / (sampler.numberOfPoints - 1))
  }
}
Example 3
Source File: Registration.scala From scalismo with Apache License 2.0
package scalismo.registration

import breeze.linalg.DenseVector
import scalismo.numerics._
import scalismo.registration.TransformationSpace.ParameterVector

  def iterator(initialParameters: DenseVector[Double]): Iterator[RegistrationState] = {

    val costFunction = new CostFunction {
      def onlyValue(params: ParameterVector): Double = {
        metric.value(params) + regularizationWeight * regularizer.value(params)
      }
      def apply(params: ParameterVector): (Double, DenseVector[Double]) = {
        // compute the value of the cost function
        val metricValueAndDerivative = metric.valueAndDerivative(params)
        val value = metricValueAndDerivative.value + regularizationWeight * regularizer.value(params)
        val dR = regularizer.takeDerivative(params)
        (value, metricValueAndDerivative.derivative + dR * regularizationWeight)
      }
    }

    optimizer.iterations(initialParameters, costFunction).map { optimizerState =>
      RegistrationState(optimizerState.value, optimizerState.parameters, optimizerState)
    }
  }
}
Example 4
Source File: GaussianProcessTransformationSpace.scala From scalismo with Apache License 2.0
package scalismo.registration

import breeze.linalg.{DenseMatrix, DenseVector}
import scalismo.geometry.EuclideanVector.VectorVectorizer
import scalismo.geometry.{EuclideanVector, Point}
import scalismo.registration.TransformationSpace.ParameterVector
import scalismo.statisticalmodel.LowRankGaussianProcess
import scalismo.statisticalmodel.LowRankGaussianProcess.Eigenpair

class GaussianProcessTransformationSpace[D] private (gp: LowRankGaussianProcess[D, EuclideanVector[D]])(
  implicit vectorizer: VectorVectorizer[D]
) extends TransformationSpace[D] {

  override type T = GaussianProcessTransformation[D]

  override def identityTransformParameters = DenseVector.zeros[Double](parametersDimensionality)

  override def parametersDimensionality = gp.rank

  override def transformForParameters(p: ParameterVector) = GaussianProcessTransformation[D](gp, p)

  override def takeDerivativeWRTParameters(p: ParameterVector) = { (x: Point[D]) =>
    {
      val dim = x.dimensionality
      val J = DenseMatrix.zeros[Double](dim, gp.klBasis.size)
      (0 until gp.rank).map(i => {
        val Eigenpair(lambda_i, phi_i) = gp.klBasis(i)
        J(::, i) := vectorizer.vectorize(phi_i(x)) * math.sqrt(lambda_i)
      })
      J
    }
  }
}

class GaussianProcessTransformation[D] private (gp: LowRankGaussianProcess[D, EuclideanVector[D]], alpha: ParameterVector)
    extends ParametricTransformation[D] {

  val instance = gp.instance(alpha)
  val parameters = alpha

  override val domain = gp.domain

  override val f = (x: Point[D]) => {
    val newPointAsVector = instance(x)
    x + newPointAsVector
  }
}

object GaussianProcessTransformation {
  def apply[D](gp: LowRankGaussianProcess[D, EuclideanVector[D]], alpha: TransformationSpace.ParameterVector) = {
    new GaussianProcessTransformation[D](gp, alpha)
  }
}

object GaussianProcessTransformationSpace {
  def apply[D](gp: LowRankGaussianProcess[D, EuclideanVector[D]])(implicit vectorizer: VectorVectorizer[D]) = {
    new GaussianProcessTransformationSpace[D](gp)
  }
}
Example 5
Source File: StatisticalVolumeIntensityModel.scala From scalismo with Apache License 2.0
package scalismo.statisticalmodel.experimental

import breeze.linalg.DenseVector
import scalismo.common._
import scalismo.geometry._
import scalismo.mesh._
import scalismo.statisticalmodel.DiscreteLowRankGaussianProcess
import scalismo.utils.Random

import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

trait StatisticalVolumeIntensityModel[S] {

  def referenceMeshField: ScalarVolumeMeshField[S]

  def shape: StatisticalVolumeMeshModel

  def intensity: DiscreteLowRankGaussianProcess[_3D, UnstructuredPointsDomain[_3D], S]

  def mean: ScalarVolumeMeshField[S]

  def instance(coefficients: SVIMCoefficients): ScalarVolumeMeshField[S]

  def sample()(implicit rnd: Random): ScalarVolumeMeshField[S]

  def zeroCoefficients: SVIMCoefficients
}

object StatisticalVolumeIntensityModel {

  def apply[S: Scalar: TypeTag: ClassTag](
    referenceMeshField: ScalarVolumeMeshField[S],
    shape: StatisticalVolumeMeshModel,
    intensity: DiscreteLowRankGaussianProcess[_3D, UnstructuredPointsDomain[_3D], S]
  ): SVIM[S] = {
    SVIM(referenceMeshField, shape, intensity)
  }
}

case class SVIM[S: Scalar: TypeTag: ClassTag](
  referenceMeshField: ScalarVolumeMeshField[S],
  shape: StatisticalVolumeMeshModel,
  intensity: DiscreteLowRankGaussianProcess[_3D, UnstructuredPointsDomain[_3D], S]
) extends StatisticalVolumeIntensityModel[S] {

  override def mean: ScalarVolumeMeshField[S] = {
    ScalarVolumeMeshField(shape.mean, warpReferenceIntensity(intensity.mean.data))
  }

  override def instance(coefficients: SVIMCoefficients): ScalarVolumeMeshField[S] = {
    ScalarVolumeMeshField(shape.instance(coefficients.shape),
      warpReferenceIntensity(intensity.instance(coefficients.intensity).data))
  }

  override def sample()(implicit rnd: Random): ScalarVolumeMeshField[S] = {
    ScalarVolumeMeshField(shape.sample(), warpReferenceIntensity(intensity.sample().data))
  }

  override def zeroCoefficients: SVIMCoefficients = SVIMCoefficients(
    DenseVector.zeros[Double](shape.rank),
    DenseVector.zeros[Double](intensity.rank)
  )

  def truncate(shapeComps: Int, colorComps: Int): SVIM[S] = {
    require(shapeComps >= 0 && shapeComps <= shape.rank, "illegal number of reduced shape components")
    require(colorComps >= 0 && colorComps <= intensity.rank, "illegal number of reduced color components")

    SVIM(
      referenceMeshField,
      shape.truncate(shapeComps),
      intensity.truncate(colorComps)
    )
  }

  private def warpReferenceIntensity(scalarData: IndexedSeq[S]): ScalarArray[S] = {
    ScalarArray[S](
      referenceMeshField.data
        .zip(ScalarArray[S](scalarData.toArray))
        .map { case (r, s) => Scalar[S].plus(r, s) }
        .toArray
    )
  }
}
Example 6
Source File: SVIMCoefficients.scala From scalismo with Apache License 2.0
package scalismo.statisticalmodel.experimental

import breeze.linalg.DenseVector

case class SVIMCoefficients(shape: DenseVector[Double], intensity: DenseVector[Double]) {

  def *(f: Float): SVIMCoefficients = this * f.toDouble
  def *(d: Double): SVIMCoefficients = copy(shape = shape * d, intensity = intensity * d)

  def +(other: SVIMCoefficients): SVIMCoefficients =
    copy(shape = shape + other.shape, intensity = intensity + other.intensity)

  def -(other: SVIMCoefficients): SVIMCoefficients =
    copy(shape = shape - other.shape, intensity = intensity - other.intensity)
}

object SVIMCoefficients {
  def apply(shape: IndexedSeq[Double], intensity: IndexedSeq[Double]) =
    new SVIMCoefficients(DenseVector(shape.toArray), DenseVector(intensity.toArray))

  def zeros(shapeComponents: Int, intensityComponents: Int): SVIMCoefficients = {
    new SVIMCoefficients(DenseVector.zeros(shapeComponents), DenseVector.zeros(intensityComponents))
  }
}
Example 7
Source File: convertOutput.scala From SparkAndMPIFactorizations with MIT License
package org.apache.spark.mllib.linalg.distributed import breeze.linalg.{DenseMatrix, DenseVector} import java.io.{DataInputStream, FileInputStream, FileWriter, File} object ConvertDump { type DM = DenseMatrix[Double] type DDV = DenseVector[Double] type DIV = DenseVector[Int] def loadDoubleVector( inf: DataInputStream) : DDV = { val len = inf.readInt() val v = DenseVector.zeros[Double](len) for (i <- 0 until len) { v(i) = inf.readDouble() } v } def loadIntVector( inf: DataInputStream) : DIV = { val len = inf.readInt() val v = DenseVector.zeros[Int](len) for (i <- 0 until len) { v(i) = inf.readInt() } v } def loadMatrix( inf: DataInputStream) : DM = { val (r,c) = Tuple2(inf.readInt(), inf.readInt()) val m = DenseMatrix.zeros[Double](r,c) for (i <- 0 until r; j <- 0 until c) { m(i,j) = inf.readDouble() } m } def loadDump(infname: String) : Tuple4[DM, DM, DDV, DDV] = { val inf = new DataInputStream( new FileInputStream(infname)) val eofsU = loadMatrix(inf) val eofsV = loadMatrix(inf) val evals = loadDoubleVector(inf) val mean = loadDoubleVector(inf) inf.close() (eofsU, eofsV, evals, mean) } def writeDoubleMatrix(mat: DM, fn: String) = { val writer = new FileWriter(new File(fn)) writer.write("%%MatrixMarket matrix coordinate real general\n") writer.write(s"${mat.rows} ${mat.cols} ${mat.rows*mat.cols}\n") for(i <- 0 until mat.rows) { for(j <- 0 until mat.cols) { writer.write(f"${i+1} ${j+1} ${mat(i, j)}%f\n") } } writer.close } def writeIntVector(vec: DIV, fn: String) = { val mat = vec.asDenseMatrix val writer = new FileWriter(new File(fn)) writer.write("%%MatrixMarket matrix coordinate real general\n") writer.write(s"${mat.rows} ${mat.cols} ${mat.rows*mat.cols}\n") for(i <- 0 until mat.rows) { for(j <- 0 until mat.cols) { writer.write(s"${i+1} ${j+1} ${mat(i, j)}\n") } } writer.close } def main(args: Array[String]) { val (eofsU, eofsV, eofsS, mean) = loadDump(args(0)) writeDoubleMatrix(eofsU, s"${args(1)}/colEOFs") writeDoubleMatrix(eofsV, s"${args(1)}/rowEOFs") writeDoubleMatrix(eofsS.asDenseMatrix, s"${args(1)}/evalEOFs") writeDoubleMatrix(mean.asDenseMatrix, s"${args(1)}/rowMeans") } }
Example 8
Source File: LocalKMeans.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData = { def generatePoint(i: Int) = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 9
Source File: SparkLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData = { def generatePoint(i: Int) = { val y = if(i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 10
Source File: LocalFileLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 11
Source File: SparkKMeans.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42).toArray var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((x1, y1), (x2, y2)) => (x1 + x2, y1 + y2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } }
Example 12
Source File: LocalLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData = { def generatePoint(i: Int) = { val y = if(i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } }
Example 13
Source File: MetropolisHastingsTests.scala From scalismo with Apache License 2.0
package scalismo.sampling import breeze.linalg.{DenseMatrix, DenseVector} import scalismo.ScalismoTestSuite import scalismo.sampling.algorithms.{MetropolisHastings, MetropolisHastingsWithPrefetching} import scalismo.sampling.evaluators.GaussianEvaluator import scalismo.statisticalmodel.MultivariateNormalDistribution class MetropolisHastingsTests extends ScalismoTestSuite { implicit val rng = scalismo.utils.Random(42) val gaussianProposal = new ProposalGenerator[Double] with TransitionProbability[Double] with SymmetricTransitionRatio[Double] { val sdev = 1.0 override def logTransitionProbability(from: Double, to: Double): Double = GaussianEvaluator.logDensity(to, from, sdev) } describe("The metropolis-hastings algorithm") { it("approximates the mean and covariance of a normal distribution") { val mean = 1.0 val sdev = 3.5 val evaluator = GaussianEvaluator(mean, sdev) val mh = MetropolisHastings(gaussianProposal, evaluator) val samples = mh.iterator(0.0).drop(100000).take(100000).toIndexedSeq val approximatedMean = samples.sum / samples.size val approximatedVariance = samples.map(sample => (sample - mean) * (sample - mean)).sum / samples.size approximatedMean should be(mean +- 1e-1) Math.sqrt(approximatedVariance) should be(sdev +- 5e-1) } } describe("The metropolis-hastings algorithm with prefetching") { it("approximates the mean and covariance of a normal distribution") { val mean = 1.0 val sdev = 3.5 val evaluator = GaussianEvaluator(mean, sdev) val mh = MetropolisHastingsWithPrefetching(gaussianProposal, evaluator) val samples = mh.iterator(0.0).drop(100000).take(100000).toIndexedSeq val approximatedMean = samples.sum / samples.size val approximatedVariance = samples.map(sample => (sample - mean) * (sample - mean)).sum / samples.size approximatedMean should be(mean +- 1e-1) Math.sqrt(approximatedVariance) should be(sdev +- 5e-1) } } }
Example 14
Source File: SparkTachyonHdfsLR.scala From learning-spark with Apache License 2.0
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 15
Source File: LocalKMeans.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 16
Source File: SparkLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 17
Source File: LocalFileLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalFileLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 18
Source File: SparkKMeans.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42).toArray var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } } // scalastyle:on println
Example 19
Source File: LocalLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{Vector, DenseVector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 20
Source File: SparkHdfsLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo object SparkHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkHdfsLR") val inputPath = args(0) val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 21
Source File: SparkTachyonHdfsLR.scala From BigDatalog with Apache License 2.0
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.hadoop.conf.Configuration import org.apache.spark._ import org.apache.spark.scheduler.InputFormatInfo import org.apache.spark.storage.StorageLevel object SparkTachyonHdfsLR { val D = 10 // Numer of dimensions val rand = new Random(42) def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def main(args: Array[String]) { showWarning() val inputPath = args(0) val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR") val conf = new Configuration() val sc = new SparkContext(sparkConf, InputFormatInfo.computePreferredLocations( Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath)) )) val lines = sc.textFile(inputPath) val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } } // scalastyle:on println
Example 22
Source File: ExactSimilarityReference.scala From elastiknn with Apache License 2.0
package com.klibisz.elastiknn.testing import breeze.linalg.DenseVector import com.klibisz.elastiknn.api.Vec import breeze.linalg.functions._ object ExactSimilarityReference { val L2: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => { 1 / (1 + euclideanDistance(new DenseVector(v1.values), new DenseVector(v2.values))) } val L1: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => { 1 / (1 + manhattanDistance(new DenseVector(v1.values), new DenseVector(v2.values))) } val Angular: (Vec.DenseFloat, Vec.DenseFloat) => Double = (v1: Vec.DenseFloat, v2: Vec.DenseFloat) => { 1 + (1 - cosineDistance(new DenseVector(v1.values.map(_.toDouble)), new DenseVector(v2.values.map(_.toDouble)))) } val Hamming: (Vec.SparseBool, Vec.SparseBool) => Double = (v1: Vec.SparseBool, v2: Vec.SparseBool) => { val d1 = new Array[Boolean](v1.totalIndices) val d2 = new Array[Boolean](v2.totalIndices) v1.trueIndices.foreach(i => d1.update(i, true)) v2.trueIndices.foreach(i => d2.update(i, true)) d1.zip(d2).count { case (a, b) => a == b } * 1d / d1.length } val Jaccard = (v1: Vec.SparseBool, v2: Vec.SparseBool) => { val isec = v1.trueIndices.intersect(v2.trueIndices).length val denom = v1.trueIndices.length + v2.trueIndices.length - isec if (isec == 0 && denom == 0) 1d else if (denom > 0) isec * 1d / denom else 0d } }
Example 23
Source File: min-ppl.scala From blog with Apache License 2.0
object MinPpl { import breeze.stats.{distributions => bdist} import breeze.linalg.DenseVector implicit val numParticles = 300 case class Particle[T](v: T, lw: Double) { // value and log-weight def map[S](f: T => S): Particle[S] = Particle(f(v), lw) } trait Prob[T] { val particles: Vector[Particle[T]] def map[S](f: T => S): Prob[S] = Empirical(particles map (_ map f)) def flatMap[S](f: T => Prob[S]): Prob[S] = { Empirical((particles map (p => { f(p.v).particles.map(psi => Particle(psi.v, p.lw + psi.lw)) })).flatten).resample } def resample(implicit N: Int): Prob[T] = { val lw = particles map (_.lw) val mx = lw reduce (math.max(_,_)) val rw = lw map (lwi => math.exp(lwi - mx)) val law = mx + math.log(rw.sum/(rw.length)) val ind = bdist.Multinomial(DenseVector(rw.toArray)).sample(N) val newParticles = ind map (i => particles(i)) Empirical(newParticles.toVector map (pi => Particle(pi.v, law))) } def cond(ll: T => Double): Prob[T] = Empirical(particles map (p => Particle(p.v, p.lw + ll(p.v)))) def empirical: Vector[T] = resample.particles.map(_.v) } case class Empirical[T](particles: Vector[Particle[T]]) extends Prob[T] def unweighted[T](ts: Vector[T], lw: Double = 0.0): Prob[T] = Empirical(ts map (Particle(_, lw))) trait Dist[T] extends Prob[T] { def ll(obs: T): Double def ll(obs: Seq[T]): Double = obs map (ll) reduce (_+_) def fit(obs: Seq[T]): Prob[T] = Empirical(particles map (p => Particle(p.v, p.lw + ll(obs)))) def fitQ(obs: Seq[T]): Prob[T] = Empirical(Vector(Particle(obs.head, ll(obs)))) def fit(obs: T): Prob[T] = fit(List(obs)) def fitQ(obs: T): Prob[T] = fitQ(List(obs)) } case class Normal(mu: Double, v: Double)(implicit N: Int) extends Dist[Double] { lazy val particles = unweighted(bdist.Gaussian(mu, math.sqrt(v)).sample(N).toVector).particles def ll(obs: Double) = bdist.Gaussian(mu, math.sqrt(v)).logPdf(obs) } case class Gamma(a: Double, b: Double)(implicit N: Int) extends Dist[Double] { lazy val particles = unweighted(bdist.Gamma(a, 1.0/b).sample(N).toVector).particles def ll(obs: Double) = bdist.Gamma(a, 1.0/b).logPdf(obs) } case class Poisson(mu: Double)(implicit N: Int) extends Dist[Int] { lazy val particles = unweighted(bdist.Poisson(mu).sample(N).toVector).particles def ll(obs: Int) = bdist.Poisson(mu).logProbabilityOf(obs) } }
Example 24
Source File: min-ppl.scala From blog with Apache License 2.0
object MinPpl2 { import breeze.stats.{distributions => bdist} import breeze.linalg.DenseVector import cats._ import cats.implicits._ implicit val numParticles = 2000 case class Particle[T](v: T, lw: Double) { // value and log-weight def map[S](f: T => S): Particle[S] = Particle(f(v), lw) def flatMap[S](f: T => Particle[S]): Particle[S] = { val ps = f(v) Particle(ps.v, lw + ps.lw) } } implicit val particleMonad = new Monad[Particle] { def pure[T](t: T): Particle[T] = Particle(t, 0.0) def flatMap[T,S](pt: Particle[T])(f: T => Particle[S]): Particle[S] = pt.flatMap(f) def tailRecM[T,S](t: T)(f: T => Particle[Either[T,S]]): Particle[S] = ??? } trait Prob[T] { val particles: Vector[Particle[T]] def draw: Particle[T] def mapP[S](f: T => Particle[S]): Prob[S] = Empirical(particles map (_ flatMap f)) def map[S](f: T => S): Prob[S] = mapP(v => Particle(f(v), 0.0)) def flatMap[S](f: T => Prob[S]): Prob[S] = mapP(f(_).draw) def resample(implicit N: Int): Prob[T] = { val lw = particles map (_.lw) val mx = lw reduce (math.max(_,_)) val rw = lw map (lwi => math.exp(lwi - mx)) val law = mx + math.log(rw.sum/(rw.length)) val ind = bdist.Multinomial(DenseVector(rw.toArray)).sample(N) val newParticles = ind map (i => particles(i)) Empirical(newParticles.toVector map (pi => Particle(pi.v, law))) } def cond(ll: T => Double): Prob[T] = mapP(v => Particle(v, ll(v))) def empirical: Vector[T] = resample.particles.map(_.v) } implicit val probMonad = new Monad[Prob] { def pure[T](t: T): Prob[T] = Empirical(Vector(Particle(t, 0.0))) def flatMap[T,S](pt: Prob[T])(f: T => Prob[S]): Prob[S] = pt.flatMap(f) def tailRecM[T,S](t: T)(f: T => Prob[Either[T,S]]): Prob[S] = ??? } case class Empirical[T](particles: Vector[Particle[T]]) extends Prob[T] { def draw: Particle[T] = { val lw = particles map (_.lw) val mx = lw reduce (math.max(_,_)) val rw = lw map (lwi => math.exp(lwi - mx)) val law = mx + math.log(rw.sum/(rw.length)) val idx = bdist.Multinomial(DenseVector(rw.toArray)).draw Particle(particles(idx).v, law) } } def unweighted[T](ts: Vector[T], lw: Double = 0.0): Prob[T] = Empirical(ts map (Particle(_, lw))) trait Dist[T] extends Prob[T] { def ll(obs: T): Double def ll(obs: Seq[T]): Double = obs map (ll) reduce (_+_) def fit(obs: Seq[T]): Prob[T] = mapP(v => Particle(v, ll(obs))) def fitQ(obs: Seq[T]): Prob[T] = Empirical(Vector(Particle(obs.head, ll(obs)))) def fit(obs: T): Prob[T] = fit(List(obs)) def fitQ(obs: T): Prob[T] = fitQ(List(obs)) } case class Normal(mu: Double, v: Double)(implicit N: Int) extends Dist[Double] { lazy val particles = unweighted(bdist.Gaussian(mu, math.sqrt(v)). sample(N).toVector).particles def draw = Particle(bdist.Gaussian(mu, math.sqrt(v)).draw, 0.0) def ll(obs: Double) = bdist.Gaussian(mu, math.sqrt(v)).logPdf(obs) } case class Gamma(a: Double, b: Double)(implicit N: Int) extends Dist[Double] { lazy val particles = unweighted(bdist.Gamma(a, 1.0/b). sample(N).toVector).particles def draw = Particle(bdist.Gamma(a, 1.0/b).draw, 0.0) def ll(obs: Double) = bdist.Gamma(a, 1.0/b).logPdf(obs) } case class Poisson(mu: Double)(implicit N: Int) extends Dist[Int] { lazy val particles = unweighted(bdist.Poisson(mu). sample(N).toVector).particles def draw = Particle(bdist.Poisson(mu).draw, 0.0) def ll(obs: Int) = bdist.Poisson(mu).logProbabilityOf(obs) } } // eof
Example 25
Source File: Ledger.scala From deepspark with GNU General Public License v2.0
package com.github.nearbydelta.deepspark.word.layer import breeze.linalg.DenseVector import com.esotericsoftware.kryo.Kryo import com.esotericsoftware.kryo.io.{Input, Output} import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.InputLayer import com.github.nearbydelta.deepspark.word._ import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.reflect.{ClassTag, classTag} trait Ledger[OutInfo] extends InputLayer[Array[Int], OutInfo] { @transient implicit override protected val evidenceI: ClassTag[Array[Int]] = classTag[Array[Int]] @transient var algorithm: LedgerAlgorithm = _ var bcModel: Broadcast[LedgerModel] = _ @transient var builder: LedgerBuilder = _ var dimension: Int = 0 @transient var model: LedgerModel = _ protected var padID = -1 def withModel(model: LedgerModel, builder: LedgerBuilder): this.type = { this.model = model this.builder = builder this.padID = model.padID this.dimension = model.dimension this.algorithm = builder.getUpdater(this.model.vectors) this } protected def pad = if (padID == -1) null else if (bcModel != null) vectorOf(bcModel.value.padID) else vectorOf(padID) protected def updateWord(word: Int, dx: DataVec): Unit = if (word != -1 && algorithm != null) { val vec = algorithm.delta.getOrElseUpdate(word, DenseVector.zeros[Double](dimension)) vec += dx } protected def vectorOf(str: Int) = if (bcModel != null) bcModel.value.vectorAt(str) else model.vectorAt(str) override def broadcast(sc: SparkContext): Unit = { bcModel = sc.broadcast(model) } override def loss: Double = algorithm.loss override def read(kryo: Kryo, input: Input): Unit = { builder = kryo.readClassAndObject(input).asInstanceOf[LedgerBuilder] val model = new LedgerModel model.read(kryo, input) require(model.size > 0, "Model is empty!") withModel(model, builder) super.read(kryo, input) } override def unbroadcast(): Unit = { bcModel.unpersist(blocking = false) } @deprecated override def withInput(in: Int): this.type = this @deprecated override def withOutput(out: Int): this.type = this override def write(kryo: Kryo, output: Output): Unit = { kryo.writeClassAndObject(output, builder) model.write(kryo, output) super.write(kryo, output) } }
Example 26
Source File: BreezeSpec.scala From scio with Apache License 2.0
package com.spotify.scio.extra import breeze.linalg.{DenseMatrix, DenseVector, SparseVector} import breeze.stats.distributions.Rand import com.spotify.scio.extra.Breeze._ import com.twitter.algebird.Semigroup import org.scalacheck._ trait BreezeSpec[M[_], T] extends PropertySpec { val dimension = 10 val rows = 20 val cols = 10 val fRand = Rand.uniform.map(_.toFloat) val m: Gen[M[T]] def ms: Gen[List[M[T]]] = Gen.listOf[M[T]](m) def plus(x: M[T], y: M[T])(implicit sg: Semigroup[M[T]]): M[T] = sg.plus(x, y) def sumOption(xs: Iterable[M[T]])(implicit sg: Semigroup[M[T]]): Option[M[T]] = sg.sumOption(xs) } class FloatDenseVectorSpec extends BreezeSpec[DenseVector, Float] { val m = Gen.const(dimension).map(DenseVector.rand[Float](_, fRand)) property("plus") { forAll(m, m)((x, y) => plus(x, y) == x + y) } property("sumOption") { forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _)) } } class DoubleDenseVectorSpec extends BreezeSpec[DenseVector, Double] { val m = Gen.const(dimension).map(DenseVector.rand[Double](_)) property("plus") { forAll(m, m)((x, y) => plus(x, y) == x + y) } property("sumOption") { forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _)) } } class FloatDenseMatrixSpec extends BreezeSpec[DenseMatrix, Float] { val m = Gen.const((rows, cols)).map { case (r, c) => DenseMatrix.rand[Float](r, c, fRand) } property("plus") { forAll(m, m)((x, y) => plus(x, y) == x + y) } property("sumOption") { forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _)) } } class DoubleDenseMatrixSpec extends BreezeSpec[DenseMatrix, Double] { val m = Gen.const((rows, cols)).map { case (r, c) => DenseMatrix.rand[Double](r, c) } property("plus") { forAll(m, m)((x, y) => plus(x, y) == x + y) } property("sumOption") { forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _)) } } class FloatSparseVectorSpec extends BreezeSpec[SparseVector, Float] { val m = Gen .const(dimension) .map(d => SparseVector(DenseVector.rand[Float](d, fRand).data)) property("plus") { forAll(m, m)((x, y) => plus(x, y) == x + y) } property("sumOption") { forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _)) } } class DoubleSparseVectorSpec extends BreezeSpec[SparseVector, Double] { val m = Gen .const(dimension) .map(d => SparseVector(DenseVector.rand[Double](d).data)) property("plus") { forAll(m, m)((x, y) => plus(x, y) == x + y) } property("sumOption") { forAll(ms)(xs => sumOption(xs) == xs.reduceLeftOption(_ + _)) } }
Example 27
Source File: PassiveAggressiveBinaryModelEvaluation.scala From flink-parameter-server with Apache License 2.0
package hu.sztaki.ilab.ps.test.utils import breeze.linalg.{DenseVector, SparseVector} import hu.sztaki.ilab.ps.passive.aggressive.algorithm.PassiveAggressiveBinaryAlgorithm import org.slf4j.LoggerFactory class PassiveAggressiveBinaryModelEvaluation object PassiveAggressiveBinaryModelEvaluation { private val log = LoggerFactory.getLogger(classOf[PassiveAggressiveBinaryModelEvaluation]) def accuracy(model: DenseVector[Double], testLines: Traversable[(SparseVector[Double], Option[Boolean])], featureCount: Int, pac: PassiveAggressiveBinaryAlgorithm): Double = { var tt = 0 var ff = 0 var tf = 0 var ft = 0 var cnt = 0 testLines.foreach { case (vector, label) => label match { case Some(lab) => val real = lab val predicted = pac.predict(vector, model) (real, predicted) match { case (true, true) => tt +=1 case (false, false) => ff +=1 case (true, false) => tf +=1 case (false, true) => ft +=1 } cnt += 1 case _ => throw new IllegalStateException("Labels shold not be missing.") } } val percent = ((tt + ff).toDouble / cnt) * 100 percent } }
Example 28
Source File: LinearRegressionExpr.scala From glow with Apache License 2.0
package io.projectglow.sql.expressions import breeze.linalg.DenseVector import org.apache.spark.TaskContext import org.apache.spark.sql.SQLUtils import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.codegen.{CodegenContext, ExprCode} import org.apache.spark.sql.catalyst.expressions.{Expression, ImplicitCastInputTypes, TernaryExpression} import org.apache.spark.sql.catalyst.util.ArrayData import org.apache.spark.sql.types._ object LinearRegressionExpr { private val matrixUDT = SQLUtils.newMatrixUDT() private val state = new ThreadLocal[CovariateQRContext] def doLinearRegression(genotypes: Any, phenotypes: Any, covariates: Any): InternalRow = { if (state.get() == null) { // Save the QR factorization of the covariate matrix since it's the same for every row state.set(CovariateQRContext.computeQR(matrixUDT.deserialize(covariates).toDense)) TaskContext.get().addTaskCompletionListener[Unit](_ => state.remove()) } LinearRegressionGwas.linearRegressionGwas( new DenseVector[Double](genotypes.asInstanceOf[ArrayData].toDoubleArray()), new DenseVector[Double](phenotypes.asInstanceOf[ArrayData].toDoubleArray()), state.get() ) } } case class LinearRegressionExpr( genotypes: Expression, phenotypes: Expression, covariates: Expression) extends TernaryExpression with ImplicitCastInputTypes { private val matrixUDT = SQLUtils.newMatrixUDT() override def dataType: DataType = StructType( Seq( StructField("beta", DoubleType), StructField("standardError", DoubleType), StructField("pValue", DoubleType))) override def inputTypes: Seq[DataType] = Seq(ArrayType(DoubleType), ArrayType(DoubleType), matrixUDT) override def children: Seq[Expression] = Seq(genotypes, phenotypes, covariates) override protected def nullSafeEval(genotypes: Any, phenotypes: Any, covariates: Any): Any = { LinearRegressionExpr.doLinearRegression(genotypes, phenotypes, covariates) } override def doGenCode(ctx: CodegenContext, ev: ExprCode): ExprCode = { nullSafeCodeGen( ctx, ev, (genotypes, phenotypes, covariates) => { s""" |${ev.value} = io.projectglow.sql.expressions.LinearRegressionExpr.doLinearRegression($genotypes, $phenotypes, $covariates); """.stripMargin } ) } }
Example 29
Source File: LikelihoodRatioTest.scala From glow with Apache License 2.0
package io.projectglow.sql.expressions import breeze.linalg.{DenseMatrix, DenseVector} import org.apache.spark.ml.linalg.{DenseMatrix => SparkDenseMatrix} import org.apache.spark.sql.Encoders import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType object LikelihoodRatioTest extends LogitTest { override type FitState = LRTFitState override def fitStatePerPhenotype: Boolean = true override val resultSchema: StructType = Encoders.product[LogitTestResults].schema override def init(phenotypes: Array[Double], covariates: SparkDenseMatrix): LRTFitState = { val nullX = new DenseMatrix(covariates.numRows, covariates.numCols, covariates.values) val y = new DenseVector(phenotypes) val nullFitState = new NewtonIterationsState(covariates.numRows, covariates.numCols) nullFitState.initFromMatrix(nullX, y) val nullFit = LogisticRegressionGwas.newtonIterations(nullX, y, nullX.copy, nullFitState) val fullFitState = new NewtonIterationsState(covariates.numRows, covariates.numCols + 1) val x = DenseMatrix.horzcat(nullX, DenseMatrix.zeros[Double](covariates.numRows, 1)) LRTFitState(x, x.copy, nullFit, fullFitState) } override def runTest( genotypes: DenseVector[Double], phenotypes: DenseVector[Double], fitState: LRTFitState): InternalRow = { fitState.x(::, -1) := genotypes fitState.newtonState.initFromMatrixAndNullFit(fitState.x, phenotypes, fitState.nullFit.args) if (!fitState.nullFit.converged) { return LogitTestResults.nanRow } val fullFit = LogisticRegressionGwas.newtonIterations( fitState.x, phenotypes, fitState.hessian, fitState.newtonState) if (!fullFit.converged) { return LogitTestResults.nanRow } val beta = fullFit.args.b(-1) LogisticRegressionGwas.makeStats( beta, fullFit.args.fisher, fullFit.logLkhd, fitState.nullFit.logLkhd) } } case class LRTFitState( x: DenseMatrix[Double], hessian: DenseMatrix[Double], nullFit: NewtonResult, newtonState: NewtonIterationsState )
Example 30
Source File: LinearRegressionGwas.scala From glow with Apache License 2.0
package io.projectglow.sql.expressions import breeze.linalg.DenseVector import org.apache.commons.math3.distribution.TDistribution import org.apache.commons.math3.util.FastMath import org.apache.spark.sql.catalyst.InternalRow import io.projectglow.common.GlowLogging case class RegressionStats(beta: Double, standardError: Double, pValue: Double) object LinearRegressionGwas extends GlowLogging { def runRegression( genotypes: DenseVector[Double], phenotypes: DenseVector[Double], covariateQRContext: CovariateQRContext): RegressionStats = { require( genotypes.length == phenotypes.length, "Number of samples differs between genotype and phenotype arrays") require( covariateQRContext.covQt.cols == genotypes.length, "Number of samples differs between genotype array and covariate matrix") val qtx = covariateQRContext.covQt * genotypes val qty = covariateQRContext.covQt * phenotypes val xdoty = (phenotypes dot genotypes) - (qty dot qtx) val xdotx = (genotypes dot genotypes) - (qtx dot qtx) val ydoty = (phenotypes dot phenotypes) - (qty dot qty) val beta = xdoty / xdotx val standardError = FastMath.sqrt((ydoty / xdotx - beta * beta) / covariateQRContext.degreesOfFreedom) // t-statistic val t = beta / standardError val tDist = new TDistribution(covariateQRContext.degreesOfFreedom) val pvalue = 2 * tDist.cumulativeProbability(-Math.abs(t)) RegressionStats(beta, standardError, pvalue) } def linearRegressionGwas( genotypes: DenseVector[Double], phenotypes: DenseVector[Double], covariateQR: CovariateQRContext): InternalRow = { val regressionStats = runRegression(genotypes, phenotypes, covariateQR) InternalRow(regressionStats.beta, regressionStats.standardError, regressionStats.pValue) } }
Example 31
Source File: SparkMLTestUtils.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml import scala.util.Random import breeze.linalg.DenseVector import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.mllib.random.{GammaGenerator, PoissonGenerator, StandardNormalGenerator} object SparkMLTestUtils { def generateGeneralizedLinearRegressionInput( intercept: Double, coefficients: Array[Double], xMean: Array[Double], xVariance: Array[Double], nPoints: Int, seed: Int, noiseLevel: Double, family: String, link: String): Seq[LabeledPoint] = { val rnd = new Random(seed) def rndElement(i: Int) = { (rnd.nextDouble() - 0.5) * math.sqrt(12.0 * xVariance(i)) + xMean(i) } val (generator, mean) = family match { case "gaussian" => (new StandardNormalGenerator, 0.0) case "poisson" => (new PoissonGenerator(1.0), 1.0) case "gamma" => (new GammaGenerator(1.0, 1.0), 1.0) } generator.setSeed(seed) (0 until nPoints).map { _ => val x = DenseVector(coefficients.indices.map(rndElement).toArray) val w = DenseVector(coefficients) val eta = w.dot(x) + intercept val mu = link match { case "identity" => eta case "log" => math.exp(eta) case "sqrt" => math.pow(eta, 2.0) case "inverse" => 1.0 / eta } val label = mu + noiseLevel * (generator.nextValue() - mean) // Return LabeledPoints with DenseVector LabeledPoint(label, Vectors.dense(x.data)) } } }
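A hypothetical call of the generator above (all parameter values here are illustrative, not taken from the aardpfark tests):

val labeledPoints = SparkMLTestUtils.generateGeneralizedLinearRegressionInput(
  intercept = 0.5,
  coefficients = Array(1.0, -2.0),
  xMean = Array(0.0, 0.0),
  xVariance = Array(1.0, 1.0),
  nPoints = 100,
  seed = 42,
  noiseLevel = 0.01,
  family = "poisson",
  link = "log")
// Each LabeledPoint carries a dense feature vector and a noisy response drawn from the chosen family.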
Example 32
Source File: MLPClassifier.scala From aardpfark with Apache License 2.0 | 5 votes |
package com.ibm.aardpfark.spark.ml.classification import scala.collection.mutable.ArrayBuffer import com.ibm.aardpfark.pfa.document.{Cell, PFABuilder, PFADocument} import com.ibm.aardpfark.pfa.dsl._ import com.ibm.aardpfark.pfa.expression._ import com.ibm.aardpfark.pfa.types.WithSchema import com.ibm.aardpfark.spark.ml.PFAPredictionModel import breeze.linalg.{DenseMatrix, DenseVector} import com.sksamuel.avro4s.{AvroNamespace, AvroSchema} import org.apache.avro.Schema import org.apache.spark.ml.classification.MultilayerPerceptronClassificationModel @AvroNamespace("com.ibm.aardpfark.exec.spark.ml.classification") case class Layer(weights: Array[Array[Double]], bias: Array[Double]) @AvroNamespace("com.ibm.aardpfark.exec.spark.ml.classification") case class Layers(layers: Seq[Layer]) extends WithSchema { override def schema: Schema = AvroSchema[this.type] } class PFAMultilayerPerceptronClassificationModel( override val sparkTransformer: MultilayerPerceptronClassificationModel) extends PFAPredictionModel[Layers] { private def getLayers = { val weights = sparkTransformer.weights.toArray val inputLayers = sparkTransformer.layers val layers = ArrayBuffer[Layer]() var offset = 0 for (i <- 0 to inputLayers.size - 2) { val in = inputLayers(i) val out = inputLayers(i + 1) val wOffset = out * in val wData = weights.slice(offset, offset + wOffset) val bData = weights.slice(offset + wOffset, offset + wOffset + out) val w = Array.ofDim[Double](out, in) new DenseMatrix[Double](out, in, wData).foreachPair { case ((ii, jj), v) => w(ii)(jj) = v } val b = new DenseVector[Double](bData).toArray layers += Layer(w, b) offset += wOffset + out } layers.toArray } override protected def cell = Cell(Layers(getLayers)) private val doubleSigmoid = NamedFunctionDef("doubleSigmoid", FunctionDef[Double, Double]( "x", m.link.logit("x") )) override def action: PFAExpression = { val forward = model.neural.simpleLayers(inputExpr, modelCell.ref("layers"), doubleSigmoid.ref) val softmax = m.link.softmax(forward) NewRecord(outputSchema, Map(predictionCol -> a.argmax(softmax))) } override def pfa: PFADocument = { PFABuilder() .withName(sparkTransformer.uid) .withMetadata(getMetadata) .withInput(inputSchema) .withOutput(outputSchema) .withCell(modelCell) .withFunction(doubleSigmoid) .withAction(action) .pfa } }
Example 33
Source File: TensorCommons.scala From Clustering4Ever with Apache License 2.0 | 5 votes |
package org.clustering4ever.scala.clustering.tensor

// Imports added for completeness; they are elided in the extracted excerpt.
// SNumeric is Clustering4Ever's own numeric type class (its import path is not shown here).
import scala.collection.mutable.ArrayBuffer
import scala.reflect.ClassTag

import breeze.linalg.{DenseMatrix, DenseVector}

object TensorCommons {

  // Indices of the top-k largest elements of the vector
  def obtainTopkIndices[@specialized(Int, Double) N](vector: DenseVector[N], k: Int)(implicit num: SNumeric[N], ev: ClassTag[N]): Array[Int] =
    vector.toArray.zipWithIndex.sortWith( (x, y) => num.gt(x._1, y._1) ).take(k).map(_._2)

  // Represent the data as a tensor (a buffer of frontal slices).
  def dataToTensor(data: Array[Array[Double]], n1: Int, n2: Int, n3: Int): ArrayBuffer[DenseMatrix[Double]] = {
    val r = data.length
    val c = data.head.length

    def datatomatrix(buf: Array[Array[Double]], m: DenseMatrix[Double]): DenseMatrix[Double] = {
      (0 until r).foreach{ i =>
        (0 until c).foreach{ j =>
          m(i, j) = data(i)(j)
        }
      }
      m
    }

    var dm = datatomatrix(data, DenseMatrix.zeros[Double](r, c))

    // Split the r x (n2 * n3) matrix into n3 slices of size a x b (could be made tail-recursive).
    def todm(ds: DenseMatrix[Double], t: ArrayBuffer[DenseMatrix[Double]], a: Int, b: Int, c: Int): ArrayBuffer[DenseMatrix[Double]] = {
      var h = 0
      (0 until c).foreach{ k =>
        var m = DenseMatrix.zeros[Double](a, b)
        (0 until a).foreach{ i =>
          (0 until b).foreach{ j =>
            h = j + (k * b)
            m(i, j) = ds(i, h)
          }
        }
        t += m
      }
      t
    }

    val tens = ArrayBuffer.empty[DenseMatrix[Double]]
    val tensor = todm(dm, tens, n1, n2, n3)
    tensor
  }
}
Example 34
Source File: TestXOR.scala From deepspark with GNU General Public License v2.0 | 5 votes |
import breeze.linalg.DenseVector import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.{BasicLayer, VectorRBFLayer} import com.github.nearbydelta.deepspark.network.SimpleNetwork import com.github.nearbydelta.deepspark.train.{TrainerBuilder, TrainingParam} import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} object TestXOR { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local[5]").setAppName("TestXOR") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.broadcast.blockSize", "40960") .set("spark.akka.frameSize", "50") val sc = new SparkContext(conf) val data = (0 to 100).collect { case i if i > 75 || i < 25 ⇒ (0 to 100).collect { case j if j > 75 || j < 25 ⇒ val xor = if (i > 75 && j < 25) true else if (i < 25 && j > 75) true else false (DenseVector[Double](i / 100.0, j / 100.0), xor) } }.flatMap(x ⇒ x) val train = sc.makeRDD(data) val test = train try { Weight.scalingDownBy(10.0) val builder = new AdaGrad(l2decay = 0.001, rate = 0.01) val rbf = new VectorRBFLayer withActivation GaussianRBF withCenters Seq(DenseVector(1.0, 1.0), DenseVector(0.0, 0.0), DenseVector(1.0, 0.0), DenseVector(0.0, 1.0)) val network = new SimpleNetwork[Boolean]() // .add(new BasicLayer withInput 2 withOutput 4) .add(rbf) // .add(new BasicLayer withActivation LeakyReLU withOutput 4) .add(new BasicLayer withActivation SoftmaxCEE withOutput 2) .initiateBy(builder) println(rbf.epsilon.value) require(network.NOut == 2) // require(network.layers.head.asInstanceOf[BasicLayer].bias != null) // require(network.layers.head.asInstanceOf[BasicLayer].weight.value != null) // require(network.layers.head.asInstanceOf[BasicLayer].bias.value.length > 0) val trained = new TrainerBuilder(TrainingParam(miniBatch = 10, maxIter = 100, dataOnLocal = true, reuseSaveData = true, storageLevel = StorageLevel.MEMORY_ONLY)) .build(network, train, test, CrossEntropyErr, (x: Boolean) ⇒ if (x) DenseVector(1.0, 0.0) else DenseVector(0.0, 1.0), "XORTest") .getTrainedNetwork println(rbf.epsilon.value) (0 until 10).foreach { _ ⇒ val (in, exp) = data(Math.floor(Math.random() * data.length).toInt) val out = trained.predictSoft(in) println(s"IN : $in, EXPECTED: $exp, OUTPUT ${out(0) > out(1)} $out") } } finally { sc.stop() } } }
Example 35
Source File: TestConcat.scala From deepspark with GNU General Public License v2.0 | 5 votes |
import breeze.linalg.DenseVector import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.{BasicLayer, NetworkConcatLayer} import com.github.nearbydelta.deepspark.network.{GeneralNetwork, SimpleNetwork} import com.github.nearbydelta.deepspark.train.{TrainerBuilder, TrainingParam} import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} import scala.reflect.{ClassTag, classTag} object TestConcat { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local[5]").setAppName("TestXOR") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.broadcast.blockSize", "40960") .set("spark.akka.frameSize", "50") val sc = new SparkContext(conf) val data = (0 to 10).collect { case i if i > 7 || i < 3 ⇒ (0 to 10).collect { case j if j > 7 || j < 3 ⇒ val xor = if (i > 7 && j > 7) true else if (i < 3 && j < 3) true else false (0 to 10).collect { case k if k > 7 || k < 3 ⇒ (0 to 10).collect { case l if l > 7 || l < 3 ⇒ val xor2 = if (i > 7 && j > 7) true else if (i < 3 && j < 3) true else false (Array(DenseVector(i / 10.0, j / 10.0), DenseVector(k / 10.0, l / 10.0)), xor && xor2) } }.flatMap(x ⇒ x) }.flatMap(x ⇒ x) }.flatMap(x ⇒ x) val train = sc.makeRDD(data) val test = train try { val builder = new AdaGrad(l2decay = 0.00001, rate = 0.01) val input1 = new SimpleNetwork[DataVec]() .add(new BasicLayer withInput 2 withOutput 4) .add(new BasicLayer withInput 4 withOutput 1) val input2 = new SimpleNetwork[DataVec]() .add(new BasicLayer withInput 2 withOutput 4) .add(new BasicLayer withInput 4 withOutput 1) val concat = new ConcatLayer().addNetwork(input1).addNetwork(input2) val network = new GeneralNetwork[Array[DataVec], Boolean](concat) .add(new BasicLayer withInput 2 withOutput 4) .add(new BasicLayer withInput 4 withOutput 1) .initiateBy(builder) require(network.NOut == 1) val trained = new TrainerBuilder(TrainingParam(miniBatch = 10, maxIter = 1000, storageLevel = StorageLevel.MEMORY_ONLY)) .build(network, train, test, SquaredErr, (x: Boolean) ⇒ if (x) DenseVector(1.0) else DenseVector(0.0), "XORTest") .getTrainedNetwork (0 until 10).foreach { _ ⇒ val (in, exp) = data(Math.floor(Math.random() * data.length).toInt) val out = trained.predictSoft(in) println(s"IN : $in, EXPECTED: $exp, OUTPUT $out") } } finally { sc.stop() } } class ConcatLayer extends NetworkConcatLayer[DataVec] { override implicit protected val evidenceI: ClassTag[Array[DataVec]] = classTag[Array[DataVec]] } }
Example 36
Source File: TestSpeed.scala From deepspark with GNU General Public License v2.0 | 5 votes |
import breeze.linalg.DenseVector import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.layer.{BasicLayer, VectorRBFLayer} import com.github.nearbydelta.deepspark.network.SimpleNetwork import com.github.nearbydelta.deepspark.train.{TrainerBuilder, TrainingParam} import org.apache.spark.storage.StorageLevel import org.apache.spark.{SparkConf, SparkContext} object TestSpeed { def main(args: Array[String]) { val conf = new SparkConf().setMaster("local[5]").setAppName("TestXOR") .set("spark.serializer", "org.apache.spark.serializer.KryoSerializer") .set("spark.broadcast.blockSize", "40960") .set("spark.akka.frameSize", "50") val sc = new SparkContext(conf) val data = (0 to 100).collect { case i if i > 75 || i < 25 ⇒ (0 to 100).collect { case j if j > 75 || j < 25 ⇒ val xor = if (i > 75 && j < 25) true else if (i < 25 && j > 75) true else false (DenseVector[Double](i / 100.0, j / 100.0), xor) } }.flatMap(x ⇒ x) val train = sc.makeRDD(data) val test = train try { Weight.scalingDownBy(10.0) val builder = new AdaGrad(l2decay = 0.001, rate = 0.01) val rbf = new VectorRBFLayer withActivation GaussianRBF withCenters Seq(DenseVector(1.0, 1.0), DenseVector(0.0, 0.0), DenseVector(1.0, 0.0), DenseVector(0.0, 1.0)) val network = new SimpleNetwork[Boolean]() // .add(new BasicLayer withInput 2 withOutput 4) .add(rbf) // .add(new BasicLayer withActivation LeakyReLU withOutput 4) .add(new BasicLayer withActivation SoftmaxCEE withOutput 2) .initiateBy(builder) println(rbf.epsilon.value) require(network.NOut == 2) // require(network.layers.head.asInstanceOf[BasicLayer].bias != null) // require(network.layers.head.asInstanceOf[BasicLayer].weight.value != null) // require(network.layers.head.asInstanceOf[BasicLayer].bias.value.length > 0) val trained = new TrainerBuilder(TrainingParam(miniBatch = 10, maxIter = 100, dataOnLocal = true, reuseSaveData = true, storageLevel = StorageLevel.MEMORY_ONLY)) .build(network, train, test, CrossEntropyErr, (x: Boolean) ⇒ if (x) DenseVector(1.0, 0.0) else DenseVector(0.0, 1.0), "XORTest") .getTrainedNetwork println(rbf.epsilon.value) (0 until 10).foreach { _ ⇒ val (in, exp) = data(Math.floor(Math.random() * data.length).toInt) val out = trained.predictSoft(in) println(s"IN : $in, EXPECTED: $exp, OUTPUT ${out(0) > out(1)} $out") } } finally { sc.stop() } } }
Example 37
Source File: min-ppl-examples.scala From blog with Apache License 2.0 | 5 votes |
object MinPplExamples2 { import MinPpl2._ import breeze.stats.{meanAndVariance => meanVar} import breeze.linalg.DenseVector import cats._ import cats.implicits._ import cats.syntax._ // Zip vs flatMap def example1 = { println("binding with for") val prior1 = for { x <- Normal(0,1) y <- Gamma(1,1) z <- Poisson(10) } yield (x,y,z) println(meanVar(prior1.empirical.map(_._2))) println("binding with flatMap") val prior2 = Normal(0,1) flatMap {x => Gamma(1,1) flatMap {y => Poisson(10) map {z => (x,y,z)}}} println(meanVar(prior2.empirical.map(_._2))) println("tupling") val prior3 = Applicative[Prob].tuple3(Normal(0,1), Gamma(1,1), Poisson(10)) println(meanVar(prior3.empirical.map(_._2))) print("done") } // Poisson DGLM def example2 = { val data = List(2,1,0,2,3,4,5,4,3,2,1) val prior = for { w <- Gamma(1, 1) state0 <- Normal(0.0, 2.0) } yield (w, List(state0)) def addTimePointSimple(current: Prob[(Double, List[Double])], obs: Int): Prob[(Double, List[Double])] = { println(s"Conditioning on observation: $obs") val updated = for { tup <- current (w, states) = tup os = states.head ns <- Normal(os, w) _ <- Poisson(math.exp(ns)).fitQ(obs) } yield (w, ns :: states) updated.resample } def addTimePoint(current: Prob[(Double, List[Double])], obs: Int): Prob[(Double, List[Double])] = { println(s"Conditioning on observation: $obs") val predict = for { tup <- current (w, states) = tup os = states.head ns <- Normal(os, w) } yield (w, ns :: states) val updated = for { tup <- predict (w, states) = tup st = states.head _ <- Poisson(math.exp(st)).fitQ(obs) } yield (w, states) updated.resample } val mod = data.foldLeft(prior)(addTimePoint(_,_)).empirical print("w : ") println(meanVar(mod map (_._1))) print("s0 : ") println(meanVar(mod map (_._2.reverse.head))) print("sN : ") println(meanVar(mod map (_._2.head))) } // Main entry point def main(args: Array[String]): Unit = { println("Hi") //example1 example2 println("Bye") } } // eof
Example 38
Source File: AverageLedger.scala From deepspark with GNU General Public License v2.0 | 5 votes |
package com.github.nearbydelta.deepspark.word.layer import breeze.linalg.{DenseVector, axpy} import com.github.nearbydelta.deepspark.data._ import com.github.nearbydelta.deepspark.word.{LedgerBuilder, LedgerModel} import scala.collection.parallel.ParSeq class AverageLedger extends Ledger[DataVec] { override val outVecOf: (DataVec) ⇒ DataVec = x ⇒ x override def apply(x: Array[Int]): DataVec = { if (x.nonEmpty) { val matrix = DenseVector.zeros[Double](NOut) val it = x.toIterator val factor = 1.0 / x.length while (it.hasNext) { axpy(factor, vectorOf(it.next()), matrix) } matrix } else pad } override def backprop(seq: ParSeq[((Array[Int], DataVec), DataVec)]): (ParSeq[DataVec], ParSeq[() ⇒ Unit]) = { seq.foreach { case ((in, _), err) ⇒ if (in.nonEmpty) { err :/= in.length.toDouble val it = in.iterator while (it.hasNext) { updateWord(it.next(), err) } } else updateWord(padID, err) } (null, ParSeq(algorithm.update)) } override def withModel(model: LedgerModel, builder: LedgerBuilder): this.type = { NOut = model.dimension super.withModel(model, builder) } }
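The averaging in apply above relies on Breeze's in-place axpy. A standalone illustration of that call, independent of deepspark:

import breeze.linalg.{DenseVector, axpy}

val acc = DenseVector.zeros[Double](3)
val factor = 1.0 / 2                              // two "word vectors" to average
axpy(factor, DenseVector(1.0, 2.0, 3.0), acc)     // acc += factor * v1
axpy(factor, DenseVector(3.0, 2.0, 1.0), acc)     // acc += factor * v2
// acc == DenseVector(2.0, 2.0, 2.0)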
Example 39
Source File: BatchModeTest.scala From neuroflow with Apache License 2.0 | 5 votes |
import org.scalatest.FunSuite import breeze.linalg.DenseVector import neuroflow.core.Activators.Double._ import neuroflow.core._ import neuroflow.dsl._ class BatchModeTest extends FunSuite { test("Batch Mode for Dense Net CPU") { import neuroflow.nets.cpu.DenseNetwork._ implicit val weights = WeightBreeder[Double].random(-1, 1) val f = Sigmoid val net = Network(layout = Vector(2) :: Dense(3, f) :: Dense(10, f) :: SquaredError()) val batch = (1 to 100).map { _ => DenseVector.rand[Double](size = 2) } val res = net.batchApply(batch) assert(res.size == batch.size) } test("Batch Mode for Conv Net CPU") { import neuroflow.nets.cpu.ConvNetwork._ implicit val weights = WeightBreeder[Double].random(-1, 1) val f = Sigmoid val net = Network(layout = Convolution((1, 2, 1), (0, 0), (1, 2), (1, 1), 3, f) :: Dense(10, f) :: SquaredError() ) val batch = (1 to 100).map { _ => Tensor3D.fromVector(DenseVector.rand[Double](size = 2)) } val res = net.batchApply(batch) assert(res.size == batch.size) } test("Batch Mode for Dense Net GPU") { import neuroflow.nets.gpu.DenseNetwork._ implicit val weights = WeightBreeder[Double].random(-1, 1) val f = Sigmoid val net = Network(layout = Vector(2) :: Dense(3, f) :: Dense(10, f) :: SquaredError()) val batch = (1 to 100).map { _ => DenseVector.rand[Double](size = 2) } val res = net.batchApply(batch) assert(res.size == batch.size) } test("Batch Mode for Conv Net GPU") { import neuroflow.nets.gpu.ConvNetwork._ implicit val weights = WeightBreeder[Double].random(-1, 1) val f = Sigmoid val net = Network(layout = Convolution((1, 2, 1), (0, 0), (1, 2), (1, 1), 3, f) :: Dense(10, f) :: SquaredError() ) val batch = (1 to 100).map { _ => Tensor3D.fromVector(DenseVector.rand[Double](size = 2)) } val res = net.batchApply(batch) assert(res.size == batch.size) } }
Example 40
Source File: BatchBreeder.scala From neuroflow with Apache License 2.0 | 5 votes |
package neuroflow.core

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.storage.Zero
import neuroflow.common.Logs

import scala.reflect.ClassTag

// NOTE: the enclosing object declaration (which mixes in Logs and thereby provides the
// debug(...) call below) was dropped when this excerpt was extracted; only its batching
// helpers are shown.

  def breedCNN[V: ClassTag : Zero](xs: Seq[Tensor3D[V]], ys: Seq[DenseVector[V]],
                                   batchSize: Int): (Seq[(DenseMatrix[V], DenseMatrix[V])], Map[Int, Int]) = {
    val xsys = xs.zip(ys).grouped(batchSize).zipWithIndex.toSeq.par.map { case (xy, batchNo) =>
      val x = horzCatTensorBatch(xy.map(_._1))
      val y = DenseMatrix.zeros[V](xy.size, xy.head._2.length)
      (0 until y.rows).foreach { row =>
        (0 until y.cols).foreach { col =>
          y.update(row, col, xy(row)._2(col))
        }
      }
      debug(s"Bred Batch $batchNo.")
      (x -> y) -> xy.size
    }.seq
    xsys.map(_._1) -> xsys.zipWithIndex.map(b => b._2 -> b._1._2).toMap
  }

  def vertCatVectorBatch[V: ClassTag : Zero](xs: Seq[DenseVector[V]]): DenseMatrix[V] = {
    val x = DenseMatrix.zeros[V](xs.size, xs.head.length)
    (0 until x.rows).foreach { row =>
      (0 until x.cols).foreach { col =>
        x.update(row, col, xs(row)(col))
      }
    }
    x
  }

  def horzCatTensorBatch[V: ClassTag : Zero](ts: Seq[Tensor3D[V]]): DenseMatrix[V] = {
    val x = DenseMatrix.zeros[V](ts.head.matrix.rows, ts.head.matrix.cols * ts.size)
    (0 until x.rows).foreach { row =>
      (0 until x.cols).foreach { col =>
        val b = col / ts.head.matrix.cols
        val c = col % ts.head.matrix.cols
        x.update(row, col, ts(b).matrix(row, c))
      }
    }
    x
  }

  def unsliceMatrixByRow[V: ClassTag : Zero](m: DenseMatrix[V]): Seq[DenseVector[V]] = {
    (0 until m.rows).map { r =>
      val v = m(r, ::).t
      v
    }
  }

}
Example 41
Source File: Tensor.scala From neuroflow with Apache License 2.0 | 5 votes |
package neuroflow.core

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.math.Semiring
import breeze.storage.Zero

import scala.reflect.ClassTag

// NOTE: this excerpt starts inside an elided companion object; the Tensor3D trait itself
// (which defines the projection and apply members used by Tensor3DImpl below) is not shown here.

  def deepCat[V: ClassTag : Zero](ts: Seq[Tensor3D[V]]): Tensor3D[V] = {
    val x = ts.head.X
    val y = ts.head.Y
    val z = ts.map(_.Z).sum
    require(ts.forall(t => t.X == x && t.Y == y), "All tensors must share same dimension X, Y!")
    val mergedMat = ts.map(_.matrix).reduce((a, b) => DenseMatrix.vertcat(a, b))
    new Tensor3DImpl[V](mergedMat, X = x, Y = y, Z = z)
  }

}

class Tensor3DImpl[V](val matrix: DenseMatrix[V], val X: Int, val Y: Int, val Z: Int) extends Tensor3D[V] {

  def mapAll[T: ClassTag : Zero](f: V => T): Tensor3D[T] = {
    new Tensor3DImpl(matrix.map(f), X, Y, Z)
  }

  def mapAt(x: (Int, Int, Int))(f: V => V): Tensor3D[V] = {
    val newMat = matrix.copy
    val (row, col) = projection(x._1, x._2, x._3)
    newMat.update(row, col, f(apply(x)))
    new Tensor3DImpl(newMat, X, Y, Z)
  }

  def updateAt(x: (Int, Int, Int))(v: V): Unit = {
    val (row, col) = projection(x._1, x._2, x._3)
    matrix.update(row, col, v)
  }
}
Example 42
Source File: ActiveShapeModelIOTests.scala From scalismo with Apache License 2.0 | 5 votes |
package scalismo.io import java.io.File import java.net.URLDecoder import breeze.linalg.{DenseMatrix, DenseVector} import scalismo.ScalismoTestSuite import scalismo.numerics.FixedPointsUniformMeshSampler3D import scalismo.statisticalmodel.MultivariateNormalDistribution import scalismo.statisticalmodel.asm._ import scalismo.utils.Random import scala.collection.immutable class ActiveShapeModelIOTests extends ScalismoTestSuite { implicit val rng = Random(42L) private def createTmpH5File(): File = { val f = File.createTempFile("hdf5file", ".h5") f.deleteOnExit() f } private def createAsm(): ActiveShapeModel = { val statismoFile = new File(URLDecoder.decode(getClass.getResource("/facemodel.h5").getPath, "UTF-8")) val shapeModel = StatismoIO.readStatismoMeshModel(statismoFile).get val (sprofilePoints, _) = new FixedPointsUniformMeshSampler3D(shapeModel.referenceMesh, 100).sample.unzip val pointIds = sprofilePoints.map { point => shapeModel.referenceMesh.pointSet.findClosestPoint(point).id } val dists = for (i <- pointIds.indices) yield new MultivariateNormalDistribution(DenseVector.ones[Double](3) * i.toDouble, DenseMatrix.eye[Double](3) * i.toDouble) val profiles = new Profiles(pointIds.to[immutable.IndexedSeq].zip(dists).map { case (i, d) => Profile(i, d) }) new ActiveShapeModel(shapeModel, profiles, GaussianGradientImagePreprocessor(1), NormalDirectionFeatureExtractor(1, 1)) } describe("An active shape model") { it("can be written to disk and read again") { val originalAsm = createAsm() val h5file = createTmpH5File() ActiveShapeModelIO.writeActiveShapeModel(originalAsm, h5file).get val newAsm = ActiveShapeModelIO.readActiveShapeModel(h5file).get newAsm should equal(originalAsm) h5file.delete() } } }
Example 43
Source File: LandmarkIOTests.scala From scalismo with Apache License 2.0 | 5 votes |
package scalismo.io import java.io.{ByteArrayOutputStream, File, InputStream} import java.net.URLDecoder import breeze.linalg.DenseVector import scalismo.ScalismoTestSuite import scalismo.geometry._ import scalismo.statisticalmodel.MultivariateNormalDistribution import scala.io.Source import scala.language.implicitConversions import scala.collection.immutable.Seq class LandmarkIOTests extends ScalismoTestSuite { implicit def doubleToFloat(d: Double): Float = d.toFloat implicit def inputStreamToSource(s: InputStream): Source = Source.fromInputStream(s) describe("Spray LandmarkIO") { val csvName = "/landmarks.csv" def csvStream() = getClass.getResourceAsStream(csvName) val jsonName = "/landmarks.json" def jsonStream() = getClass.getResourceAsStream(jsonName) def distWithDefaultVectors(d1: Double, d2: Double, d3: Double): MultivariateNormalDistribution = { val axes = List(DenseVector[Double](1, 0, 0), DenseVector[Double](0, 1, 0), DenseVector[Double](0, 0, 1)) val devs = List(d1, d2, d3) val data = axes zip devs MultivariateNormalDistribution(DenseVector[Double](0, 0, 0), data) } val jsonLm1 = Landmark("one", Point(1, 2, 3)) val jsonLm2 = Landmark("two", Point(2, 3, 4), Some("Landmark two"), Some(distWithDefaultVectors(1, 4, 9))) val jsonLms = List(jsonLm1, jsonLm2) it("can serialize and deserialize simple landmarks using JSON") { val out = new ByteArrayOutputStream() LandmarkIO.writeLandmarksJsonToStream(jsonLms, out) val written = new String(out.toByteArray) val read = LandmarkIO.readLandmarksJsonFromSource[_3D](Source.fromString(written)).get read should equal(jsonLms) } it("can read simple landmarks from a JSON Stream") { val read = LandmarkIO.readLandmarksJsonFromSource[_3D](jsonStream()).get read should equal(jsonLms) } } }
Example 44
Source File: ImageTests.scala From scalismo with Apache License 2.0 | 5 votes |
package scalismo.image import breeze.linalg.DenseVector import scalismo.ScalismoTestSuite import scalismo.common.{BoxDomain, PointId, Scalar, ScalarArray} import scalismo.geometry.IntVector.implicits._ import scalismo.geometry.Point.implicits._ import scalismo.geometry.EuclideanVector.implicits._ import scalismo.geometry._ import scalismo.registration.TranslationSpace import scala.language.implicitConversions import scala.reflect.ClassTag class ImageTests extends ScalismoTestSuite { implicit def arrayToScalarArray[A: Scalar: ClassTag](a: Array[A]): ScalarArray[A] = ScalarArray(a) describe("A discrete 1D image") { it("returns the same points for a 1d index and a coordinate index") { val domain = DiscreteImageDomain[_1D](0.0, 1.0, 5) val discreteImage = DiscreteScalarImage(domain, Seq(3.0, 2.0, 1.5, 1, 0)) for (i <- 0 until domain.size(0)) { assert(discreteImage(i) == discreteImage(i)) } } } describe("A discrete 2D image") { it("returns the same points for a 1d index and a (2d) coordinate index") { val domain = DiscreteImageDomain[_2D]((0.0, 0.0), (1.0, 2.0), (3, 2)) val discreteImage = DiscreteScalarImage(domain, Seq(3.0, 2.0, 1.5, 1.0, 0.0, 4.0)) for (y <- 0 until domain.size(1); x <- 0 until domain.size(0)) { assert(discreteImage(PointId(y * domain.size(0) + x)) === discreteImage((x, y))) } } } describe("A continuous 1D image") { it("yields the right values after composing with a translation") { val image = DifferentiableScalarImage(BoxDomain(-4.0, 6.0), (x: Point[_1D]) => Math.sin(x(0).toDouble).toFloat, (x: Point[_1D]) => EuclideanVector(Math.cos(x(0).toDouble).toFloat)) val translationTransform = TranslationSpace[_1D].transformForParameters(DenseVector(1.0)) val composedImage = image.compose(translationTransform) assert(composedImage.isDefinedAt(-4.0) === true) assert(composedImage.isDefinedAt(5.0) === true) assert(composedImage.isDefinedAt(-4.5) === true) assert(composedImage.isDefinedAt(5.5) === false) composedImage(0.0) should be(image(1.0) +- 1e-5f) } it("yields the right values after warping with a translation") { val image = DifferentiableScalarImage(BoxDomain(-4.0, 6.0), (x: Point[_1D]) => Math.sin(x(0).toDouble).toFloat, (x: Point[_1D]) => EuclideanVector(Math.cos(x(0).toDouble).toFloat)) val translationTransform = TranslationSpace[_1D].transformForParameters(DenseVector(-1.0)) val warpedImage = image.compose(translationTransform) warpedImage.isDefinedAt(-4.0) should equal(false) warpedImage.isDefinedAt(-3.0) should equal(true) warpedImage.isDefinedAt(5.0) should equal(true) warpedImage.isDefinedAt(-3.5) should equal(false) warpedImage.isDefinedAt(5.5) should equal(true) warpedImage.isDefinedAt(6.5) should equal(true) warpedImage.isDefinedAt(7.0) should equal(true) warpedImage(0.0) should be(image(-1.0) +- 1e-5f) } } describe("A continuous 2D image") { it("can be translated to a new place") { val cImg = ScalarImage(BoxDomain((0.0, 0.0), (1.0, 1.0)), (_: Point[_2D]) => 1f) def t = TranslationSpace[_2D].transformForParameters(DenseVector(2.0, 2.0)) val warpedImg = cImg.compose(t) warpedImg.isDefinedAt((-0.5, -0.5)) should equal(false) warpedImg.isDefinedAt((-2.5, -2.5)) should equal(false) warpedImg.isDefinedAt((-1.5, -1.5)) should equal(true) warpedImg((-1.5, -1.5)) should be(1.0) } } }
Example 45
Source File: MeshTests.scala From scalismo with Apache License 2.0 | 5 votes |
package scalismo.mesh import java.io.File import java.net.URLDecoder import breeze.linalg.DenseVector import scalismo.ScalismoTestSuite import scalismo.common.{PointId, UnstructuredPointsDomain} import scalismo.geometry.Point.implicits._ import scalismo.geometry.{_3D, Point} import scalismo.io.MeshIO import scalismo.registration.{RotationSpace, ScalingSpace} import scala.language.implicitConversions class MeshTests extends ScalismoTestSuite { implicit def doubleToFloat(d: Double): Float = d.toFloat implicit def intToPointId(i: Int): PointId = PointId(i) describe("a mesh") { val path = getClass.getResource("/facemesh.stl").getPath val facemesh = MeshIO.readMesh(new File(URLDecoder.decode(path, "UTF-8"))).get it("finds the right closest points for all the points that define the mesh") { for ((pt, id) <- facemesh.pointSet.points.zipWithIndex) { val ptWithID = facemesh.pointSet.findClosestPoint(pt) val closestPt = ptWithID.point val closestId = ptWithID.id assert(closestPt === pt) assert(closestId.id === id) } } it("finds the right closest point for a point that is not defined on the mesh") { val pts = IndexedSeq(Point(0.0, 0.0, 0.0), Point(1.0, 1.0, 1.0), Point(1.0, 1.0, 5.0)) val cells = IndexedSeq(TriangleCell(0, 1, 2)) val mesh = TriangleMesh3D(UnstructuredPointsDomain(pts), TriangleList(cells)) val newPt = Point(1.1, 1.1, 4) val ptWithID = mesh.pointSet.findClosestPoint(newPt) val closestPt = ptWithID.point val closestPtId = ptWithID.id assert(closestPtId.id === 2) assert(closestPt === pts(2)) } it("computes its area correctly for a triangle") { val pts: IndexedSeq[Point[_3D]] = IndexedSeq((0.0, 0.0, 0.0), (0.0, 1.0, 0.0), (1.0, 0.0, 0.0)) val cells = IndexedSeq(TriangleCell(0, 1, 2)) val mesh = TriangleMesh3D(UnstructuredPointsDomain(pts), TriangleList(cells)) val R = RotationSpace[_3D]((0.0, 0.0, 0.0)).transformForParameters(DenseVector(0.3, 0.4, 0.1)) val s = ScalingSpace[_3D].transformForParameters(DenseVector(2.0)) val transformedMesh = mesh.transform(R).transform(s) mesh.area should be(0.5 +- 1e-8) transformedMesh.area should be(4.0f * mesh.area +- 1e-5) // scaling by two gives 4 times the area } it("computes the right binary image for the unit sphere") { val path = getClass.getResource("/unit-sphere.stl").getPath val spheremesh = MeshIO.readMesh(new File(URLDecoder.decode(path, "UTF-8"))).get val binaryImg = spheremesh.operations.toBinaryImage binaryImg(Point(0, 0, 0)) should be(1) binaryImg(Point(2, 0, 0)) should be(0) } it("can have an empty cell list") { val pts = IndexedSeq(Point(0.0, 0.0, 0.0), Point(1.0, 1.0, 1.0), Point(1.0, 1.0, 5.0)) val cells = IndexedSeq[TriangleCell]() try { TriangleMesh3D(UnstructuredPointsDomain(pts), TriangleList(cells)) // would throw exception on fail } catch { case e: Exception => fail("It should be possible to create triangleMesh with an empty cell list") } } } }
Example 46
Source File: PivotedCholeskyTest.scala From scalismo with Apache License 2.0 | 5 votes |
package scalismo.numerics import breeze.linalg.DenseVector import scalismo.ScalismoTestSuite import scalismo.common.BoxDomain3D import scalismo.geometry.{_1D, _3D, Point} import scalismo.kernels.{DiagonalKernel, GaussianKernel, Kernel} import scalismo.utils.Random class PivotedCholeskyTest extends ScalismoTestSuite { implicit val rng = Random(42L) describe("The Pivoted Cholesky ") { it("accurately approximates a covariance matrix from a random set of points and a kernel k in 1D") { val pts = DenseVector.rand[Double](60).toArray.map(v => Point(v.toFloat)) val k = GaussianKernel[_1D](1.0) val matrixValuedK = DiagonalKernel[_1D](k, 1) val m = Kernel.computeKernelMatrix[_1D](pts, matrixValuedK) val eigCholesky = PivotedCholesky.computeApproximateEig(matrixValuedK, pts, PivotedCholesky.RelativeTolerance(1e-15)) val (u, d) = eigCholesky val D = (u * breeze.linalg.diag(d) * u.t) - m Math.sqrt(breeze.linalg.trace(D * D.t)) should be <= 1e-5 } it("accurately approximates a covariance matrix from a random set of points and a kernel k in 3D") { val boxDomain = BoxDomain3D(Point(0.0, 0.0, 0.0), Point(1.0, 1.0, 1.0)) val uniformSampler = UniformSampler[_3D](boxDomain, 20) val pts = uniformSampler.sample.map(_._1) val k = GaussianKernel[_3D](1.0) val matrixValuedK = DiagonalKernel[_3D](k, 3) val m = Kernel.computeKernelMatrix[_3D](pts, matrixValuedK) val eigCholesky = PivotedCholesky.computeApproximateEig(matrixValuedK, pts, PivotedCholesky.RelativeTolerance(1e-15)) val (u, d) = eigCholesky val D = (u * breeze.linalg.diag(d) * u.t) - m Math.sqrt(breeze.linalg.trace(D * D.t)) should be <= 1e-5 } } }
Example 47
Source File: ActiveShapeModelTests.scala From scalismo with Apache License 2.0 | 5 votes |
package scalismo.statisticalmodel import java.io.File import java.net.URLDecoder import breeze.linalg.DenseVector import scalismo.ScalismoTestSuite import scalismo.geometry.{_3D, Point} import scalismo.io.{ImageIO, MeshIO, StatismoIO} import scalismo.mesh.{MeshMetrics, TriangleMesh} import scalismo.numerics.{Sampler, UniformMeshSampler3D} import scalismo.registration.LandmarkRegistration import scalismo.statisticalmodel.asm._ import scalismo.statisticalmodel.dataset.DataCollection import scalismo.utils.Random class ActiveShapeModelTests extends ScalismoTestSuite { describe("An active shape model") { implicit val random = Random(42) object Fixture { val imagePreprocessor = GaussianGradientImagePreprocessor(0.1f) // number of points should usually be an odd number, so that the profiles are centered on the profiled points val featureExtractor = NormalDirectionFeatureExtractor(numberOfPoints = 5, spacing = 1.0) def samplerPerMesh(mesh: TriangleMesh[_3D]): Sampler[_3D] = UniformMeshSampler3D(mesh, numberOfPoints = 1000) val searchMethod = NormalDirectionSearchPointSampler(numberOfPoints = 31, searchDistance = 6) val fittingConfig = FittingConfiguration(featureDistanceThreshold = 2.0, pointDistanceThreshold = 3.0, modelCoefficientBounds = 3.0) val path: String = URLDecoder.decode(getClass.getResource(s"/asmData/model.h5").getPath, "UTF-8") val shapeModel = StatismoIO.readStatismoMeshModel(new File(path)).get val nbFiles = 7 // use iterators so files are only loaded when required (and memory can be reclaimed after use) val meshes = (0 until nbFiles).toIterator map { i => val meshPath: String = getClass.getResource(s"/asmData/$i.stl").getPath MeshIO.readMesh(new File(URLDecoder.decode(meshPath, "UTF-8"))).get } val images = (0 until nbFiles).toIterator map { i => val imgPath: String = getClass.getResource(s"/asmData/$i.vtk").getPath ImageIO.read3DScalarImage[Float](new File(URLDecoder.decode(imgPath, "UTF-8"))).get } val targetImage = images.next() val targetMesh = meshes.next() val trainMeshes = meshes val trainImages = images val dc = DataCollection.fromMeshSequence(shapeModel.referenceMesh, trainMeshes.toIndexedSeq)._1.get val trainingData = trainImages zip dc.dataItems.toIterator.map(_.transformation) val asm = ActiveShapeModel.trainModel(shapeModel, trainingData, imagePreprocessor, featureExtractor, samplerPerMesh) // align the model val alignment = LandmarkRegistration.rigid3DLandmarkRegistration( (asm.statisticalModel.mean.pointSet.points zip targetMesh.pointSet.points).toIndexedSeq, Point(0, 0, 0) ) val alignedASM = asm.transform(alignment) } it("Can be built, transformed and correctly fitted from/to artificial data") { val fit = Fixture.alignedASM.fit(Fixture.targetImage, Fixture.searchMethod, 20, Fixture.fittingConfig).get.mesh assert(MeshMetrics.diceCoefficient(fit, Fixture.targetMesh) > 0.94) } it("Can be transformed correctly from within the fitting") { val nullInitialParameters = DenseVector.zeros[Double](Fixture.asm.statisticalModel.rank) val fit = Fixture.asm .fit(Fixture.targetImage, Fixture.searchMethod, 20, Fixture.fittingConfig, ModelTransformations(nullInitialParameters, Fixture.alignment)) .get .mesh assert(MeshMetrics.diceCoefficient(fit, Fixture.targetMesh) > 0.95) } } }
Example 48
Source File: StatisticalVolumeModelTests.scala From scalismo with Apache License 2.0 | 5 votes |
package scalismo.statisticalmodel.experimental import java.io.File import java.net.URLDecoder import breeze.linalg.DenseVector import breeze.stats.distributions.Gaussian import scalismo.ScalismoTestSuite import scalismo.geometry.{_3D, Point} import scalismo.io.StatismoIO import scalismo.registration.{RigidTransformation, RigidTransformationSpace} import scalismo.utils.Random class StatisticalVolumeModelTests extends ScalismoTestSuite { implicit val random = Random(42) implicit def doubleToFloat(d: Double): Float = d.toFloat describe("A statistical Volume mesh model") { def compareModels(oldModel: StatisticalVolumeMeshModel, newModel: StatisticalVolumeMeshModel) { for (i <- 0 until 10) { val standardNormal = Gaussian(0, 1)(random.breezeRandBasis) val coeffsData = standardNormal.sample(oldModel.rank) val coeffs = DenseVector(coeffsData.toArray) val inst = oldModel.instance(coeffs) val instNew = newModel.instance(coeffs) inst.pointSet.points .zip(instNew.pointSet.points) .foreach { case (pt1, pt2) => (pt1.toVector - pt2.toVector).norm should be(0.0 +- (0.1)) } } } it("can be transformed forth and back and yield the same deformations") { val path = getClass.getResource("/TetraMeshModel2.h5").getPath val model = StatismoIO.readStatismoVolumeMeshModel(new File(URLDecoder.decode(path))).get val parameterVector = DenseVector[Double](1.5, 1.0, 3.5, Math.PI, -Math.PI / 2.0, -Math.PI) val rigidTransform = RigidTransformationSpace[_3D]().transformForParameters(parameterVector) val inverseTransform = rigidTransform.inverse.asInstanceOf[RigidTransformation[_3D]] val transformedModel = model.transform(rigidTransform) val newModel = transformedModel.transform(inverseTransform) compareModels(model, newModel) } it("can change the mean shape and still yield the same shape space") { val path = getClass.getResource("/TetraMeshModel2.h5").getPath val model = StatismoIO.readStatismoVolumeMeshModel(new File(URLDecoder.decode(path))).get val newMesh = model.sample def t(pt: Point[_3D]): Point[_3D] = { val ptId = model.referenceVolumeMesh.pointSet.findClosestPoint(pt).id newMesh.pointSet.point(ptId) } val newModel = model.changeReference(t) compareModels(model, newModel) } } }
Example 49
Source File: Kernel.scala From pravda-ml with Apache License 2.0 | 5 votes |
package com.linkedin.photon.ml.hyperparameter.estimators.kernels

import breeze.linalg.{DenseMatrix, DenseVector}

// NOTE: the enclosing kernel definition was dropped when this excerpt was extracted;
// only its length-scale helper is shown.

  def expandDimensions(param: DenseVector[Double], dim: Int): DenseVector[Double] = {
    require(param.length == 1 || param.length == dim,
      "Parameter must contain one global scale or a scale for each feature")

    if (param.length != dim) {
      DenseVector(Array.fill(dim)(param(0)))
    } else {
      param
    }
  }
}
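Assuming expandDimensions is in scope (e.g. inside the elided kernel definition), it broadcasts a single global length-scale to every feature dimension and leaves an already per-feature vector untouched:

val global = DenseVector(2.0)
expandDimensions(global, 3)                       // DenseVector(2.0, 2.0, 2.0)
expandDimensions(DenseVector(1.0, 0.5, 0.1), 3)   // already one scale per feature, returned unchanged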
Example 50
Source File: MeanValueImputer.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.impute import breeze.linalg.DenseVector import breeze.stats.mean import cats.syntax.option._ import io.picnicml.doddlemodel.data.Feature.FeatureIndex import io.picnicml.doddlemodel.data.{Features, RealVector} import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Transformer case class MeanValueImputer private (private[impute] val means: Option[RealVector], private val featureIndex: FeatureIndex) object MeanValueImputer { def apply(featureIndex: FeatureIndex): MeanValueImputer = MeanValueImputer(none, featureIndex) @SerialVersionUID(0L) implicit val ev: Transformer[MeanValueImputer] = new Transformer[MeanValueImputer] { override def isFitted(model: MeanValueImputer): Boolean = model.means.isDefined override def fit(model: MeanValueImputer, x: Features): MeanValueImputer = { val xToPreprocess = x(::, model.featureIndex.numerical.columnIndices) val means = DenseVector.zeros[Float](xToPreprocess.cols) 0 until xToPreprocess.cols foreach { colIndex => means(colIndex) = mean(xToPreprocess(xToPreprocess(::, colIndex).findAll(!_.isNaN), colIndex)) } model.copy(means.some) } override protected def transformSafe(model: MeanValueImputer, x: Features): Features = { val xCopy = x.copy model.featureIndex.numerical.columnIndices.zipWithIndex.foreach { case (colIndex, statisticIndex) => xCopy(::, colIndex).findAll(_.isNaN).iterator.foreach { rowIndex => xCopy(rowIndex, colIndex) = model.means.getOrBreak(statisticIndex) } } xCopy } } }
Example 51
Source File: MostFrequentValueImputer.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.impute import breeze.linalg.{DenseVector, SliceVector} import cats.syntax.option._ import io.picnicml.doddlemodel.data.Feature.FeatureIndex import io.picnicml.doddlemodel.data.{Features, RealVector} import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Transformer case class MostFrequentValueImputer private (private[impute] val mostFrequent: Option[RealVector], private val featureIndex: FeatureIndex) object MostFrequentValueImputer { def apply(featureIndex: FeatureIndex): MostFrequentValueImputer = MostFrequentValueImputer(None, featureIndex) @SerialVersionUID(0L) implicit lazy val ev: Transformer[MostFrequentValueImputer] = new Transformer[MostFrequentValueImputer] { override def isFitted(model: MostFrequentValueImputer): Boolean = model.mostFrequent.isDefined override def fit(model: MostFrequentValueImputer, x: Features): MostFrequentValueImputer = { val xToPreprocess = x(::, model.featureIndex.categorical.columnIndices) val mostFrequent = DenseVector.zeros[Float](xToPreprocess.cols) 0 until xToPreprocess.cols foreach { colIndex => mostFrequent(colIndex) = getMostFrequent(xToPreprocess(xToPreprocess(::, colIndex).findAll(!_.isNaN), colIndex)) } model.copy(mostFrequent.some) } private def getMostFrequent(column: SliceVector[(Int, Int), Float]): Float = { val counts = scala.collection.mutable.Map.empty[Float, Int].withDefaultValue(0) column.foreachValue(value => counts(value) = counts(value) + 1) counts.maxBy(_._2)._1 } override protected def transformSafe(model: MostFrequentValueImputer, x: Features): Features = { val xCopy = x.copy model.featureIndex.categorical.columnIndices.zipWithIndex.foreach { case (colIndex, statisticIndex) => xCopy(::, colIndex).findAll(_.isNaN).iterator.foreach { rowIndex => xCopy(rowIndex, colIndex) = model.mostFrequent.getOrBreak(statisticIndex) } } xCopy } } }
Example 52
Source File: package.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel import breeze.linalg.{DenseMatrix, DenseVector, unique} import io.picnicml.doddlemodel.CrossScalaCompat.floatOrdering import io.picnicml.doddlemodel.data.Feature.FeatureIndex package object data { type RealVector = DenseVector[Float] type IntVector = DenseVector[Int] type Simplex = DenseMatrix[Float] type Features = DenseMatrix[Float] type Target = DenseVector[Float] type FeaturesWithIndex = (Features, FeatureIndex) type Dataset = (Features, Target) type DatasetWithIndex = (Features, Target, FeatureIndex) def loadBostonDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBostonDataset def loadBreastCancerDataset: DatasetWithIndex = ResourceDatasetLoaders.loadBreastCancerDataset def loadIrisDataset: DatasetWithIndex = ResourceDatasetLoaders.loadIrisDataset def loadHighSchoolTestDataset: DatasetWithIndex = ResourceDatasetLoaders.loadHighSchoolTestDataset def numberOfUniqueGroups(groups: IntVector): Int = { val uniqueGroups = unique(groups) require(uniqueGroups.toArray.sorted sameElements Array.range(0, uniqueGroups.length), "Invalid encoding of groups, all group indices in [0, numGroups) have to exist") uniqueGroups.length } def numberOfTargetClasses(y: Target): Int = { val targetClasses = unique(y) require(targetClasses.length >= 2, "Target variable must be comprised of at least two categories") require(targetClasses.toArray.sorted sameElements Array.range(0, targetClasses.length), "Invalid encoding of categories in the target variable") targetClasses.length } }
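A quick sanity check of the two helpers above (a sketch that assumes the data package object is imported):

import breeze.linalg.DenseVector

val groups: IntVector = DenseVector(0, 1, 1, 2, 0)
numberOfUniqueGroups(groups)              // 3; group indices must cover 0 until numGroups

val y: Target = DenseVector(0.0f, 1.0f, 1.0f, 2.0f)
numberOfTargetClasses(y)                  // 3; classes must be encoded as 0, 1, 2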
Example 53
Source File: LocalKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers(i) val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData val points = new HashSet[Vector[Double]] val kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println(s"Initial centers: $kPoints") while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val mappings = closest.groupBy[Int] (x => x._1) val pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints(mapping._1), mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println(s"Final centers: $kPoints") } } // scalastyle:on println
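closestPoint above leans on Breeze's squaredDistance; a standalone check of that call:

import breeze.linalg.{squaredDistance, DenseVector}

val a = DenseVector(0.0, 0.0)
val b = DenseVector(3.0, 4.0)
squaredDistance(a, b)   // 25.0 (no square root is taken)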
Example 54
Source File: SparkLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println(s"Final w: $w") spark.stop() } } // scalastyle:on println
Example 55
Source File: LocalFileLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val fileSrc = scala.io.Source.fromFile(args(0)) val lines = fileSrc.getLines().toArray val points = lines.map(parsePoint) val ITERATIONS = args(1).toInt // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } fileSrc.close() println(s"Final w: $w") } } // scalastyle:on println
Example 56
Source File: SparkKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{squaredDistance, DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkKMeans") .getOrCreate() val lines = spark.read.textFile(args(0)).rdd val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println(s"Finished iteration (delta = $tempDist)") } println("Final centers:") kPoints.foreach(println) spark.stop() } } // scalastyle:on println
Example 57
Source File: LocalLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println(s"Final w: $w") } } // scalastyle:on println
Example 58
Source File: SparkHdfsLR.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") val y = tok.nextToken.toDouble val x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd lines.cache() val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value val w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println(s"Initial w: $w") for (i <- 1 to ITERATIONS) { println(s"On iteration $i") val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println(s"Final w: $w") spark.stop() } } // scalastyle:on println
Example 59
Source File: BaseChangeStrategy.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.anomalydetection

import breeze.linalg.DenseVector

// NOTE: the enclosing strategy declaration (which defines order, maxRateDecrease,
// maxRateIncrease and the diff method used below) was dropped when this excerpt was extracted.

  override def detect(
      dataSeries: Vector[Double],
      searchInterval: (Int, Int))
    : Seq[(Int, Anomaly)] = {
    val (start, end) = searchInterval

    require(start <= end, "The start of the interval cannot be larger than the end.")

    val startPoint = Seq(start - order, 0).max
    val data = diff(DenseVector(dataSeries.slice(startPoint, end): _*), order).data

    data.zipWithIndex.filter { case (value, _) =>
      (value < maxRateDecrease.getOrElse(Double.MinValue) || value > maxRateIncrease.getOrElse(Double.MaxValue))
    }
    .map { case (change, index) =>
      (index + startPoint + order, Anomaly(Option(dataSeries(index + startPoint + order)), 1.0,
        Some(s"[AbsoluteChangeStrategy]: Change of $change is not in bounds [" +
          s"${maxRateDecrease.getOrElse(Double.MinValue)}, " +
          s"${maxRateIncrease.getOrElse(Double.MaxValue)}]. Order=$order")))
    }
  }
}
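detect expects diff to return the order-th changes of the series; for the absolute-change case an order-1 diff is just element-wise subtraction of shifted slices. A standalone sketch (not the deequ API):

import breeze.linalg.DenseVector

val s = DenseVector(1.0, 4.0, 2.0, 7.0)
val changes = s.slice(1, s.length) - s.slice(0, s.length - 1)
// changes == DenseVector(3.0, -2.0, 5.0); values outside the configured bounds become anomalies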
Example 60
Source File: RelativeRateOfChangeStrategy.scala From deequ with Apache License 2.0 | 5 votes |
package com.amazon.deequ.anomalydetection

import breeze.linalg.DenseVector

// NOTE: the enclosing strategy declaration was dropped when this excerpt was extracted;
// only its override of diff is shown.

  override def diff(dataSeries: DenseVector[Double], order: Int): DenseVector[Double] = {
    require(order > 0, "Order of diff cannot be zero or negative")
    if (dataSeries.length == 0) {
      dataSeries
    } else {
      val valuesRight = dataSeries.slice(order, dataSeries.length)
      val valuesLeft = dataSeries.slice(0, dataSeries.length - order)
      valuesRight / valuesLeft
    }
  }
}
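The same idea with ratios instead of differences, mirroring the diff above on a toy series (standalone sketch):

import breeze.linalg.DenseVector

val series = DenseVector(1.0, 2.0, 4.0, 8.0)
val ratios = series.slice(1, series.length) / series.slice(0, series.length - 1)
// ratios == DenseVector(2.0, 2.0, 2.0): each point is compared to its predecessor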
Example 61
Source File: StratifiedClassifier.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.dummy.classification import breeze.linalg.DenseVector import breeze.stats.distributions.Multinomial import cats.syntax.option._ import io.picnicml.doddlemodel.CrossScalaCompat.doubleOrdering import io.picnicml.doddlemodel.data.{Features, Simplex, Target} import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier.ev import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Classifier case class StratifiedClassifier private (numClasses: Option[Int], targetDistr: Option[Multinomial[DenseVector[Double], Int]]) { def getTargetDistributionParams: DenseVector[Double] = { require(ev.isFitted(this), "Called getTargetDistributionParams on a model that is not trained yet") this.targetDistr.getOrBreak.params.copy } } object StratifiedClassifier { def apply(): StratifiedClassifier = StratifiedClassifier(none, none) @SerialVersionUID(0L) implicit lazy val ev: Classifier[StratifiedClassifier] = new Classifier[StratifiedClassifier] { override def numClasses(model: StratifiedClassifier): Option[Int] = model.numClasses override def isFitted(model: StratifiedClassifier): Boolean = model.targetDistr.isDefined override protected[doddlemodel] def copy(model: StratifiedClassifier, numClasses: Int): StratifiedClassifier = model.copy(numClasses = numClasses.some) override protected def fitSafe(model: StratifiedClassifier, x: Features, y: Target): StratifiedClassifier = { val probs = y.activeValuesIterator.foldLeft(Map[Double, Int]()) { (acc, value) => val valueDouble = value.toDouble if (acc.contains(valueDouble)) acc + (valueDouble -> (acc(valueDouble) + 1)) else acc + (valueDouble -> 1) }.toArray.sortBy(_._1).map(_._2 / y.length.toDouble) model.copy(targetDistr = Multinomial[DenseVector[Double], Int](DenseVector(probs)).some) } override protected def predictSafe(model: StratifiedClassifier, x: Features): Target = DenseVector(Array.fill(x.rows)(model.targetDistr.getOrBreak.draw.toFloat)) override protected def predictProbaSafe(model: StratifiedClassifier, x: Features): Simplex = throw new NotImplementedError("Method predictProbaSafe is not defined for StratifiedClassifier") } }
Example 62
Source File: ExpectedImprovement.scala From pravda-ml with Apache License 2.0 | 5 votes |
package com.linkedin.photon.ml.hyperparameter.criteria

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import breeze.stats.distributions.Gaussian

import com.linkedin.photon.ml.hyperparameter.estimators.PredictionTransformation

  // Note: the enclosing class declaration is elided in this snippet;
  // bestEvaluation and standardNormal are members of that class.
  def apply(
      predictiveMeans: DenseVector[Double],
      predictiveVariances: DenseVector[Double]): DenseVector[Double] = {

    val std = sqrt(predictiveVariances)

    // PBO Eq. 1
    val gamma = - (predictiveMeans - bestEvaluation) / std

    // Eq. 2
    std :* ((gamma :* gamma.map(standardNormal.cdf)) + gamma.map(standardNormal.pdf))
  }
}
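For reference, a minimal standalone sketch (not from photon-ml) of the same expected-improvement quantity, EI = std * (gamma * Phi(gamma) + phi(gamma)) with gamma = -(mean - best) / std; the names best, means and variances below are illustrative, and depending on the Breeze version, constructing Gaussian may require an implicit RandBasis in scope.

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import breeze.stats.distributions.Gaussian

object ExpectedImprovementSketch {
  def main(args: Array[String]): Unit = {
    val standardNormal = Gaussian(0, 1)
    val best = 1.0
    val means = DenseVector(0.8, 1.2)
    val variances = DenseVector(0.04, 0.09)
    val std = sqrt(variances)
    val gamma = - (means - best) / std
    // element-wise expected improvement for each candidate point
    val ei = std :* ((gamma :* gamma.map(standardNormal.cdf)) + gamma.map(standardNormal.pdf))
    println(ei)
  }
}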
Example 63
Source File: SparkLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}

import org.apache.spark._

object SparkLR {
  val N = 10000  // Number of data points
  val D = 10     // Number of dimensions
  val R = 0.7    // Scaling factor
  val ITERATIONS = 5

  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def generateData: Array[DataPoint] = {
    def generatePoint(i: Int): DataPoint = {
      val y = if (i % 2 == 0) -1 else 1
      val x = DenseVector.fill(D){rand.nextGaussian + y * R}
      DataPoint(x, y)
    }
    Array.tabulate(N)(generatePoint)
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
    // String.stripMargin removes the leading whitespace of each line up to and including
    // the first vertical bar |
  }

  def main(args: Array[String]) {

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkLR").setMaster("local")
    val sc = new SparkContext(sparkConf)

    val numSlices = if (args.length > 0) args(0).toInt else 2
    val points = sc.parallelize(generateData, numSlices).cache()

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}
// scalastyle:on println
Example 64
Source File: LocalFileLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  // Parse each line of data into a DataPoint object
  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    // Read the file with fromFile and convert it to an Array[String]
    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    // Parse every line with parsePoint
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
// scalastyle:on println
Example 65
Source File: SparkKMeans.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.{SparkConf, SparkContext} import org.apache.spark.SparkContext._ object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val sparkConf = new SparkConf().setAppName("SparkKMeans") val sc = new SparkContext(sparkConf) val lines = sc.textFile(args(0)) val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42).toArray var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) sc.stop() } } // scalastyle:on println
Example 66
Source File: SparkHdfsLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo

    // Note: the object declaration, the main() header and the helper definitions
    // (D, rand, parsePoint, showWarning) are elided in this snippet.
    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkHdfsLR").setMaster("local[2]")
    val inputPath = "D:\\spark\\spark-1.5.0-hadoop2.6\\data\\mllib\\lr_data.txt" // args(0)
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).cache()  // cache the parsed points
    val ITERATIONS = 6  // args(1).toInt, the number of iterations

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>  // p is a DataPoint
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)

    sc.stop()
  }
}
// scalastyle:on println
Example 67
Source File: SparkTachyonHdfsLR.scala From spark1.52 with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.math.exp

import breeze.linalg.{Vector, DenseVector}
import org.apache.hadoop.conf.Configuration

import org.apache.spark._
import org.apache.spark.scheduler.InputFormatInfo
import org.apache.spark.storage.StorageLevel

object SparkTachyonHdfsLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD (stochastic gradient descent) or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS (a quasi-Newton method)
        |for more conventional use.
      """.stripMargin)
  }

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val tok = new java.util.StringTokenizer(line, " ")
    var y = tok.nextToken.toDouble
    var x = new Array[Double](D)
    var i = 0
    while (i < D) {
      x(i) = tok.nextToken.toDouble; i += 1
    }
    DataPoint(new DenseVector(x), y)
  }

  def main(args: Array[String]) {
    showWarning()
    val inputPath = args(0)
    val sparkConf = new SparkConf().setAppName("SparkTachyonHdfsLR")
    val conf = new Configuration()
    val sc = new SparkContext(sparkConf,
      InputFormatInfo.computePreferredLocations(
        Seq(new InputFormatInfo(conf, classOf[org.apache.hadoop.mapred.TextInputFormat], inputPath))
      ))
    val lines = sc.textFile(inputPath)
    val points = lines.map(parsePoint _).persist(StorageLevel.OFF_HEAP)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      val gradient = points.map { p =>
        p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y
      }.reduce(_ + _)
      w -= gradient
    }

    println("Final w: " + w)
    sc.stop()
  }
}
// scalastyle:on println
Example 68
Source File: NearestNeighbors.scala From SparkSMOTE with MIT License | 5 votes |
package utils import org.apache.spark.SparkContext import breeze.linalg._ import breeze.linalg.{DenseVector,Vector,SparseVector} import com.github.fommil.netlib.BLAS import scala.util.Random import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.collection.mutable.ArrayBuffer object NearestNeighbors { def runNearestNeighbors(data: RDD[Array[(LabeledPoint,Int,Int)]], kNN: Int, sampleData: Array[(LabeledPoint,Int,Int)]): Array[(String,Array[((Int,Int),Double)])] = { val globalNearestNeighborsByIndex = data.mapPartitionsWithIndex(localNearestNeighbors(_,_,kNN,sampleData)).groupByKey().map(x => (x._1,x._2.toArray.sortBy(r => r._2).take(kNN))).collect() globalNearestNeighborsByIndex } private def localNearestNeighbors(partitionIndex: Long, iter: Iterator[Array[(LabeledPoint,Int,Int)]], kNN: Int, sampleData: Array[(LabeledPoint,Int,Int)]): Iterator[(String,((Int,Int),Double))] = { var result = List[(String,((Int,Int),Double))]() val dataArr = iter.next val nLocal = dataArr.size - 1 val sampleDataSize = sampleData.size - 1 val kLocalNeighbors = Array.fill[distanceIndex](sampleDataSize+1)(null) for { i1 <- 0 to sampleDataSize } kLocalNeighbors(i1) = distanceIndex(sampleData(i1)._3.toInt, sampleData(i1)._2.toInt, DenseVector.zeros[Double](kNN) + Int.MaxValue.toDouble, DenseVector.zeros[Int](kNN)) for (i <- 0 to nLocal) { val currentPoint = dataArr(i) val features = currentPoint._1.features val rowId = currentPoint._3.toInt for (j <- 0 to sampleDataSize) { val samplePartitionId = sampleData(j)._2 val sampleRowId = sampleData(j)._3 val sampleFeatures = sampleData(j)._1.features if (!((rowId == sampleRowId) & (samplePartitionId == partitionIndex))) { val distance = Math.sqrt(sum((sampleFeatures - features) :* (sampleFeatures - features))) if (distance < max(kLocalNeighbors(j).distanceVector)) { val indexToReplace = argmax(kLocalNeighbors(j).distanceVector) kLocalNeighbors(j).distanceVector(indexToReplace) = distance kLocalNeighbors(j).neighborRowId(indexToReplace) = rowId } } } } for (m <- 0 to sampleDataSize){ for (l <-0 to kNN-1) { val key = kLocalNeighbors(m).partitionId.toString+","+kLocalNeighbors(m).sampleRowId.toString val tup = (partitionIndex.toInt,kLocalNeighbors(m).neighborRowId(l)) result.::=(key,(tup,kLocalNeighbors(m).distanceVector(l))) } } result.iterator } }
Example 69
Source File: loadData.scala From SparkSMOTE with MIT License | 5 votes |
package utils import org.apache.spark.SparkContext import breeze.linalg._ import breeze.linalg.{DenseVector,Vector,SparseVector} import org.apache.spark.rdd.RDD import org.apache.spark.broadcast.Broadcast object loadData { def readDelimitedData(sc: SparkContext, path: String, numFeatures: Int, delimiter: String, numPartitions: Int): RDD[(LabeledPoint,Int,Int)] = { val data = sc.textFile(path).filter{x => x.split(delimiter)(0).toDouble == 1.0}.repartition(numPartitions).mapPartitions{x => Iterator(x.toArray)} val formatData = data.mapPartitionsWithIndex{(partitionId,iter) => var result = List[(LabeledPoint,Int,Int)]() val dataArray = iter.next val dataArraySize = dataArray.size - 1 var rowCount = dataArraySize for (i <- 0 to dataArraySize) { val parts = dataArray(i).split(delimiter) result.::=((LabeledPoint(parts(0).toDouble,DenseVector(parts.slice(1,numFeatures+1)).map(_.toDouble)),partitionId.toInt,rowCount)) rowCount = rowCount - 1 } result.iterator } formatData } }
Example 70
Source File: SMOTE.scala From SparkSMOTE with MIT License | 5 votes |
package SMOTE import org.apache.spark.SparkContext import breeze.linalg._ import breeze.linalg.{DenseVector,Vector,SparseVector} import com.github.fommil.netlib.BLAS import scala.util.Random import org.apache.spark.rdd.RDD import org.apache.spark.SparkContext import org.apache.spark.broadcast.Broadcast import scala.collection.mutable.ArrayBuffer import utils._ object SMOTE { def runSMOTE(sc: SparkContext, inPath: String, outPath: String, numFeatures: Int, oversamplingPctg: Double, kNN: Int, delimiter: String, numPartitions: Int): Unit = { val rand = new Random() val data = loadData.readDelimitedData(sc, inPath, numFeatures, delimiter, numPartitions) val dataArray = data.mapPartitions(x => Iterator(x.toArray)).cache() val numObs = dataArray.map(x => x.size).reduce(_+_) println("Number of Filtered Observations "+numObs.toString) val roundPctg = oversamplingPctg val sampleData = dataArray.flatMap(x => x).sample(withReplacement = false, fraction = roundPctg, seed = 1L).collect().sortBy(r => (r._2,r._3)) //without Replacement println("Sample Data Count "+sampleData.size.toString) val globalNearestNeighbors = NearestNeighbors.runNearestNeighbors(dataArray, kNN, sampleData) var randomNearestNeighbor = globalNearestNeighbors.map(x => (x._1.split(",")(0).toInt,x._1.split(",")(1).toInt,x._2(rand.nextInt(kNN)))).sortBy(r => (r._1,r._2)) var sampleDataNearestNeighbors = randomNearestNeighbor.zip(sampleData).map(x => (x._1._3._1._1, x._1._2, x._1._3._1._2, x._2._1)) val syntheticData = dataArray.mapPartitionsWithIndex(createSyntheticData(_,_,sampleDataNearestNeighbors,delimiter)).persist() println("Synthetic Data Count "+syntheticData.count.toString) val newData = syntheticData.union(sc.textFile(inPath)) println("New Line Count "+newData.count.toString) newData.saveAsTextFile(outPath) } private def createSyntheticData(partitionIndex: Long, iter: Iterator[Array[(LabeledPoint,Int,Int)]], sampleDataNN: Array[(Int,Int,Int,LabeledPoint)], delimiter: String): Iterator[String] = { var result = List[String]() val dataArr = iter.next val nLocal = dataArr.size - 1 val sampleDataNNSize = sampleDataNN.size - 1 val rand = new Random() for (j <- 0 to sampleDataNNSize){ val partitionId = sampleDataNN(j)._1 val neighborId = sampleDataNN(j)._3 val sampleFeatures = sampleDataNN(j)._4.features if (partitionId == partitionIndex.toInt){ val currentPoint = dataArr(neighborId) val features = currentPoint._1.features sampleFeatures += (sampleFeatures - features) * rand.nextDouble result.::=("1.0"+delimiter+sampleFeatures.toArray.mkString(delimiter)) } } result.iterator } }
Example 71
Source File: LocalKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 72
Source File: SparkLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{Vector, DenseVector} import org.apache.spark._ object SparkLR { val N = 10000 // Number of data points val D = 10 // Numer of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D){rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val sparkConf = new SparkConf().setAppName("SparkLR") val sc = new SparkContext(sparkConf) val numSlices = if (args.length > 0) args(0).toInt else 2 val points = sc.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D){2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) sc.stop() } }
Example 73
Source File: TestingUtils.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel

import breeze.linalg.{DenseMatrix, DenseVector, convert, zipValues}
import breeze.stats.distributions.Rand
import io.picnicml.doddlemodel.data.{Dataset, RealVector}
import org.scalactic.Equality

trait TestingUtils {

  implicit lazy val randomUniform: Rand[Float] = new Rand[Float] {
    override def draw(): Float = Rand.uniform.draw().toFloat
  }

  def breezeEqual(x0: DenseMatrix[Float], x1: DenseMatrix[Float])(implicit tol: Equality[Float]): Boolean =
    breezeEqual(x0.toDenseVector, x1.toDenseVector)

  def breezeEqual(x0: RealVector, x1: RealVector)(implicit tol: Equality[Float]): Boolean =
    zipValues(x0, x1).forall((v0, v1) => (v0.isNaN && v1.isNaN) || tol.areEquivalent(v0, v1))

  def gradApprox(func: RealVector => Float, x: RealVector, h: Double = 1e-3): RealVector = {
    // two-sided finite differences
    val grad = DenseVector.zeros[Double](x.length)
    for ((i, _) <- x.activeIterator) {
      val xPlusH = convert(x.copy, Double)
      xPlusH(i) += h
      val xMinusH = convert(x.copy, Double)
      xMinusH(i) -= h
      grad(i) = (func(convert(xPlusH, Float)) - func(convert(xMinusH, Float)).toDouble) / (2.0 * h)
    }
    convert(grad, Float)
  }

  def dummyData(nRows: Int, nCols: Int = 1): Dataset =
    (DenseMatrix.zeros[Float](nRows, nCols), convert(DenseVector((0 until nRows).toArray), Float))
}
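As a quick illustration of the two-sided finite-difference idea behind gradApprox above, here is a self-contained sketch (the function f, the object name GradApproxSketch and the values are made up) that approximates the gradient of f(x) = x dot x and compares it with the analytic gradient 2x.

import breeze.linalg.DenseVector

object GradApproxSketch {
  def main(args: Array[String]): Unit = {
    def f(x: DenseVector[Double]): Double = x dot x
    val x = DenseVector(1.0, -2.0, 3.0)
    val h = 1e-5
    // central differences: (f(x + h*e_i) - f(x - h*e_i)) / (2h) per coordinate
    val grad = DenseVector.tabulate(x.length) { i =>
      val xPlus = x.copy
      xPlus(i) += h
      val xMinus = x.copy
      xMinus(i) -= h
      (f(xPlus) - f(xMinus)) / (2 * h)
    }
    println(grad)      // approximately DenseVector(2.0, -4.0, 6.0)
    println(x * 2.0)   // analytic gradient for comparison
  }
}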
Example 74
Source File: GroupKFoldSplitterTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.modelselection import breeze.linalg.DenseVector import io.picnicml.doddlemodel.TestingUtils import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class GroupKFoldSplitterTest extends AnyFlatSpec with Matchers with TestingUtils { "GroupKFoldSplitter" should "split data so that folds are i.i.d" in { val (x, y) = dummyData(10) val groups = DenseVector(1, 2, 2, 0, 0, 0, 2, 1, 1, 2) val splitter = GroupKFoldSplitter(numFolds = 3) val splits = splitter.splitData(x, y, groups) val noGroupsInTrainTestSplits = splits.forall { split => val trGroups = split.yTr.map(x => groups(x.toInt)).toArray val teGroups = split.yTe.map(x => groups(x.toInt)).toArray trGroups.forall(trGroup => !teGroups.contains(trGroup)) } noGroupsInTrainTestSplits shouldBe true } }
Example 75
Source File: LinearClassifierTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear import breeze.linalg.{DenseMatrix, DenseVector} import breeze.numerics.sigmoid import cats.syntax.option._ import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.{Features, RealVector, Simplex, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearClassifier import org.scalatest.OptionValues import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers case class DummyLinearClassifier(numClasses: Option[Int], w: Option[RealVector]) class LinearClassifierTest extends AnyFlatSpec with Matchers with OptionValues with TestingUtils { val ev: LinearClassifier[DummyLinearClassifier] = new LinearClassifier[DummyLinearClassifier] { override def numClasses(model: DummyLinearClassifier): Option[Int] = model.numClasses override protected def w(model: DummyLinearClassifier): Option[RealVector] = model.w override protected[doddlemodel] def copy(model: DummyLinearClassifier, numClasses: Int): DummyLinearClassifier = model.copy(numClasses = numClasses.some) override protected def copy(model: DummyLinearClassifier, w: RealVector): DummyLinearClassifier = model.copy(w = w.some) override protected def predictStateless(model: DummyLinearClassifier, w: RealVector, x: Features): Target = x * w override protected def predictProbaStateless(model: DummyLinearClassifier, w: RealVector, x: Features): Simplex = sigmoid(x * w).asDenseMatrix.t override protected[linear] def lossStateless(model: DummyLinearClassifier, w: RealVector, x: Features, y: Target): Float = 0.0f override protected[linear] def lossGradStateless(model: DummyLinearClassifier, w: RealVector, x: Features, y: Target): RealVector = w } private val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) private val y = DenseVector.vertcat(DenseVector.zeros[Float](5), DenseVector.ones[Float](5)) private val model = DummyLinearClassifier(none, none) "Linear classifier" should "throw an exception when using fit, predict on trained, untrained models" in { an [IllegalArgumentException] should be thrownBy ev.predict(model, x) val trainedModel = ev.fit(model, x, y) an [IllegalArgumentException] should be thrownBy ev.fit(trainedModel, x, y) } it should "implement predictor functions" in { ev.isFitted(model) shouldBe false val trainedModel = ev.fit(model, x, y) ev.isFitted(trainedModel) shouldBe true val yPred = ev.predict(trainedModel, x) yPred.length shouldEqual y.length } it should "set the number of classes after fit" in { ev.numClasses(model).isEmpty shouldBe true val trainedModel = ev.fit(model, x, y) ev.numClasses(trainedModel).value shouldBe 2 } it should "throw an exception if fitting a model with an invalid target variable" in { val invalidCategoricalY = DenseVector.zeros[Float](10) an [IllegalArgumentException] should be thrownBy ev.fit(model, x, invalidCategoricalY) val invalidRealY = DenseVector.rand[Float](10, rand = randomUniform) an [IllegalArgumentException] should be thrownBy ev.fit(model, x, invalidRealY) } }
Example 76
Source File: SoftmaxClassifierTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear import breeze.linalg.{DenseMatrix, DenseVector} import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.SoftmaxClassifier.ev import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class SoftmaxClassifierTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1e-3f) "Softmax classifier" should "calculate the value of the loss function" in { val w = DenseVector(1.0f, 2.0f, 3.0f, 4.0f, 5.0f, 1.0f) val x = DenseMatrix( List(3.0f, 1.0f, 2.0f), List(-1.0f, -2.0f, 2.0f), List(-2.0f, 1.0f, 0.0f) ) val y = DenseVector(1.0f, 0.0f, 2.0f) val model = ev.copy(SoftmaxClassifier(lambda = 1.0f), numClasses = 3) ev.lossStateless(model, w, x, y) shouldEqual 19.843778223530194f } it should "calculate the gradient of the loss function wrt. to model parameters" in { for (_ <- 1 to 1000) { val w = DenseVector.rand[Float](5 * 9, rand = randomUniform) val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) val y = DenseVector.rangeF(0, 10) testGrad(w, x, y) } def testGrad(w: RealVector, x: Features, y: Target) = { val model = ev.copy(SoftmaxClassifier(lambda = 0.5f), numClasses = 10) breezeEqual( gradApprox(w => ev.lossStateless(model, w, x, y), w), ev.lossGradStateless(model, w, x, y) ) shouldEqual true } } it should "prevent the usage of negative L2 regularization strength" in { an [IllegalArgumentException] shouldBe thrownBy(SoftmaxClassifier(lambda = -0.5f)) } }
Example 77
Source File: PoissonRegressionTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear import breeze.linalg.{DenseMatrix, DenseVector, convert} import breeze.stats.distributions.Rand import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.PoissonRegression.ev import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class PoissonRegressionTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1e-2f) "Poisson regression" should "calculate the value of the loss function" in { val w = DenseVector(1.0f, 2.0f, 3.0f) val x = DenseMatrix( List(3.0f, 1.0f, 2.0f), List(-1.0f, -2.0f, 2.0f) ) val y = DenseVector(3.0f, 4.0f) val model = PoissonRegression(lambda = 1.0f) ev.lossStateless(model, w, x, y) shouldEqual 29926.429998513137f } it should "calculate the gradient of the loss function wrt. to model parameters" in { for (_ <- 1 to 1000) { val w = DenseVector.rand[Float](5, rand = randomUniform) val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) val y = convert(DenseVector.rand(10, rand = Rand.randInt(20)), Float) testGrad(w, x, y) } def testGrad(w: RealVector, x: Features, y: Target) = { val model = PoissonRegression(lambda = 0.5f) breezeEqual( gradApprox(w => ev.lossStateless(model, w, x, y), w), ev.lossGradStateless(model, w, x, y) ) shouldEqual true } } it should "prevent the usage of negative L2 regularization strength" in { an [IllegalArgumentException] shouldBe thrownBy(PoissonRegression(lambda = -0.5f)) } it should "throw an exception if fitting a model on a dataset that is not count data" in { val x = DenseMatrix( List(3.0f, 1.0f, 2.0f), List(-1.0f, -2.0f, 2.0f), List(3.0f, 1.0f, 2.0f) ) val y = DenseVector.rand[Float](3, rand = randomUniform) val model = PoissonRegression() an [IllegalArgumentException] shouldBe thrownBy(ev.fit(model, x, y)) } }
Example 78
Source File: LinearRegressionTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear import breeze.linalg.{DenseMatrix, DenseVector} import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.LinearRegression.ev import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class LinearRegressionTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1e-3f) "Linear regression" should "calculate the value of the loss function" in { val w = DenseVector(1.0f, 2.0f, 3.0f) val x = DenseMatrix( List(3.0f, 1.0f, 2.0f), List(-1.0f, -2.0f, 2.0f) ) val y = DenseVector(3.0f, 4.0f) val model = LinearRegression(lambda = 1) ev.lossStateless(model, w, x, y) shouldEqual 24.75f } it should "calculate the gradient of the loss function wrt. to model parameters" in { for (_ <- 1 to 1000) { val w = DenseVector.rand[Float](5, rand = randomUniform) val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) val y = DenseVector.rand[Float](10, rand = randomUniform) testGrad(w, x, y) } def testGrad(w: RealVector, x: Features, y: Target) = { val model = LinearRegression(lambda = 0.5f) breezeEqual( gradApprox(w => ev.lossStateless(model, w, x, y), w), ev.lossGradStateless(model, w, x, y) ) shouldEqual true } } it should "prevent the usage of negative L2 regularization strength" in { an [IllegalArgumentException] shouldBe thrownBy(LinearRegression(lambda = -0.5f)) } }
Example 79
Source File: LogisticRegressionTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear import breeze.linalg.{DenseMatrix, DenseVector, convert} import breeze.numerics.round import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.LogisticRegression.ev import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class LogisticRegressionTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1e-3f) "Logistic regression" should "calculate the value of the loss function" in { val w = DenseVector(1.0f, 2.0f, 3.0f) val x = DenseMatrix( List(3.0f, 1.0f, 2.0f), List(-1.0f, -2.0f, 2.0f) ) val y = DenseVector(1.0f, 0.0f) val model = LogisticRegression(lambda = 1) ev.lossStateless(model, w, x, y) shouldEqual 7.1566391945397703f } it should "calculate the gradient of the loss function wrt. to model parameters" in { for (_ <- 1 to 1000) { val w = DenseVector.rand[Float](5, rand = randomUniform) val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) val y = convert(round(DenseVector.rand[Float](10, rand = randomUniform)), Float) testGrad(w, x, y) } def testGrad(w: RealVector, x: Features, y: Target) = { val model = LogisticRegression(lambda = 0.5f) breezeEqual( gradApprox(w => ev.lossStateless(model, w, x, y), w), ev.lossGradStateless(model, w, x, y) ) shouldEqual true } } it should "prevent the usage of negative L2 regularization strength" in { an [IllegalArgumentException] shouldBe thrownBy(LogisticRegression(lambda = -0.5f)) } it should "throw an exception if fitting a model on a dataset with more than two classes" in { val x = DenseMatrix( List(3.0f, 1.0f, 2.0f), List(-1.0f, -2.0f, 2.0f), List(3.0f, 1.0f, 2.0f) ) val y = DenseVector(1.0f, 0.0f, 2.0f) val model = LogisticRegression() an [IllegalArgumentException] shouldBe thrownBy(ev.fit(model, x, y)) } }
Example 80
Source File: LinearRegressorTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear import breeze.linalg.{DenseMatrix, DenseVector} import cats.syntax.option._ import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.linear.typeclasses.LinearRegressor import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers case class DummyLinearRegressor(w: Option[RealVector]) class LinearRegressorTest extends AnyFlatSpec with Matchers with TestingUtils { val ev: LinearRegressor[DummyLinearRegressor] = new LinearRegressor[DummyLinearRegressor] { override protected def w(model: DummyLinearRegressor): Option[RealVector] = model.w override protected def copy(model: DummyLinearRegressor): DummyLinearRegressor = model.copy() override protected def copy(model: DummyLinearRegressor, w: RealVector): DummyLinearRegressor = model.copy(w.some) override protected def targetVariableAppropriate(y: Target): Boolean = true override protected def predictStateless(model: DummyLinearRegressor, w: RealVector, x: Features): Target = x * w override protected[linear] def lossStateless(model: DummyLinearRegressor, w: RealVector, x: Features, y: Target): Float = 0.0f override protected[linear] def lossGradStateless(model: DummyLinearRegressor, w: RealVector, x: Features, y: Target): RealVector = w } private val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) private val y = DenseVector.rand[Float](10, rand = randomUniform) private val model = DummyLinearRegressor(none) "Linear regressor" should "throw an exception when using fit, predict on trained, untrained models" in { an [IllegalArgumentException] should be thrownBy ev.predict(model, x) val trainedModel = ev.fit(model, x, y) an [IllegalArgumentException] should be thrownBy ev.fit(trainedModel, x, y) } it should "implement predictor functions" in { ev.isFitted(model) shouldBe false val trainedModel = ev.fit(model, x, y) ev.isFitted(trainedModel) shouldBe true val yPred = ev.predict(trainedModel, x) yPred.length shouldEqual y.length } }
Example 81
Source File: StandardScalerTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.preprocessing import breeze.linalg.{*, DenseMatrix, DenseVector, convert} import breeze.stats.{mean, stddev} import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature} import io.picnicml.doddlemodel.preprocessing.StandardScaler.ev import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class StandardScalerTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1e-4f) "Standard scaler" should "preprocess the numerical features" in { val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) val featureIndex = FeatureIndex( List( NumericalFeature, NumericalFeature, NumericalFeature, NumericalFeature, CategoricalFeature ) ) val scaler = StandardScaler(featureIndex) val trainedScaler = ev.fit(scaler, x) val xTransformed = ev.transform(trainedScaler, x) breezeEqual(mean(x(::, *)).t, DenseVector.zeros[Float](5)) shouldBe false breezeEqual(convert(stddev(x(::, *)).t, Float), DenseVector.ones[Float](5)) shouldBe false val expectedMeans = DenseVector.zeros[Float](5) expectedMeans(-1) = mean(x(::, -1)) breezeEqual(mean(xTransformed(::, *)).t, expectedMeans) shouldBe true val expectedStdDevs = DenseVector.ones[Float](5) expectedStdDevs(-1) = stddev(x(::, -1)).toFloat breezeEqual(convert(stddev(xTransformed(::, *)).t, Float), expectedStdDevs) shouldBe true } it should "handle the zero variance case" in { val x = DenseMatrix.ones[Float](10, 5) val scaler = StandardScaler(FeatureIndex.numerical(5)) val trainedScaler = ev.fit(scaler, x) val xTransformed = ev.transform(trainedScaler, x) xTransformed.forall(_.isNaN) shouldBe false } it should "preprocess a subset of numerical features" in { val x = DenseMatrix.rand[Float](10, 5, rand = randomUniform) val scaler = StandardScaler(FeatureIndex.numerical(5).subset("f0", "f2", "f4")) val trainedScaler = ev.fit(scaler, x) val xTransformed = ev.transform(trainedScaler, x) breezeEqual(mean(x(::, *)).t, DenseVector.zeros[Float](5)) shouldBe false breezeEqual(convert(stddev(x(::, *)).t, Float), DenseVector.ones[Float](5)) shouldBe false assert(tolerance.areEqual(mean(xTransformed(::, 0)), 0.0f)) assert(tolerance.areEqual(convert(stddev(xTransformed(::, 0)), Float), 1.0f)) assert(!tolerance.areEqual(mean(xTransformed(::, 1)), 0.0f)) assert(!tolerance.areEqual(convert(stddev(xTransformed(::, 1)), Float), 1.0f)) assert(tolerance.areEqual(mean(xTransformed(::, 2)), 0.0f)) assert(tolerance.areEqual(convert(stddev(xTransformed(::, 2)), Float), 1.0f)) assert(!tolerance.areEqual(mean(xTransformed(::, 3)), 0.0f)) assert(!tolerance.areEqual(convert(stddev(xTransformed(::, 3)), Float), 1.0f)) assert(tolerance.areEqual(mean(xTransformed(::, 4)), 0.0f)) assert(tolerance.areEqual(convert(stddev(xTransformed(::, 4)), Float), 1.0f)) } }
Example 82
Source File: BinarizerTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.{DenseMatrix, DenseVector}
import io.picnicml.doddlemodel.TestingUtils
import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature}
import io.picnicml.doddlemodel.preprocessing.Binarizer.ev
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class BinarizerTest extends AnyFlatSpec with Matchers with TestingUtils {

  private val x = DenseMatrix(
    List(0.0f, 1.0f, 0.0f),
    List(0.3f, -1.0f, 1.0f),
    List(-0.3f, 2.0f, 0.0f)
  )

  "Binarizer" should "process the numerical columns by corresponding thresholds" in {
    val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, CategoricalFeature))
    val thresholds = DenseVector(0.0f, -1.5f)
    val binarizer = Binarizer(thresholds, featureIndex)
    val xBinarizedExpected = DenseMatrix(
      List(0.0f, 1.0f, 0.0f),
      List(1.0f, 1.0f, 1.0f),
      List(0.0f, 1.0f, 0.0f)
    )

    breezeEqual(ev.transform(binarizer, x), xBinarizedExpected) shouldBe true
  }

  it should "process all the numerical columns by a single threshold" in {
    val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))
    val threshold = 0.5f
    val binarizer = Binarizer(threshold, featureIndex)
    val xBinarizedExpected = DenseMatrix(
      List(0.0f, 1.0f, 0.0f),
      List(0.0f, 0.0f, 1.0f),
      List(0.0f, 1.0f, 0.0f)
    )

    breezeEqual(ev.transform(binarizer, x), xBinarizedExpected) shouldBe true
  }

  it should "amount to no-op if there are no numerical features in data" in {
    val featureIndex = FeatureIndex(List(CategoricalFeature, CategoricalFeature, CategoricalFeature))
    val thresholds1 = DenseVector(0.0f, -1.5f)
    val thresholds2 = 0.5f
    val binarizer1 = Binarizer(thresholds1, featureIndex)
    val binarizer2 = Binarizer(thresholds2, featureIndex)

    breezeEqual(ev.transform(binarizer1, x), x) shouldBe true
    breezeEqual(ev.transform(binarizer2, x), x) shouldBe true
  }

  it should "fail when the amount of passed thresholds is different to number of numerical features in data" in {
    val featureIndex = FeatureIndex(List(NumericalFeature, NumericalFeature, NumericalFeature))
    val thresholds = DenseVector(0.0f, -1.5f)  // 3 numeric columns vs 2 thresholds

    an [IllegalArgumentException] should be thrownBy Binarizer(thresholds, featureIndex)
  }
}
Example 83
Source File: NormsTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.preprocessing import breeze.linalg.{DenseMatrix, DenseVector} import io.picnicml.doddlemodel.TestingUtils import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class NormsTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1e-4f) private val x = DenseMatrix( List(0.0f, 0.0f, 0.0f), List(1.0f, 2.0f, 2.0f), List(-2.0f, 0.0f, 0.0f) ) "Norms" should "calculate the L2 norm of each row" in { val xExpected = DenseVector(0.0f, 3.0f, 2.0f) breezeEqual(Norms.L2Norm(x), xExpected) shouldBe true } "Norms" should "calculate the L1 norm of each row" in { val xExpected = DenseVector(0.0f, 5.0f, 2.0f) breezeEqual(Norms.L1Norm(x), xExpected) shouldBe true } "Norms" should "calculate the max norm of each row" in { val xExpected = DenseVector(0.0f, 2.0f, 2.0f) breezeEqual(Norms.MaxNorm(x), xExpected) shouldBe true } }
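For comparison, a minimal sketch that computes the same row-wise L2 norms directly with Breeze; Norms.L2Norm itself comes from doddle-model, so the object RowNormSketch below is purely illustrative.

import breeze.linalg.{norm, DenseMatrix, DenseVector}

object RowNormSketch {
  def main(args: Array[String]): Unit = {
    val x = DenseMatrix(
      (0.0, 0.0, 0.0),
      (1.0, 2.0, 2.0),
      (-2.0, 0.0, 0.0)
    )
    // L2 norm of each row, matching the first expectation in the test above
    val l2 = DenseVector.tabulate(x.rows)(i => norm(x(i, ::).t))
    println(l2)  // DenseVector(0.0, 3.0, 2.0)
  }
}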
Example 84
Source File: StratifiedClassifierTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.dummy.classification import breeze.linalg.{DenseVector, convert} import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.{loadBreastCancerDataset, loadIrisDataset} import io.picnicml.doddlemodel.dummy.classification.StratifiedClassifier.ev import org.scalactic.{Equality, TolerantNumerics} import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class StratifiedClassifierTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1e-3f) "Stratified classifier" should "infer a categorical distribution from the iris dataset" in { val (x, y, _) = loadIrisDataset val model = StratifiedClassifier() val trainedModel = ev.fit(model, x, y) breezeEqual( convert(trainedModel.getTargetDistributionParams, Float), DenseVector(0.333f, 0.333f, 0.333f) ) shouldBe true } it should "infer a categorical distribution from the breast cancer dataset" in { val (x, y, _) = loadBreastCancerDataset val model = StratifiedClassifier() val trainedModel = ev.fit(model, x, y) breezeEqual( convert(trainedModel.getTargetDistributionParams, Float), DenseVector(0.372f, 0.627f) ) shouldBe true } }
Example 85
Source File: LocalFileLR.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples

import java.util.Random

import breeze.linalg.{Vector, DenseVector}

object LocalFileLR {
  val D = 10   // Number of dimensions
  val rand = new Random(42)

  case class DataPoint(x: Vector[Double], y: Double)

  def parsePoint(line: String): DataPoint = {
    val nums = line.split(' ').map(_.toDouble)
    DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0))
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of Logistic Regression and is given as an example!
        |Please use either org.apache.spark.mllib.classification.LogisticRegressionWithSGD or
        |org.apache.spark.mllib.classification.LogisticRegressionWithLBFGS
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val lines = scala.io.Source.fromFile(args(0)).getLines().toArray
    val points = lines.map(parsePoint _)
    val ITERATIONS = args(1).toInt

    // Initialize w to a random value
    var w = DenseVector.fill(D){2 * rand.nextDouble - 1}
    println("Initial w: " + w)

    for (i <- 1 to ITERATIONS) {
      println("On iteration " + i)
      var gradient = DenseVector.zeros[Double](D)
      for (p <- points) {
        val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y
        gradient += p.x * scale
      }
      w -= gradient
    }

    println("Final w: " + w)
  }
}
Example 86
Source File: MostFrequentValueImputerTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.impute import breeze.linalg.{DenseMatrix, DenseVector} import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature} import io.picnicml.doddlemodel.impute.MostFrequentValueImputer.ev import org.scalatest.OptionValues import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class MostFrequentValueImputerTest extends AnyFlatSpec with Matchers with TestingUtils with OptionValues { "Most frequent value imputer" should "impute the categorical features" in { val xMissing = DenseMatrix( List(Float.NaN, 1.0f, 2.0f), List(3.0f, Float.NaN, 5.0f), List(6.0f, 7.0f, 8.0f), List(6.0f, 7.0f, 2.0f) ) val xImputedExpected = DenseMatrix( List(6.0f, 1.0f, 2.0f), List(3.0f, Float.NaN, 5.0f), List(6.0f, 7.0f, 8.0f), List(6.0f, 7.0f, 2.0f) ) val featureIndex = FeatureIndex.apply(List(CategoricalFeature, NumericalFeature, CategoricalFeature)) val imputer = MostFrequentValueImputer(featureIndex) val fittedImputer = ev.fit(imputer, xMissing) breezeEqual(fittedImputer.mostFrequent.value, DenseVector(6.0f, 2.0f)) shouldBe true breezeEqual(ev.transform(fittedImputer, xMissing), xImputedExpected) shouldBe true } it should "impute a subset of categorical features" in { val xMissing = DenseMatrix( List(Float.NaN, 1.0f, 2.0f), List(3.0f, Float.NaN, 5.0f), List(6.0f, 7.0f, 8.0f), List(6.0f, 7.0f, 2.0f) ) val xImputedExpected = DenseMatrix( List(Float.NaN, 1.0f, 2.0f), List(3.0f, 7.0f, 5.0f), List(6.0f, 7.0f, 8.0f), List(6.0f, 7.0f, 2.0f) ) val featureIndex = FeatureIndex.categorical(List(1, 2)) val imputer = MostFrequentValueImputer(featureIndex) val fittedImputer = ev.fit(imputer, xMissing) breezeEqual(fittedImputer.mostFrequent.value, DenseVector(7.0f, 2.0f)) shouldBe true breezeEqual(ev.transform(fittedImputer, xMissing), xImputedExpected) shouldBe true } }
Example 87
Source File: MeanValueImputerTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.impute import breeze.linalg.{DenseMatrix, DenseVector} import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, FeatureIndex, NumericalFeature} import io.picnicml.doddlemodel.impute.MeanValueImputer.ev import org.scalatest.OptionValues import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class MeanValueImputerTest extends AnyFlatSpec with Matchers with TestingUtils with OptionValues { "Mean value imputer" should "impute the numerical features" in { val xMissing = DenseMatrix( List(Float.NaN, 1.0f, 2.0f), List(3.0f, Float.NaN, 5.0f), List(6.0f, 7.0f, 8.0f) ) val xImputedExpected = DenseMatrix( List(4.5f, 1.0f, 2.0f), List(3.0f, Float.NaN, 5.0f), List(6.0f, 7.0f, 8.0f) ) val imputer = MeanValueImputer(FeatureIndex.apply(List(NumericalFeature, CategoricalFeature, NumericalFeature))) val fittedImputer = ev.fit(imputer, xMissing) breezeEqual(fittedImputer.means.value, DenseVector(4.5f, 5.0f)) shouldBe true breezeEqual(ev.transform(fittedImputer, xMissing), xImputedExpected) shouldBe true } it should "impute a subset of numerical features" in { val xMissing = DenseMatrix( List(Float.NaN, 1.0f, 2.0f), List(3.0f, Float.NaN, 5.0f), List(6.0f, 7.0f, 8.0f) ) val xImputedExpected = DenseMatrix( List(4.5f, 1.0f, 2.0f), List(3.0f, Float.NaN, 5.0f), List(6.0f, 7.0f, 8.0f) ) val imputer = MeanValueImputer(FeatureIndex.numerical(List(0, 2))) val fittedImputer = ev.fit(imputer, xMissing) breezeEqual(fittedImputer.means.value, DenseVector(4.5f, 5.0f)) shouldBe true breezeEqual(ev.transform(fittedImputer, xMissing), xImputedExpected) shouldBe true } }
Example 88
Source File: DatasetUtilsTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.data import breeze.linalg.DenseVector import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.DatasetUtils.{shuffleDataset, splitDataset, splitDatasetWithGroups} import org.scalactic.{Equality, TolerantNumerics} import scala.util.Random import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class DatasetUtilsTest extends AnyFlatSpec with Matchers with TestingUtils { implicit val rand: Random = new Random(0) implicit val tolerance: Equality[Float] = TolerantNumerics.tolerantFloatEquality(1.0f) val (x, y, _) = loadIrisDataset "Dataset utils" should "shuffle the dataset" in { val (_, yShuffled) = shuffleDataset(x, y) breezeEqual(y, yShuffled) shouldBe false } they should "split the dataset" in { val split = splitDataset(x, y) split.yTr.length shouldBe 75 split.yTe.length shouldBe 75 } they should "split the dataset with groups" in { val groups = DenseVector((0 until x.rows).map(x => x % 4):_*) val split = splitDatasetWithGroups(x, y, groups, proportionTrain = 0.8f) val groupsTe = split.groupsTe.toArray split.groupsTr.forall(trGroup => !groupsTe.contains(trGroup)) shouldBe true } }
Example 89
Source File: CsvLoaderTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.data import breeze.linalg.{DenseMatrix, DenseVector} import io.picnicml.doddlemodel.TestingUtils import io.picnicml.doddlemodel.data.Feature.{CategoricalFeature, NumericalFeature} import io.picnicml.doddlemodel.data.ResourceDatasetLoaders.loadDummyCsvReadingDataset import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class CsvLoaderTest extends AnyFlatSpec with Matchers with TestingUtils { "Csv loader" should "load and encode data" in { val (x, y, featureIndex) = loadDummyCsvReadingDataset val xCorrect = DenseMatrix( List(0.0f, 0.0f, 0.1f, 1.1f), List(1.0f, Float.NaN, 0.2f, 1.2f), List(2.0f, 1.0f, 0.3f, Float.NaN), List(3.0f, 2.0f, 0.4f, 1.4f), List(0.0f, 0.0f, 0.1f, 1.1f), List(3.0f, Float.NaN, 0.4f, 1.4f) ) val yCorrect = DenseVector(0.0f, 1.0f, 2.0f, 3.0f, 0.0f, 3.0f) breezeEqual(x, xCorrect) shouldBe true breezeEqual(y, yCorrect) shouldBe true featureIndex.names shouldBe IndexedSeq("f0", "f1", "f2", "f3") featureIndex.types shouldBe IndexedSeq( CategoricalFeature, CategoricalFeature, NumericalFeature, NumericalFeature ) featureIndex.columnIndices shouldBe (0 until 4) } }
Example 90
Source File: ClassificationMetricsTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.metrics

import breeze.linalg.DenseVector
import org.scalatest.flatspec.AnyFlatSpec
import org.scalatest.matchers.should.Matchers

class ClassificationMetricsTest extends AnyFlatSpec with Matchers {

  "Classification metrics" should "calculate the classification accuracy value" in {
    val y = DenseVector(1.0f, 0.0f, 0.0f, 0.0f, 1.0f)
    val yPred = DenseVector(1.0f, 1.0f, 0.0f, 1.0f, 0.0f)
    accuracy(y, yPred) shouldEqual 0.4f
  }

  they should "calculate the precision value" in {
    val y = DenseVector(1.0f, 0.0f, 0.0f, 0.0f, 1.0f)
    val yPred = DenseVector(1.0f, 1.0f, 0.0f, 1.0f, 0.0f)
    precision(y, yPred) shouldBe 0.3333333333333333f
  }

  they should "calculate the recall value" in {
    val y = DenseVector(1.0f, 0.0f, 0.0f, 0.0f, 1.0f)
    val yPred = DenseVector(1.0f, 1.0f, 0.0f, 1.0f, 0.0f)
    recall(y, yPred) shouldBe 0.5f
  }

  they should "calculate the F1 score value" in {
    val y = DenseVector(1.0f, 0.0f, 0.0f, 0.0f, 1.0f)
    val yPred = DenseVector(1.0f, 1.0f, 0.0f, 1.0f, 0.0f)
    f1Score(y, yPred) shouldBe 0.4f
  }

  they should "calculate the Hamming loss value" in {
    val y = DenseVector(1.0f, 0.0f, 0.0f, 0.0f, 1.0f)
    val yPred = DenseVector(1.0f, 1.0f, 0.0f, 1.0f, 0.0f)
    // 3 out of 5 misclassifications
    hammingLoss(y, yPred) shouldBe 0.6f
  }
}
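A small sketch of the accuracy computation the first test exercises, written directly against Breeze vectors; accuracy and the other metric names above are doddle-model functions, so the manual count below is only illustrative.

import breeze.linalg.DenseVector

object AccuracySketch {
  def main(args: Array[String]): Unit = {
    val y = DenseVector(1.0f, 0.0f, 0.0f, 0.0f, 1.0f)
    val yPred = DenseVector(1.0f, 1.0f, 0.0f, 1.0f, 0.0f)
    // fraction of positions where the prediction matches the label
    val correct = (0 until y.length).count(i => y(i) == yPred(i))
    println(correct.toFloat / y.length)  // 0.4
  }
}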
Example 91
Source File: RankingMetricsTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.metrics import breeze.linalg.DenseVector import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class RankingMetricsTest extends AnyFlatSpec with Matchers { "Ranking metrics" should "calculate the AUC value" in { val y = DenseVector(1.0f, 1.0f, 0.0f, 0.0f, 1.0f, 0.0f, 1.0f, 0.0f, 1.0f, 1.0f, 0.0f) val yPredProba = DenseVector( 0.6346f, 0.0742f, 0.4324f, 0.9911f, 0.7245f, 0.4751f, 0.5112f, 0.0311f, 0.7641f, 0.6612f, 0.0134f ) val aucScore = auc(y, yPredProba) aucScore shouldBe 0.733333333333333f +- 1e-15f } }
Example 92
Source File: RegressionMetricsTest.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.metrics import breeze.linalg.DenseVector import org.scalatest.flatspec.AnyFlatSpec import org.scalatest.matchers.should.Matchers class RegressionMetricsTest extends AnyFlatSpec with Matchers { "Regression metrics" should "calculate the rmse value" in { val y = DenseVector(1.0f, 4.1f, 2.2f, 5.1f, 9.6f) val yPred = DenseVector(1.2f, 1.4f, 8.2f, 3.1f, 9.6f) rmse(y, yPred) shouldEqual 3.076686529368892f } they should "calculate the mse value" in { val y = DenseVector(1.0f, 4.1f, 2.2f, 5.1f, 9.6f) val yPred = DenseVector(1.2f, 1.4f, 8.2f, 3.1f, 9.6f) mse(y, yPred) shouldEqual 9.466f +- 0.001f } they should "calculate the mae value" in { val y = DenseVector(1.0f, 4.1f, 2.2f, 5.1f, 9.6f) val yPred = DenseVector(1.2f, 1.9f, 2.8f, 4.1f, 10.6f) assert(mae(y, yPred) === 1.0f +- 0.0000001f) } they should "calculate the explained variance score" in { val y = DenseVector(1.0f, 4.1f, 2.2f, 5.1f, 9.6f) val yPred = DenseVector(2.2f, 2.9f, 0.0f, 6.1f, 10.8f) assert(explainedVariance(y, yPred) === 0.769195820081781f +- 0.0000001f) } }
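A brief sketch of the rmse and mse values checked above, computed with plain Breeze operations; the tests themselves rely on doddle-model's metric functions, so this is only a hand calculation.

import breeze.linalg.DenseVector

object RmseSketch {
  def main(args: Array[String]): Unit = {
    val y = DenseVector(1.0, 4.1, 2.2, 5.1, 9.6)
    val yPred = DenseVector(1.2, 1.4, 8.2, 3.1, 9.6)
    val residuals = y - yPred
    val mse = (residuals dot residuals) / y.length   // 9.466
    println(math.sqrt(mse))                          // ~3.0767, matching the expectation above
  }
}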
Example 93
Source File: LinearModel.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear.typeclasses import breeze.linalg.{DenseMatrix, DenseVector} import breeze.optimize.{DiffFunction, LBFGS} import io.picnicml.doddlemodel.data.{Features, RealVector, Target} import io.picnicml.doddlemodel.typeclasses.Predictor trait LinearModel[A] { this: Predictor[A] => protected[linear] def lossGradStateless(model: A, w: RealVector, x: Features, y: Target): RealVector override def isFitted(model: A): Boolean = w(model).isDefined override def predictSafe(model: A, x: Features): Target = predictStateless(model, w(model).get, xWithBiasTerm(x)) protected def maximumLikelihood(model: A, x: Features, y: Target, init: RealVector): RealVector = { val diffFunction = new DiffFunction[RealVector] { override def calculate(w: RealVector): (Double, RealVector) = (lossStateless(model, w, x, y).toDouble, lossGradStateless(model, w, x, y)) } val lbfgs = new LBFGS[DenseVector[Float]](tolerance = 1e-4) lbfgs.minimize(diffFunction, init) } protected def xWithBiasTerm(x: Features): Features = DenseMatrix.horzcat(DenseMatrix.ones[Float](x.rows, 1), x) }
Example 94
Source File: LinearClassifier.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.linear.typeclasses import breeze.linalg.DenseVector import io.picnicml.doddlemodel.data.{Features, RealVector, Simplex, Target} import io.picnicml.doddlemodel.syntax.OptionSyntax._ import io.picnicml.doddlemodel.typeclasses.Classifier trait LinearClassifier[A] extends LinearModel[A] with Classifier[A] { protected def predictProbaStateless(model: A, w: RealVector, x: Features): Simplex override protected def fitSafe(model: A, x: Features, y: Target): A = { val wLength = (x.cols + 1) * (numClasses(model).getOrBreak - 1) val wInitial = DenseVector.zeros[Float](wLength) copy(model, w = maximumLikelihood(model, xWithBiasTerm(x), y, wInitial)) } override protected def predictProbaSafe(model: A, x: Features): Simplex = predictProbaStateless(model, w(model).get, xWithBiasTerm(x)) }
Example 95
Source File: Binarizer.scala From doddle-model with Apache License 2.0 | 5 votes |
package io.picnicml.doddlemodel.preprocessing

import breeze.linalg.DenseVector
import io.picnicml.doddlemodel.data.Feature.FeatureIndex
import io.picnicml.doddlemodel.data.{Features, RealVector}
import io.picnicml.doddlemodel.typeclasses.Transformer

case class Binarizer(private val thresholds: RealVector, private val featureIndex: FeatureIndex) {
  private val numNumeric = featureIndex.numerical.columnIndices.length
  require(numNumeric == 0 || numNumeric == thresholds.length,
    "A threshold should be given for every numerical column")
}

  // Note: the declaration of the enclosing companion object is elided in this snippet;
  // apply and ev below are its members.
  def apply(threshold: Float, featureIndex: FeatureIndex): Binarizer = {
    val numNumeric: Int = featureIndex.numerical.columnIndices.length
    val thresholdsExtended = DenseVector.fill(numNumeric) { threshold }
    Binarizer(thresholdsExtended, featureIndex)
  }

  @SerialVersionUID(0L)
  implicit lazy val ev: Transformer[Binarizer] = new Transformer[Binarizer] {

    override def isFitted(model: Binarizer): Boolean = true

    override def fit(model: Binarizer, x: Features): Binarizer = model

    override protected def transformSafe(model: Binarizer, x: Features): Features = {
      val xCopy = x.copy
      model.featureIndex.numerical.columnIndices.zipWithIndex.foreach { case (colIndex, thresholdIndex) =>
        (0 until xCopy.rows).foreach { rowIndex =>
          xCopy(rowIndex, colIndex) =
            if (xCopy(rowIndex, colIndex) > model.thresholds(thresholdIndex)) 1.0f else 0.0f
        }
      }
      xCopy
    }
  }
}
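A minimal sketch of the thresholding loop transformSafe performs, applied to a single column of a Breeze matrix; the column index, threshold and object name below are made-up values, not part of doddle-model.

import breeze.linalg.DenseMatrix

object ThresholdSketch {
  def main(args: Array[String]): Unit = {
    val x = DenseMatrix(
      (0.0f, 1.0f),
      (0.3f, -1.0f),
      (-0.3f, 2.0f)
    )
    val threshold = 0.0f
    val binarized = x.copy
    // binarize column 0 in place: values strictly above the threshold become 1, the rest 0
    (0 until binarized.rows).foreach { rowIndex =>
      binarized(rowIndex, 0) = if (binarized(rowIndex, 0) > threshold) 1.0f else 0.0f
    }
    println(binarized)
  }
}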
Example 96
Source File: GibbsSample.scala From glintlda with MIT License | 5 votes |
package glintlda

import breeze.linalg.{DenseVector, SparseVector, sum}
import glintlda.util.FastRNG

// Minimal stand-in for the GibbsSample class defined elsewhere in glintlda:
// it holds the flattened feature (word) indices and their sampled topic assignments.
class GibbsSample(val features: Array[Int], val topics: Array[Int])

object GibbsSample {

  def apply(sv: SparseVector[Int], random: FastRNG, topics: Int): GibbsSample = {
    val totalTokens = sum(sv)
    val sample = new GibbsSample(new Array[Int](totalTokens), new Array[Int](totalTokens))
    var i = 0
    var current = 0
    while (i < sv.activeSize) {
      val index = sv.indexAt(i)
      var value = sv.valueAt(i)
      while (value > 0) {
        sample.features(current) = index
        sample.topics(current) = random.nextPositiveInt() % topics
        current += 1
        value -= 1
      }
      i += 1
    }
    sample
  }
}
Example 97
Source File: Sampler.scala From glintlda with MIT License | 5 votes |
package glintlda.naive

import breeze.linalg.DenseVector
import breeze.stats.distributions.Multinomial
import glintlda.LDAConfig
import glintlda.util.FastRNG

// Minimal enclosing class for this excerpt: the topic-count vectors and the Dirichlet priors
// (α, β, βSum) are assumed here as members that the surrounding glintlda code keeps up to date.
class Sampler(config: LDAConfig, random: FastRNG, α: Double, β: Double, βSum: Double) {

  var documentCounts: DenseVector[Double] = DenseVector.zeros[Double](config.topics)
  var wordCounts: DenseVector[Double] = DenseVector.zeros[Double](config.topics)
  var globalCounts: DenseVector[Double] = DenseVector.zeros[Double](config.topics)

  def sampleFeature(feature: Int, oldTopic: Int): Int = {
    var i = 0
    val p = DenseVector.zeros[Double](config.topics)
    var sum = 0.0
    while (i < config.topics) {
      p(i) = (documentCounts(i) + α) * ((wordCounts(i) + β) / (globalCounts(i) + βSum))
      sum += p(i)
      i += 1
    }
    p /= sum
    Multinomial(p).draw()
  }
}
Example 98
Source File: SparkHdfsLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 99
Source File: LocalLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
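All of the logistic-regression examples on this page compute the same per-point gradient inside their map/reduce; a tiny local sketch of one update step using only breeze (toy data, illustrative only):

import breeze.linalg.DenseVector
import scala.math.exp

object LrGradientSketch extends App {
  val points = Seq(
    (DenseVector(1.0, 2.0), 1.0),     // (features, label)
    (DenseVector(-1.0, -1.5), -1.0)
  )
  var w = DenseVector.zeros[Double](2)

  // One gradient step of the logistic loss, exactly as in the map/reduce above.
  val gradient = points.map { case (x, y) =>
    x * ((1.0 / (1.0 + exp(-y * (w dot x))) - 1.0) * y)
  }.reduce(_ + _)
  w -= gradient
  println(s"w after one step: $w")
}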
Example 100
Source File: SparkKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{squaredDistance, DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkKMeans") .getOrCreate() val lines = spark.read.textFile(args(0)).rdd val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) spark.stop() } } // scalastyle:on println
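closestPoint above is a plain nearest-center lookup under squared Euclidean distance; the same logic can be exercised locally with breeze's squaredDistance (made-up centers and point):

import breeze.linalg.{DenseVector, Vector, squaredDistance}

object ClosestCenterSketch extends App {
  val centers: Array[Vector[Double]] = Array(DenseVector(0.0, 0.0), DenseVector(5.0, 5.0))
  val point: Vector[Double] = DenseVector(4.0, 4.5)

  // Index of the nearest center under squared Euclidean distance.
  val best = centers.indices.minBy(i => squaredDistance(point, centers(i)))
  println(s"closest center index: $best")   // 1 for this data
}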
Example 101
Source File: LocalFileLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 102
Source File: SparkLR.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 103
Source File: LocalKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 104
Source File: SphericalHarmonicsSolver.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.deluminate import breeze.linalg.{DenseMatrix, DenseVector} import scalismo.color.RGB import scalismo.faces.numerics.SphericalHarmonics import scalismo.geometry.{EuclideanVector, _3D} def solveSHSystemDeconvolve(points: IndexedSeq[IlluminatedPoint], kernel: IndexedSeq[Double]): IndexedSeq[EuclideanVector[_3D]] = { require(points.nonEmpty) // direct access data val radiances = points.map(_.radiance) val normals = points.map(_.normal) val albedi = points.map(_.albedo) // prepare SH basis val nSH = kernel.length val shBasis = IndexedSeq.tabulate(nSH)(i => SphericalHarmonics.shBasisFunction(i)) // build target vector on rhs: b (3*#points x 1), vectorize all colors to r, g, b val b = DenseVector(radiances.toArray.flatMap(r => r.toVector.toArray)) // build matrix: (3*#points) x (3*#lightCoefficients) def matrixBuilder(i: Int, j: Int): Double = { // major indices: point, light coefficient val pointIndex = i / 3 val shCoeffIndex = j / 3 // minor indices: color index R, G, B val pointColorIndex = i % 3 val shColorIndex = j % 3 // matrix element: albedo[point, color] * Y[shCoeff](normal[point]) * kernel(shCoeff) * delta(pointColor, shColor) if (pointColorIndex == shColorIndex) albedi(pointIndex).toVector.toArray(pointColorIndex) * shBasis(shCoeffIndex)(normals(pointIndex)) * kernel(shCoeffIndex) else 0.0 } val A: DenseMatrix[Double] = DenseMatrix.tabulate(3 * points.length, 3 * nSH)(matrixBuilder) // solve linear system val lightField: DenseVector[Double] = A \ b // extract channeled coefficients val shCoeffs: IndexedSeq[EuclideanVector[_3D]] = lightField.toArray.grouped(3).map(a => EuclideanVector[_3D](a)).toIndexedSeq // finished shCoeffs } }
Example 105
Source File: ColorTransform.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.render import breeze.linalg.{DenseMatrix, DenseVector, inv} import scalismo.color.RGB import scalismo.color.ColorSpaceOperations.implicits._ import scalismo.geometry.{SquareMatrix, _3D} def invert: ColorTransform = new ColorTransform { private val c = colorContrast private val A = DenseMatrix( (c * (1 - 0.3) + 0.3, 0.59 - 0.59 * c, 0.11 - 0.11 * c), (0.3 - 0.3 * c, c * (1 - 0.59) + 0.59, 0.11 - 0.11 * c), (0.3 - 0.3 * c, 0.59 - 0.59 * c, c * (1 - 0.11) + 0.11)) private val Ainv = inv(A) override def apply(color: RGB): RGB = { val mixed = (color - offset) / gain val b: DenseVector[Double] = Ainv * DenseVector[Double](mixed.r, mixed.g, mixed.b) RGB(b(0), b(1), b(2)) } def invert: ColorTransformWithColorContrast = self } }
Example 106
Source File: MoMoRenderer.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.sampling.face import breeze.linalg.DenseVector import scalismo.color.RGBA import scalismo.faces.image.PixelImage import scalismo.faces.landmarks.TLMSLandmark2D import scalismo.mesh.VertexColorMesh3D import scalismo.faces.momo.{MoMo, MoMoCoefficients} import scalismo.faces.parameters.{MoMoInstance, ParametricRenderer, RenderParameter} import scalismo.geometry.Point import scalismo.mesh.MeshSurfaceProperty import scalismo.utils.Memoize def cached(cacheSize: Int) = new MoMoRenderer(model, clearColor) { private val imageRenderer = Memoize(super.renderImage, cacheSize) private val meshRenderer = Memoize(super.renderMesh, cacheSize) private val maskRenderer = Memoize((super.renderMask _).tupled, cacheSize) private val lmRenderer = Memoize((super.renderLandmark _).tupled, cacheSize * allLandmarkIds.length) private val instancer = Memoize(super.instanceFromCoefficients _, cacheSize) override def renderImage(parameters: RenderParameter): PixelImage[RGBA] = imageRenderer(parameters) override def renderLandmark(lmId: String, parameter: RenderParameter): Option[TLMSLandmark2D] = lmRenderer((lmId, parameter)) override def renderMesh(parameters: RenderParameter): VertexColorMesh3D = meshRenderer(parameters) override def instance(parameters: RenderParameter): VertexColorMesh3D = instancer(parameters.momo) override def renderMask(parameters: RenderParameter, mask: MeshSurfaceProperty[Int]): PixelImage[Int] = maskRenderer((parameters, mask)) } } object MoMoRenderer { def apply(model: MoMo, clearColor: RGBA) = new MoMoRenderer(model, clearColor) def apply(model: MoMo) = new MoMoRenderer(model, RGBA.BlackTransparent) }
Example 107
Source File: DenseCholesky.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.numerics

import breeze.linalg.{DenseMatrix, DenseVector}

// Enclosing object (elided in the original excerpt).
object DenseCholesky {

  def substitutionSolver(choleskyFactor: DenseMatrix[Double], b: DenseVector[Double]): DenseVector[Double] = {
    require(choleskyFactor.rows == b.length, "dimensions disagree")
    require(choleskyFactor.rows == choleskyFactor.cols, "L must be square")
    val L = choleskyFactor
    val n = L.rows
    // solve L Lt x = b
    // 1) solve Ly = b
    val y = DenseVector.zeros[Double](n)
    // for each row substitute
    var row = 0
    while (row < n) {
      // previous elements
      val sum = L(row, 0 until row) * y(0 until row)
      // divide by diagonal element
      y(row) = (b(row) - sum) / L(row, row)
      row += 1
    }
    // 2) solve Lt x = y
    val x = DenseVector.zeros[Double](n)
    row = n - 1
    // bwd substitution from right to left
    while (row >= 0) {
      // previous elements
      val sum = L.t(row, row + 1 until n) * x(row + 1 until n)
      // divide by diagonal element
      x(row) = (y(row) - sum) / L(row, row)
      row -= 1
    }
    // x contains result
    x
  }
}
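As a quick usage check, one can factor a small symmetric positive-definite matrix with breeze's cholesky and feed the lower-triangular factor to the solver above (assuming the DenseCholesky object as shown here is on the classpath; the matrix and vector below are made up for illustration):

import breeze.linalg.{DenseMatrix, DenseVector, cholesky, norm}
import scalismo.faces.numerics.DenseCholesky

object CholeskySolveSketch extends App {
  // A small symmetric positive-definite system A x = b.
  val A = DenseMatrix((4.0, 1.0, 0.0), (1.0, 3.0, 1.0), (0.0, 1.0, 2.0))
  val b = DenseVector(1.0, 2.0, 3.0)

  val L = cholesky(A)                              // lower-triangular factor, A = L * L.t
  val x = DenseCholesky.substitutionSolver(L, b)   // forward/backward substitution from above
  println(norm(A * x - b))                         // should be close to 0
}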
Example 108
Source File: SphericalHarmonicsLightTests.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.parameters import breeze.linalg.DenseVector import scalismo.faces.FacesTestSuite import scalismo.color.RGB import scalismo.geometry.{EuclideanVector, EuclideanVector3D, _3D} class SphericalHarmonicsLightTests extends FacesTestSuite { describe("SphericalHarmonicsLight parameters") { it("can be constructed with specific number of bands") { val sh5 = SphericalHarmonicsLight.zero(5) sh5.bands shouldBe 5 sh5.coefficients.length shouldBe SphericalHarmonicsLight.coefficientsInBands(5) } it("can be rescaled to more components") { val sh = SphericalHarmonicsLight.frontal assert(sh.bands == 1) val band2 = sh.withNumberOfBands(2) band2.coefficients.length shouldBe 9 band2.bands shouldBe 2 band2.coefficients.take(4) shouldBe sh.coefficients band2.coefficients.drop(4) shouldBe IndexedSeq.fill(5)(EuclideanVector3D.zero) } it("can be rescaled to fewer components") { val sh = SphericalHarmonicsLight.frontal assert(sh.bands == 1) val band0 = sh.withNumberOfBands(0) band0.coefficients.length shouldBe 1 band0.bands shouldBe 0 band0.coefficients(0) shouldBe sh.coefficients(0) } it("SH to DenseVector and from DenseVector"){ val sh = SphericalHarmonicsLight.frontal val shB = sh.toBreezeVector val shN = SphericalHarmonicsLight.fromBreezeVector(shB) sh shouldBe shN val bV = DenseVector(Array.fill(27)(rnd.scalaRandom.nextGaussian())) val shL = SphericalHarmonicsLight.fromBreezeVector(bV) val nV = shL.toBreezeVector bV.toArray should contain theSameElementsInOrderAs nV.toArray } it("avoids building of invalid coefficients") { val wrongLength = IndexedSeq.fill(2)(EuclideanVector3D.zero) an [IllegalArgumentException] should be thrownBy SphericalHarmonicsLight(wrongLength) } describe("To recover a principal direction of illumination, SphericalHarmonicsLight") { it("extracts consistently the direction from randomly generated directed spherical harmonics.") { def testDirectionFromSH(eps: Double, repeat: Int): Boolean = { def genDirLight(v: EuclideanVector[_3D]) = SphericalHarmonicsLight.fromAmbientDiffuse(RGB(rnd.scalaRandom.nextDouble(), rnd.scalaRandom.nextDouble(), rnd.scalaRandom.nextDouble()), RGB(rnd.scalaRandom.nextDouble(), rnd.scalaRandom.nextDouble(), rnd.scalaRandom.nextDouble()), v) def randDirection = EuclideanVector.fromSpherical(1.0, rnd.scalaRandom.nextDouble() * math.Pi, rnd.scalaRandom.nextDouble() * math.Pi * 2.0)//non-uniform on the sphere! Does not matter for the test. (0 until repeat).forall { _ => val v = randDirection.normalize val rec = SphericalHarmonicsLight.directionFromSHLightIntensity(genDirLight(v)) if(rec.isDefined) { // if we find a direction val d = (v - rec.get).norm d < eps }else // if no direction is found. Should always find a direction in this test. false } } testDirectionFromSH(1e-14, 50) shouldBe true } it("gives sensible results for undirected SHL.") { val rec = SphericalHarmonicsLight.directionFromSHLightIntensity(SphericalHarmonicsLight.ambientWhite) rec.isDefined shouldBe false } } } }
Example 109
Source File: ConjugateGradientTests.scala From scalismo-faces with Apache License 2.0 | 5 votes |
package scalismo.faces.numerics

import breeze.linalg.{CSCMatrix, DenseVector, norm}
import scalismo.faces.FacesTestSuite

class ConjugateGradientTests extends FacesTestSuite {

  val n = 50
  val tol = 1e-10

  val A = randomDiagDomMatrix(n, 10)
  val Ad = A.toDense
  val b = DenseVector.rand[Double](n)

  def randomDiagDomMatrix(n: Int, elementsPerColumn: Int): CSCMatrix[Double] = {
    val builder = new CSCMatrix.Builder[Double](n, n)
    // generate random symmetric off-diagonal entries
    for (col <- 0 until n; j <- 0 until elementsPerColumn) {
      val e = rnd.scalaRandom.nextDouble()
      val row = rnd.scalaRandom.nextInt(n)
      builder.add(row, col, e)
      builder.add(col, row, e)
    }
    // diagonally dominant
    for (i <- 0 until n) {
      builder.add(i, i, 2 * elementsPerColumn)
    }
    builder.result()
  }

  describe("ConjugateGradient solver") {
    it("can solve a random sparse linear system to high accuracy") {
      val x = ConjugateGradient.solveSparse(A, b, tol / 10)
      norm(b - A * x) should be < tol
    }
  }

  describe("PreconditionedConjugateGradient solver") {
    it("can solve a random sparse linear system to high accuracy with the incomplete Cholesky preconditioner") {
      val M = PreconditionedConjugateGradient.incompleteCholeskyPreconditioner(A)
      val x = PreconditionedConjugateGradient.solveSparse(A, b, M, tol / 10)
      norm(b - A * x) should be < tol
    }
  }
}
Example 110
Source File: SparkKMeans.scala From AI with Apache License 2.0 | 5 votes |
// scalastyle:off println
package com.bigchange.basic

import breeze.linalg.{Vector, DenseVector, squaredDistance}
import org.apache.spark.{SparkConf, SparkContext}

object SparkKMeans {

  def parseVector(line: String): Vector[Double] = {
    DenseVector(line.split(' ').map(_.toDouble))
  }

  def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- centers.indices) {
      // squared distance to find the closest center
      val tempDist = squaredDistance(p, centers(i))
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    if (args.length < 3) {
      System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>")
      System.exit(1)
    }

    showWarning()

    val sparkConf = new SparkConf().setAppName("SparkKMeans").setMaster("local")
    val sc = new SparkContext(sparkConf)

    val lines = sc.textFile(args(0))
    val data = lines.map(parseVector).cache()
    // initial K value
    val K = args(1).toInt
    val convergeDist = args(2).toDouble

    val kPoints = data.takeSample(withReplacement = false, K, 42)
    var tempDist = 1.0

    while(tempDist > convergeDist) {
      val closest = data.map (p => (closestPoint(p, kPoints), (p, 1)))

      val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)}

      val newPoints = pointStats.map {pair =>
        (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap()

      tempDist = 0.0

      for (i <- 0 until K) {
        tempDist += squaredDistance(kPoints(i), newPoints(i))
      }

      for (newP <- newPoints) {
        kPoints(newP._1) = newP._2
      }
      println("Finished iteration (delta = " + tempDist + ")")
    }

    println("Final centers:")
    kPoints.foreach(println)
    sc.stop()
  }
}
// scalastyle:on println
Example 111
Source File: StreamingModelProducer.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.streaming

import java.io.PrintWriter
import java.net.ServerSocket

import breeze.linalg.DenseVector

import scala.util.Random

object StreamingModelProducer {

  def main(args: Array[String]) {

    val maxEvent = 100
    val numFeatures = 100
    val random = new Random()

    // generate an array whose entries follow a standard normal distribution
    def generateRandomArray(n: Int) = Array.tabulate(n)(_ => random.nextGaussian())

    // a fixed, randomly generated model weight vector
    val w = new DenseVector(generateRandomArray(numFeatures))
    val intercept = random.nextGaussian() * 10

    // generate a batch of random data events
    def generateNoisyData(n:Int) = {
      (1 to n).map { i =>
        val x = new DenseVector(generateRandomArray(numFeatures)) // random feature vector
        val y = w.dot(x)
        val noisy = y + intercept // target value
        (noisy, x)
      }
    }

    // create the socket server that streams the generated events
    val listener = new ServerSocket(9999)
    println("listener port:" + listener.getLocalPort)

    while(true) {
      val socket = listener.accept()
      new Thread() {
        override def run() = {
          println("get client from:" + socket.getInetAddress)
          val out = new PrintWriter(socket.getOutputStream, true)
          while (true) {
            Thread.sleep(1000)
            val num = random.nextInt(maxEvent)
            val productEvents = generateNoisyData(num)
            productEvents.foreach { case(y, x) =>
              out.write(y + "\t" + x.data.mkString(","))
              out.write("\n")
            }
            out.flush()
            println(s"created $num events")
          }
          socket.close()
        }
      }.start()
    }
  }
}
Example 112
Source File: StreamingSimpleModel.scala From AI with Apache License 2.0 | 5 votes |
package com.bigchange.streaming

import breeze.linalg.DenseVector
import org.apache.spark.mllib.linalg.Vectors
import org.apache.spark.mllib.regression.{LabeledPoint, StreamingLinearRegressionWithSGD}
import org.apache.spark.streaming.{Seconds, StreamingContext}

object StreamingSimpleModel {

  def main(args: Array[String]) {

    val ssc = new StreamingContext("local","test",Seconds(10))
    val stream = ssc.socketTextStream("localhost",9999)
    val numberFeatures = 100
    val zeroVector = DenseVector.zeros[Double](numberFeatures)
    val model = new StreamingLinearRegressionWithSGD()
      .setInitialWeights(Vectors.dense(zeroVector.data))
      .setNumIterations(1)
      .setStepSize(0.01)

    val labeledStream = stream.map { event =>
      val split = event.split("\t")
      val y = split(0).toDouble
      val features = split(1).split(",").map(_.toDouble)
      LabeledPoint(label = y, features = Vectors.dense(features))
    }

    model.trainOn(labeledStream)

    // use the DStream transform operator to score each batch with the latest model
    val predictAndTrue = labeledStream.transform { rdd =>
      val latestModel = model.latestModel()
      rdd.map { point =>
        val predict = latestModel.predict(point.features)
        predict - point.label
      }
    }

    // compute the MSE and RMSE for each batch
    predictAndTrue.foreachRDD { rdd =>
      val mse = rdd.map(x => x * x).mean()
      val rmse = math.sqrt(mse)
      println(s"current batch, MSE: $mse, RMSE:$rmse")
    }

    ssc.start()
    ssc.awaitTermination()
  }
}
Example 113
Source File: MathUtil.scala From dbpedia-spotlight-model with Apache License 2.0 | 5 votes |
package org.dbpedia.spotlight.util import breeze.linalg.{DenseVector, Transpose} import breeze.numerics.sqrt import org.apache.commons.math.util.FastMath object MathUtil { val LOGZERO = Double.NegativeInfinity def isLogZero(x: Double): Boolean = x.isNegInfinity def exp(x: Double): Double = { if (x.isNegInfinity) 0.0 else FastMath.exp(x) } def ln(x: Double): Double = { if(x == 0.0) LOGZERO else FastMath.log(x) } def lnsum(a: Double, b: Double): Double = { if(a.isNegInfinity || b.isNegInfinity) { if(a.isNegInfinity) b else a } else { if(a > b) a + ln(1 + FastMath.exp(b-a)) else b + ln(1 + FastMath.exp(a-b)) } } def lnsum(seq: TraversableOnce[Double]): Double = { seq.foldLeft(MathUtil.ln(0.0))(MathUtil.lnsum) } def lnproduct(seq: TraversableOnce[Double]): Double = { seq.foldLeft(MathUtil.ln(1.0))(MathUtil.lnproduct) } def lnproduct(a: Double, b: Double): Double = { if (a.isNegInfinity || b.isNegInfinity) LOGZERO else a + b } def magnitude(vector: Transpose[DenseVector[Double]]): Double = { sqrt(vector * vector.t) } def magnitude(vector: Transpose[DenseVector[Float]]): Float = { sqrt(vector * vector.t) } def cosineSimilarity(vector1: Transpose[DenseVector[Double]], vector2: Transpose[DenseVector[Double]]): Double = { (vector1 * vector2.t) / (magnitude(vector1) * magnitude(vector2)) } def cosineSimilarity(vector1: Transpose[DenseVector[Float]], vector2: Transpose[DenseVector[Float]]): Float = { (vector1 * vector2.t) / (magnitude(vector1) * magnitude(vector2)) } }
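MathUtil's magnitude and cosineSimilarity operate on transposed (row) vectors, exploiting the fact that in breeze a Transpose[DenseVector] times a DenseVector collapses to a scalar. A small standalone check of that identity (values chosen for illustration):

import breeze.linalg.DenseVector
import breeze.numerics.sqrt

object CosineSketch extends App {
  val a = DenseVector(1.0, 1.0, 0.0).t   // row vector, as used by MathUtil above
  val b = DenseVector(0.0, 1.0, 1.0).t

  val cos = (a * b.t) / (sqrt(a * a.t) * sqrt(b * b.t))
  println(cos)   // 0.5 for these two vectors
}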
Example 114
Source File: MathUtilTest.scala From dbpedia-spotlight-model with Apache License 2.0 | 5 votes |
package org.dbpedia.spotlight.util import breeze.linalg.DenseVector import org.dbpedia.spotlight.util.MathUtil.{cosineSimilarity, magnitude} import org.junit.Test class MathUtilTest { @Test def testMagnitudeDouble { val doubleExamples = Map( DenseVector.zeros[Double](5).t -> 0.0, DenseVector.ones[Double](5).t -> 2.23606797749979, DenseVector.ones[Double](10).t -> 3.1622776601683795 ) doubleExamples.keys.foreach( vector => { val m = magnitude(vector) printf("%-30s=%30s \n",doubleExamples(vector),m) assert(m.equals(doubleExamples(vector))) }) } @Test def testMagnitudeFloat{ val floatExamples = Map( DenseVector.zeros[Float](5).t -> 0.0.toFloat, DenseVector.ones[Float](5).t -> 2.23606797749979.toFloat, DenseVector.ones[Float](10).t -> 3.1622776601683795.toFloat ) floatExamples.keys.foreach( vector => { val m = magnitude(vector) printf("%-30s=%30s \n",floatExamples(vector),m) assert(m.equals(floatExamples(vector))) }) } @Test def testCosineSimilarityDouble{ val epsilon = 0.0001 val doubleExamples = Map( (DenseVector.ones[Double](5).t, DenseVector.ones[Double](5).t) -> 1.0, (DenseVector(1.0, 0.0, 0.0).t, DenseVector(0.0, 1.0, 0.0).t) -> 0.0, (DenseVector(1.0, 1.0, 0.0).t, DenseVector(0.0, 1.0, 1.0).t) -> 0.5, (DenseVector(1.0, 1.0, 0.0, 0.0, 0.0, 0.0).t, DenseVector(0.0, 1.0, 1.0, 0.0, 0.0, 0.0).t) -> 0.5 ) doubleExamples.keys.foreach( vectors => { val sim = cosineSimilarity(vectors._1, vectors._2) printf("%-30s=%30s (+/-(%s)) \n",doubleExamples(vectors), sim, epsilon) assert((sim - doubleExamples(vectors)) < epsilon) }) } @Test def testCosineSimilarityFloat{ val epsilon = 0.0001 val doubleExamples = Map( (DenseVector.ones[Float](5).t, DenseVector.ones[Float](5).t) -> 1.0.toFloat ) doubleExamples.keys.foreach( vectors => { val sim = cosineSimilarity(vectors._1, vectors._2) printf("%-30s=%30s (+/-(%s)) \n",doubleExamples(vectors), sim, epsilon) assert((sim - doubleExamples(vectors)) < epsilon) }) } @Test(expected = classOf[IllegalArgumentException]) def testCosineSimilarityThrowsOnWrongDimensions{ printf("Testing that cosine similarity fails on dimension mismatch..") val epsilon = 0.0001 val doubleExamples = Map( (DenseVector.ones[Float](6).t, DenseVector.ones[Float](5).t) -> 1.0.toFloat ) doubleExamples.keys.foreach( vectors => { val sim = cosineSimilarity(vectors._1, vectors._2) printf("%-30s=%30s (+/-(%s)) \n",doubleExamples(vectors), sim, epsilon) assert((sim - doubleExamples(vectors)) < epsilon) }) } }
Example 115
Source File: WeightedLabeledPoint.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.math.stats.regression

import breeze.linalg.DenseVector
import org.apache.spark.SparkContext
import org.apache.spark.mllib.random.RandomRDDs
import org.apache.spark.rdd.RDD

case class WeightedLabeledPoint(label: Double, weight: Double, features: DenseVector[Double])

// Companion object holding the sample-data generator used by the regression tests.
object WeightedLabeledPoint {

  def generateSampleData(sc: SparkContext, weights: DenseVector[Double], intercept: Double,
    numRows: Long = 100L, numPartitions: Int = 4, errorScalar: Double = 1.0,
    seed: Long = 1L): RDD[WeightedLabeledPoint] = {
    val len = weights.length + 2
    // The last entry will serve as the weight of the point and the second-to-last entry will
    // serve as noise on the label.
    val data = RandomRDDs.normalVectorRDD(sc, numRows, len, numPartitions, seed)
    data.map { d =>
      val fw = d.toArray
      val x = new DenseVector(fw.dropRight(2))
      WeightedLabeledPoint(
        weights.dot(x) + intercept + errorScalar * fw(len - 2),
        Math.abs(fw(len - 1)) + 0.5,
        x
      )
    }
  }
}
Example 116
Source File: OLSMultipleLinearRegression.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.math.stats.regression import org.apache.spark.rdd.RDD import breeze.linalg.{ DenseMatrix, DenseVector } object OLSMultipleLinearRegression { def regression(input: RDD[WeightedLabeledPoint], intercept: Boolean = true): LinearRegressionModel = { // Try to get the number of columns val nCols = if (intercept) { input.first.features.length + 1 } else { input.first.features.length } val (xx, xy, swx, srwsl, ssrw, wsl, sw, n, lw) = input.treeAggregate(( new DenseMatrix[Double](nCols, nCols), // 1. Calculate a k-by-k matrix X^TX. new DenseVector[Double](nCols), // 2. Calculate a k-dimension vector X^Ty. new DenseVector[Double](nCols), // 3. Calculate a k-dimension vector of weighted sum of X. 0.0, // 4. Calculate the square root weighted sum of labels. 0.0, // 5. Calculate the sum of square root of weights. 0.0, // 6. Calculate the weighted sum of labels. 0.0, // 7. Calculate the sum of weights. 0: Long, // 8. Calculate the length of input. 0.0 // 9. Calculate sum of log weights ))( // U is a pair of matrix and vector and v is a WeightedLabeledPoint. seqOp = (U, v) => { // Append 1.0 at the head for calculating intercept. val x = if (intercept) { DenseVector.vertcat(DenseVector(1.0), v.features) } else { v.features } val wx = x * v.weight val sqrtW = Math sqrt v.weight // Unfortunately, breeze.linalg.DenseVector does not support tensor product. (U._1 += wx.asDenseMatrix.t * x.asDenseMatrix, U._2 += wx * v.label, U._3 += wx, U._4 + v.label * sqrtW, U._5 + sqrtW, U._6 + v.label * v.weight, U._7 + v.weight, U._8 + 1, U._9 + math.log(v.weight)) }, combOp = (U1, U2) => ( U1._1 += U2._1, U1._2 += U2._2, U1._3 += U2._3, U1._4 + U2._4, U1._5 + U2._5, U1._6 + U2._6, U1._7 + U2._7, U1._8 + U2._8, U1._9 + U2._9 ) ) LinearRegressionModel(input, intercept, n, (xx + xx.t) :/ 2.0, xy, swx, srwsl, ssrw, wsl, sw, lw) } }
Example 117
Source File: RegressionSummarizer.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd.function.summarize.summarizer import breeze.linalg.{ DenseVector, DenseMatrix } case class RegressionRow( time: Long, x: Array[Double], y: Double, weight: Double ) object RegressionSummarizer { protected[summarizer] def computeAkaikeIC( beta: DenseVector[Double], logLikelihood: Double, shouldIntercept: Boolean ): Double = { val k = beta.length -2.0 * logLikelihood + 2.0 * k } protected[summarizer] def computeResidualSumOfSquares( beta: DenseVector[Double], sumOfYSquared: Double, vectorOfXY: DenseVector[Double], matrixOfXX: DenseMatrix[Double] ): Double = { val k = beta.length require(matrixOfXX.rows == matrixOfXX.cols && vectorOfXY.length == k && matrixOfXX.cols == k) var residualSumOfSquares = sumOfYSquared var i = 0 while (i < beta.length) { var rss = -2.0 * vectorOfXY(i) rss += beta(i) * matrixOfXX(i, i) var j = 0 while (j < i) { rss += 2.0 * beta(j) * matrixOfXX(i, j) j = j + 1 } residualSumOfSquares += rss * beta(i) i = i + 1 } residualSumOfSquares } protected[summarizer] def computeRSquared( sumOfYSquared: Double, sumOfWeights: Double, sumOfY: Double, residualSumOfSquares: Double, shouldIntercept: Boolean ): Double = if (sumOfYSquared == 0.0 || sumOfWeights == 0.0) { Double.NaN } else { val meanOfY = sumOfY / sumOfWeights var varianceOfY = sumOfYSquared / sumOfWeights if (shouldIntercept) { varianceOfY -= meanOfY * meanOfY } (varianceOfY - residualSumOfSquares / sumOfWeights) / varianceOfY } }
Example 118
Source File: RegressionSummarizerSpec.scala From flint with Apache License 2.0 | 5 votes |
package com.twosigma.flint.rdd.function.summarize.summarizer import breeze.linalg.{ DenseVector, DenseMatrix } import org.scalatest.FlatSpec class RegressionSummarizerSpec extends FlatSpec { "RegressionSummarizer" should "transform from RegressRow correctly" in { val x: Array[RegressionRow] = Array( RegressionRow(time = 0L, x = Array(1d, 2d), y = 3d, weight = 4d), RegressionRow(time = 0L, x = Array(4d, 5d), y = 6d, weight = 16d) ) val (response1, predictor1, yw1) = RegressionSummarizer.transform(x, shouldIntercept = true, isWeighted = true) assert(response1.equals(DenseMatrix(Array(2d, 2d, 4d), Array(4d, 16d, 20d)))) assert(predictor1.equals(DenseVector(Array(6d, 24d)))) assert(yw1.deep == Array((3d, 4d), (6d, 16d)).deep) val (response2, predictor2, yw2) = RegressionSummarizer.transform(x, shouldIntercept = true, isWeighted = false) assert(response2.equals(DenseMatrix(Array(1d, 1d, 2d), Array(1d, 4d, 5d)))) assert(predictor2.equals(DenseVector(Array(3d, 6d)))) assert(yw2.deep == Array((3d, 1d), (6d, 1d)).deep) val (response3, predictor3, yw3) = RegressionSummarizer.transform(x, shouldIntercept = false, isWeighted = true) assert(response3.equals(DenseMatrix(Array(2d, 4d), Array(16d, 20d)))) assert(predictor3.equals(DenseVector(Array(6d, 24d)))) assert(yw3.deep == Array((3d, 4d), (6d, 16d)).deep) val (response4, predictor4, yw4) = RegressionSummarizer.transform(x, shouldIntercept = false, isWeighted = false) assert(response4.equals(DenseMatrix(Array(1d, 2d), Array(4d, 5d)))) assert(predictor4.equals(DenseVector(Array(3d, 6d)))) assert(yw4.deep == Array((3d, 1d), (6d, 1d)).deep) } }
Example 119
Source File: GPModelTest.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl import breeze.linalg.{DenseMatrix, DenseVector} import breeze.numerics.{cos, pow} import com.tencent.angel.spark.automl.tuner.kernel.Matern5Iso import com.tencent.angel.spark.automl.tuner.model.GPModel import org.scalatest.FunSuite class GPModelTest extends FunSuite { test("test_linear") { // Test linear: y=2*x val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t val y = 2.0 * DenseVector(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0) val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t val truePredZ = 2.0 * DenseVector(2.5, 4.5, 6.5, 8.5, 10.0, 12.0) val covFunc = Matern5Iso() val initCovParams = DenseVector(1.0, 1.0) val initNoiseStdDev = 0.01 val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) gpModel.fit(X, y) println("Fitted covariance function params:") println(gpModel.covParams) println("Fitted noiseStdDev:") println(gpModel.noiseStdDev) println("\n") val prediction = gpModel.predict(z) println("Mean and Var:") println(prediction) println("True value:") println(truePredZ) } test("test_cosine") { // Test no_linear: y=cos(x)+1 val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t val y = cos(DenseVector(1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)) + 1.0 val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t val truePredZ = cos(DenseVector(2.5, 4.5, 6.5, 8.5, 10.0, 12.0)) + 1.01 val covFunc = Matern5Iso() val initCovParams = DenseVector(1.0, 1.0) val initNoiseStdDev = 0.01 val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) gpModel.fit(X, y) println("Fitted covariance function params:") println(gpModel.covParams) println("Fitted noiseStdDev:") println(gpModel.noiseStdDev) println("\n") val prediction = gpModel.predict(z) println("Mean and Var:") println(prediction) println("True value:") println(truePredZ) } test("testSquare") { // Test no_linear: y=x^2 val X = DenseMatrix((1.0, 2.0, 3.0, 4.0, 5.0, 6.0, 7.0, 8.0, 9.0)).t val y = DenseVector(1.0, 4.0, 9.0, 16.0, 25.0, 36.0, 49.0, 64.0, 81.0) val z = DenseMatrix((2.5, 4.5, 6.5, 8.5, 10.0, 12.0)).t val truePredZ = pow(z, 2) val covFunc = Matern5Iso() val initCovParams = DenseVector(1.0, 1.0) val initNoiseStdDev = 0.01 val gpModel = GPModel(covFunc, initCovParams, initNoiseStdDev) gpModel.fit(X, y) println("Fitted covariance function params:") println(gpModel.covParams) println("Fitted noiseStdDev:") println(gpModel.noiseStdDev) println("\n") val prediction = gpModel.predict(z) println("Mean and Var:") println(prediction) println("True value:") println(truePredZ) } }
Example 120
Source File: SquareDistTest.scala From automl with Apache License 2.0 | 5 votes |
package com.tencent.angel.spark.automl import breeze.linalg.{DenseMatrix, DenseVector} import com.tencent.angel.spark.automl.tuner.math.SquareDist import org.junit.Assert._ import org.scalatest.FunSuite class SquareDistTest extends FunSuite { test("test_XX_1D") { val x = DenseVector(1.0, 2.0, 3.0).toDenseMatrix.t val expected = DenseMatrix((0.0, 1.0, 4.0), (1.0, 0.0, 1.0), (4.0, 1.0, 0.0)) assertEquals(expected, SquareDist(x, x)) } test("test_XX_2D") { val x = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)).t val expected = DenseMatrix((0.0, 2.0, 8.0), (2.0, 0.0, 2.0), (8.0, 2.0, 0.0)) assertEquals(expected, SquareDist(x, x)) } test("test_XY_1D") { val x1 = DenseVector(1.0, 2.0, 3.0).toDenseMatrix.t val x2 = DenseVector(4.0, 5.0).toDenseMatrix.t val expected = DenseMatrix((9.0, 16.0), (4.0, 9.0), (1.0, 4.0)) assertEquals(expected, SquareDist(x1, x2)) } test("test_XY_2D") { val x1 = DenseMatrix((1.0, 2.0, 3.0), (4.0, 5.0, 6.0)).t val x2 = DenseMatrix((7.0, 8.0), (9.0, 10.0)).t val expected = DenseMatrix((61.0, 85.0), (41.0, 61.0), (25.0, 41.0)) assertEquals(expected, SquareDist(x1, x2)) } }
Example 121
Source File: X2P.scala From spark-tsne with Apache License 2.0 | 5 votes |
package com.github.saurfang.spark.tsne import breeze.linalg.DenseVector import org.apache.spark.mllib.X2PHelper._ import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.linalg.distributed.{CoordinateMatrix, MatrixEntry, RowMatrix} import org.apache.spark.mllib.rdd.MLPairRDDFunctions._ import org.slf4j.LoggerFactory object X2P { private def logger = LoggerFactory.getLogger(X2P.getClass) def apply(x: RowMatrix, tol: Double = 1e-5, perplexity: Double = 30.0): CoordinateMatrix = { require(tol >= 0, "Tolerance must be non-negative") require(perplexity > 0, "Perplexity must be positive") val mu = (3 * perplexity).toInt //TODO: Expose this as parameter val logU = Math.log(perplexity) val norms = x.rows.map(Vectors.norm(_, 2.0)) norms.persist() val rowsWithNorm = x.rows.zip(norms).map{ case (v, norm) => VectorWithNorm(v, norm) } val neighbors = rowsWithNorm.zipWithIndex() .cartesian(rowsWithNorm.zipWithIndex()) .flatMap { case ((u, i), (v, j)) => if(i < j) { val dist = fastSquaredDistance(u, v) Seq((i, (j, dist)), (j, (i, dist))) } else Seq.empty } .topByKey(mu)(Ordering.by(e => -e._2)) val p_betas = neighbors.map { case (i, arr) => var betamin = Double.NegativeInfinity var betamax = Double.PositiveInfinity var beta = 1.0 val d = DenseVector(arr.map(_._2)) var (h, p) = Hbeta(d, beta) //logInfo("data was " + d.toArray.toList) //logInfo("array P was " + p.toList) // Evaluate whether the perplexity is within tolerance def Hdiff = h - logU var tries = 0 while (Math.abs(Hdiff) > tol && tries < 50) { //If not, increase or decrease precision if (Hdiff > 0) { betamin = beta beta = if (betamax.isInfinite) beta * 2 else (beta + betamax) / 2 } else { betamax = beta beta = if (betamin.isInfinite) beta / 2 else (beta + betamin) / 2 } // Recompute the values val HP = Hbeta(d, beta) h = HP._1 p = HP._2 tries = tries + 1 } //logInfo("array P is " + p.toList) (arr.map(_._1).zip(p.toArray).map { case (j, v) => MatrixEntry(i, j, v) }, beta) } logger.info("Mean value of sigma: " + p_betas.map(x => math.sqrt(1 / x._2)).mean) new CoordinateMatrix(p_betas.flatMap(_._1)) } }
Example 122
Source File: MeanAveragePrecisionEvaluator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation

import breeze.linalg.DenseVector
import org.apache.spark.rdd.RDD
import org.apache.spark.SparkContext._

// Only the average-precision helper of the evaluator is shown in this excerpt;
// the evaluate method that maps it over an RDD of predictions is elided.
class MeanAveragePrecisionEvaluator(numClasses: Int) {

  private def getAP(precisions: Array[Double], recalls: Array[Double]) = {
    var ap = 0.0
    val levels = (0 to 10).map(x => x / 10.0)
    levels.foreach { t =>
      // Find where recalls are greater than t and precision values at those indices
      val px = recalls.toSeq.zipWithIndex.filter(x => x._1 >= t).map(x => precisions(x._2))
      val p = if (px.isEmpty) {
        0.0
      } else {
        px.max
      }
      ap = ap + p / 11.0
    }
    ap
  }
}
Example 123
Source File: MLlibUtils.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.utils

import breeze.linalg.{SparseVector, DenseMatrix, DenseVector}

// Enclosing object (elided in the original excerpt); it is referenced as
// keystoneml.utils.MLlibUtils by the other keystone examples on this page.
object MLlibUtils {

  def breezeVectorToMLlib(breezeVector: breeze.linalg.Vector[Double]): org.apache.spark.mllib.linalg.Vector = {
    breezeVector match {
      case v: DenseVector[Double] =>
        if (v.offset == 0 && v.stride == 1 && v.length == v.data.length) {
          new org.apache.spark.mllib.linalg.DenseVector(v.data)
        } else {
          new org.apache.spark.mllib.linalg.DenseVector(v.toArray)  // Can't use underlying array directly, so make a new one
        }
      case v: SparseVector[Double] =>
        if (v.index.length == v.used) {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index, v.data)
        } else {
          new org.apache.spark.mllib.linalg.SparseVector(v.length, v.index.slice(0, v.used), v.data.slice(0, v.used))
        }
      case v: breeze.linalg.Vector[_] =>
        sys.error("Unsupported Breeze vector type: " + v.getClass.getName)
    }
  }
}
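A small usage sketch of the converter above (assuming the MLlibUtils object as reconstructed here is on the classpath), covering both the dense and the sparse branch:

import breeze.linalg.{DenseVector, SparseVector}
import keystoneml.utils.MLlibUtils.breezeVectorToMLlib

object ConversionSketch extends App {
  val dense = breezeVectorToMLlib(DenseVector(1.0, 2.0, 3.0))
  val sparse = breezeVectorToMLlib(SparseVector(5)(0 -> 1.0, 3 -> 4.0))

  println(dense)   // an MLlib dense vector backed by the same data
  println(sparse)  // an MLlib sparse vector with active indices 0 and 3
}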
Example 124
Source File: GaussianMixtureModelEstimator.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning.external

import breeze.linalg.{convert, DenseMatrix, DenseVector}
import keystoneml.nodes.learning.GaussianMixtureModel
import org.apache.spark.rdd.RDD
import keystoneml.utils.external.EncEval
import keystoneml.workflow.Estimator

// Enclosing class for this excerpt; only the local (Array-based) fit that calls into the
// native Enc library is shown, with k the number of mixture components.
class GaussianMixtureModelEstimator(k: Int) {

  def fit(samples: Array[DenseVector[Double]]): GaussianMixtureModel = {
    val extLib = new EncEval
    val nDim = samples(0).length

    // Flatten this thing out.
    val sampleFloats = samples.map(_.toArray.map(_.toFloat))
    val res = extLib.computeGMM(k, nDim, sampleFloats.flatten)

    val meanSize = k * nDim
    val varSize = k * nDim
    val coefSize = k * nDim

    // Each array region is expected to be centroid-major.
    val means = convert(new DenseMatrix(nDim, k, res.slice(0, meanSize)), Double)
    val vars = convert(new DenseMatrix(nDim, k, res.slice(meanSize, meanSize + varSize)), Double)
    val coefs = convert(new DenseVector(res.slice(meanSize + varSize, meanSize + varSize + coefSize)), Double)

    new GaussianMixtureModel(means, vars, coefs)
  }
}
Example 125
Source File: Windower.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.images import breeze.linalg.DenseVector import org.apache.spark.rdd.RDD import keystoneml.pipelines.FunctionNode import keystoneml.utils.{ImageMetadata, ChannelMajorArrayVectorizedImage, Image} class Windower( stride: Int, windowSize: Int) extends FunctionNode[RDD[Image], RDD[Image]] { def apply(in: RDD[Image]) = { in.flatMap(getImageWindow) } def getImageWindow(image: Image) = { val xDim = image.metadata.xDim val yDim = image.metadata.yDim val numChannels = image.metadata.numChannels // Start at (0,0) in (x, y) and (0 until xDim - windowSize + 1 by stride).flatMap { x => (0 until yDim - windowSize + 1 by stride).map { y => // Extract the window. val pool = new DenseVector[Double](windowSize * windowSize * numChannels) val startX = x val endX = x + windowSize val startY = y val endY = y + windowSize var c = 0 while (c < numChannels) { var s = startX while (s < endX) { var b = startY while (b < endY) { pool(c + (s-startX)*numChannels + (b-startY)*(endX-startX)*numChannels) = image.get(s, b, c) b = b + 1 } s = s + 1 } c = c + 1 } ChannelMajorArrayVectorizedImage(pool.toArray, ImageMetadata(windowSize, windowSize, numChannels)) } } } }
Example 126
Source File: Pooler.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.images import breeze.linalg.DenseVector import keystoneml.pipelines._ import keystoneml.utils.{ImageMetadata, ChannelMajorArrayVectorizedImage, Image} import keystoneml.workflow.Transformer class Pooler( stride: Int, poolSize: Int, pixelFunction: Double => Double, poolFunction: DenseVector[Double] => Double) extends Transformer[Image, Image] { val strideStart = poolSize / 2 def apply(image: Image) = { val xDim = image.metadata.xDim val yDim = image.metadata.yDim val numChannels = image.metadata.numChannels val numPoolsX = math.ceil((xDim - strideStart).toDouble / stride).toInt val numPoolsY = math.ceil((yDim - strideStart).toDouble / stride).toInt val patch = new Array[Double]( numPoolsX * numPoolsY * numChannels) // Start at strideStart in (x, y) and for (x <- strideStart until xDim by stride; y <- strideStart until yDim by stride) { // Extract the pool. Then apply the pixel and pool functions val pool = DenseVector.zeros[Double](poolSize * poolSize) val startX = x - poolSize/2 val endX = math.min(x + poolSize/2, xDim) val startY = y - poolSize/2 val endY = math.min(y + poolSize/2, yDim) var c = 0 while (c < numChannels) { var s = startX while (s < endX) { var b = startY while (b < endY) { pool((s-startX) + (b-startY)*(endX-startX)) = pixelFunction(image.get(s, b, c)) b = b + 1 } s = s + 1 } patch(c + (x - strideStart)/stride * numChannels + (y - strideStart)/stride * numPoolsX * numChannels) = poolFunction(pool) c = c + 1 } } ChannelMajorArrayVectorizedImage(patch, ImageMetadata(numPoolsX, numPoolsY, numChannels)) } }
Example 127
Source File: SignedHellingerMapper.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.stats import breeze.linalg.{DenseVector, DenseMatrix} import breeze.numerics._ import keystoneml.workflow.Transformer object SignedHellingerMapper extends Transformer[DenseVector[Double], DenseVector[Double]] { def apply(in: DenseVector[Double]): DenseVector[Double] = { signum(in) :* sqrt(abs(in)) } } object BatchSignedHellingerMapper extends Transformer[DenseMatrix[Float], DenseMatrix[Float]] { def apply(in: DenseMatrix[Float]): DenseMatrix[Float] = { in.map(x => (math.signum(x) * math.sqrt(math.abs(x))).toFloat) } }
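The transformer above is just an element-wise signed square root; the same one-liner can be tried directly on a breeze vector (toy values for illustration):

import breeze.linalg.DenseVector
import breeze.numerics.{abs, signum, sqrt}

object SignedHellingerSketch extends App {
  val v = DenseVector(4.0, -9.0, 0.25)
  val h = signum(v) :* sqrt(abs(v))   // element-wise signed square root
  println(h)                          // DenseVector(2.0, -3.0, 0.5)
}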
Example 128
Source File: StandardScaler.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.stats

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import org.apache.spark.mllib.stat.MultivariateOnlineSummarizer
import org.apache.spark.rdd.RDD
import keystoneml.utils.MLlibUtils
import keystoneml.workflow.{Transformer, Estimator}

// Enclosing class for this excerpt (the StandardScalerModel transformer it produces is elided);
// normalizeStdDev and eps are the two knobs the fit below relies on.
class StandardScaler(normalizeStdDev: Boolean = true, eps: Double = 1E-12)
    extends Estimator[DenseVector[Double], DenseVector[Double]] {

  override def fit(data: RDD[DenseVector[Double]]): StandardScalerModel = {
    val summary = data.treeAggregate(new MultivariateOnlineSummarizer)(
      (aggregator, data) => aggregator.add(MLlibUtils.breezeVectorToMLlib(data)),
      (aggregator1, aggregator2) => aggregator1.merge(aggregator2))
    if (normalizeStdDev) {
      new StandardScalerModel(
        MLlibUtils.mllibVectorToDenseBreeze(summary.mean),
        Some(sqrt(MLlibUtils.mllibVectorToDenseBreeze(summary.variance))
          .map(r => if (r.isNaN | r.isInfinite | math.abs(r) < eps) 1.0 else r)))
    } else {
      new StandardScalerModel(
        MLlibUtils.mllibVectorToDenseBreeze(summary.mean),
        None)
    }
  }
}
Example 129
Source File: ClassLabelIndicators.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.util import breeze.linalg.DenseVector import org.apache.spark.rdd.RDD import keystoneml.pipelines._ import keystoneml.workflow.Transformer case class ClassLabelIndicatorsFromIntArrayLabels(numClasses: Int, validate: Boolean = false) extends Transformer[Array[Int], DenseVector[Double]] { assert(numClasses > 1, "numClasses must be > 1.") def apply(in: Array[Int]): DenseVector[Double] = { if(validate && (in.max >= numClasses || in.min < 0)) { throw new RuntimeException("Class labels are expected to be in the range [0, numClasses)") } val indicatorVector = DenseVector.fill(numClasses, -1.0) var i = 0 while (i < in.length) { indicatorVector(in(i)) = 1.0 i += 1 } indicatorVector } }
Example 130
Source File: VectorSplitter.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.util import breeze.linalg.DenseVector import org.apache.spark.rdd.RDD import keystoneml.pipelines.FunctionNode class VectorSplitter( blockSize: Int, numFeaturesOpt: Option[Int] = None) extends FunctionNode[RDD[DenseVector[Double]], Seq[RDD[DenseVector[Double]]]] { override def apply(in: RDD[DenseVector[Double]]): Seq[RDD[DenseVector[Double]]] = { val numFeatures = numFeaturesOpt.getOrElse(in.first.length) val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt (0 until numBlocks).map { blockNum => in.map { vec => // Expliclity call toArray as breeze's slice is lazy val end = math.min(numFeatures, (blockNum + 1) * blockSize) DenseVector(vec.slice(blockNum * blockSize, end).toArray) } } } def splitVector(in: DenseVector[Double]): Seq[DenseVector[Double]] = { val numFeatures = numFeaturesOpt.getOrElse(in.length) val numBlocks = math.ceil(numFeatures.toDouble / blockSize).toInt (0 until numBlocks).map { blockNum => // Expliclity call toArray as breeze's slice is lazy val end = math.min(numFeatures, (blockNum + 1) * blockSize) DenseVector(in.slice(blockNum * blockSize, end).toArray) } } }
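The splitting logic boils down to slicing a vector into blockSize-sized chunks; a local sketch of the same arithmetic without Spark (the block size and vector below are arbitrary):

import breeze.linalg.DenseVector

object SplitSketch extends App {
  val v = DenseVector((1 to 10).map(_.toDouble).toArray)
  val blockSize = 4
  val numBlocks = math.ceil(v.length.toDouble / blockSize).toInt

  // Same slicing idea as VectorSplitter.splitVector, applied to one local vector.
  val blocks = (0 until numBlocks).map { b =>
    val end = math.min(v.length, (b + 1) * blockSize)
    DenseVector(v.slice(b * blockSize, end).toArray)
  }
  blocks.foreach(println)   // vectors of length 4, 4 and 2
}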
Example 131
Source File: TimitFeaturesDataLoader.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.loaders

import breeze.linalg.DenseVector
import org.apache.spark.SparkContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable

// Enclosing object for this excerpt; the parseSparseLabels and createLabelsRDD helpers
// (and the TimitFeaturesData / LabeledData containers) are defined elsewhere in keystone
// and are elided here.
object TimitFeaturesDataLoader {

  def apply(sc: SparkContext,
      trainDataLocation: String,
      trainLabelsLocation: String,
      testDataLocation: String,
      testLabelsLocation: String,
      numParts: Int = 512): TimitFeaturesData = {
    val trainData = CsvDataLoader(sc, trainDataLocation, numParts)
    val trainLabels = createLabelsRDD(parseSparseLabels(trainLabelsLocation), trainData)

    val testData = CsvDataLoader(sc, testDataLocation, numParts)
    val testLabels = createLabelsRDD(parseSparseLabels(testLabelsLocation), testData)

    TimitFeaturesData(LabeledData(trainLabels.zip(trainData)), LabeledData(testLabels.zip(testData)))
  }
}
Example 132
Source File: LinearPixels.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.pipelines.images.cifar import breeze.linalg.DenseVector import keystoneml.evaluation.MulticlassClassifierEvaluator import keystoneml.loaders.CifarLoader import keystoneml.nodes.images.{GrayScaler, ImageExtractor, ImageVectorizer, LabelExtractor} import keystoneml.nodes.learning.LinearMapEstimator import keystoneml.nodes.util.{Cacher, ClassLabelIndicatorsFromIntLabels, MaxClassifier} import org.apache.spark.{SparkConf, SparkContext} import keystoneml.pipelines.Logging import scopt.OptionParser import keystoneml.utils.Image import keystoneml.workflow.Pipeline object LinearPixels extends Logging { val appName = "LinearPixels" case class LinearPixelsConfig(trainLocation: String = "", testLocation: String = "") def run(sc: SparkContext, config: LinearPixelsConfig): Pipeline[Image, Int] = { val numClasses = 10 // Load and cache the training data. val trainData = CifarLoader(sc, config.trainLocation).cache() val trainImages = ImageExtractor(trainData) val labelExtractor = LabelExtractor andThen ClassLabelIndicatorsFromIntLabels(numClasses) andThen new Cacher[DenseVector[Double]] val trainLabels = labelExtractor(trainData) // A featurizer maps input images into vectors. For this pipeline, we'll also convert the image to grayscale. // We then estimate our model by calling a linear solver on our data. val predictionPipeline = GrayScaler andThen ImageVectorizer andThen (new LinearMapEstimator, trainImages, trainLabels) andThen MaxClassifier // Calculate training error. val evaluator = new MulticlassClassifierEvaluator(numClasses) val trainEval = evaluator.evaluate(predictionPipeline(trainImages), LabelExtractor(trainData)) // Do testing. val testData = CifarLoader(sc, config.testLocation) val testImages = ImageExtractor(testData) val testLabels = labelExtractor(testData) val testEval = evaluator.evaluate(predictionPipeline(testImages), LabelExtractor(testData)) logInfo(s"Training accuracy: \n${trainEval.totalAccuracy}") logInfo(s"Test accuracy: \n${testEval.totalAccuracy}") predictionPipeline } def parse(args: Array[String]): LinearPixelsConfig = new OptionParser[LinearPixelsConfig](appName) { head(appName, "0.1") help("help") text("prints this usage text") opt[String]("trainLocation") required() action { (x,c) => c.copy(trainLocation=x) } opt[String]("testLocation") required() action { (x,c) => c.copy(testLocation=x) } }.parse(args, LinearPixelsConfig()).get def main(args: Array[String]) = { val appConfig = parse(args) val conf = new SparkConf().setAppName(appName) conf.setIfMissing("spark.master", "local[2]") // This is a fallback if things aren't set via spark submit. val sc = new SparkContext(conf) run(sc, appConfig) sc.stop() } }
Example 133
Source File: MeanAveragePrecisionSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.evaluation import breeze.linalg.DenseVector import org.scalatest.FunSuite import org.apache.spark.SparkContext import keystoneml.utils.Stats import keystoneml.workflow.PipelineContext class MeanAveragePrecisionSuite extends FunSuite with PipelineContext { test("random map test") { sc = new SparkContext("local", "test") // Build some random test data with 4 classes 0,1,2,3 val actual = List(Array(0, 3), Array(2), Array(1, 2), Array(0)) val actualRdd = sc.parallelize(actual) val predicted = List( DenseVector(0.1, -0.05, 0.12, 0.5), DenseVector(-0.23, -0.45, 0.23, 0.1), DenseVector(-0.34, -0.32, -0.66, 1.52), DenseVector(-0.1, -0.2, 0.5, 0.8)) val predictedRdd = sc.parallelize(predicted) val map = new MeanAveragePrecisionEvaluator(4).evaluate(predictedRdd, actualRdd) // Expected values from running this in MATLAB val expected = DenseVector(1.0, 0.3333, 0.5, 0.3333) assert(Stats.aboutEq(map, expected, 1e-4)) } }
Example 134
Source File: BlockLinearMapperSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import breeze.linalg.{DenseVector, DenseMatrix} import breeze.stats.distributions.Rand import keystoneml.workflow.PipelineContext import scala.collection.mutable.ArrayBuffer import org.scalatest.FunSuite import org.apache.spark.SparkContext import org.apache.spark.rdd.RDD import keystoneml.pipelines._ import keystoneml.utils.Stats class BlockLinearMapperSuite extends FunSuite with PipelineContext with Logging { test("BlockLinearMapper transformation") { sc = new SparkContext("local", "test") val inDims = 1000 val outDims = 100 val numChunks = 5 val numPerChunk = inDims/numChunks val mat = DenseMatrix.rand(inDims, outDims, Rand.gaussian) val vec = DenseVector.rand(inDims, Rand.gaussian) val intercept = DenseVector.rand(outDims, Rand.gaussian) val splitVec = (0 until numChunks).map(i => vec((numPerChunk*i) until (numPerChunk*i + numPerChunk))) val splitMat = (0 until numChunks).map(i => mat((numPerChunk*i) until (numPerChunk*i + numPerChunk), ::)) val linearMapper = new LinearMapper[DenseVector[Double]](mat, Some(intercept)) val blockLinearMapper = new BlockLinearMapper(splitMat, numPerChunk, Some(intercept)) val linearOut = linearMapper(vec) // Test with intercept assert(Stats.aboutEq(blockLinearMapper(vec), linearOut, 1e-4)) // Test the apply and evaluate call val blmOuts = new ArrayBuffer[RDD[DenseVector[Double]]] val splitVecRDDs = splitVec.map { vec => sc.parallelize(Seq(vec), 1) } blockLinearMapper.applyAndEvaluate(splitVecRDDs, (predictedValues: RDD[DenseVector[Double]]) => { blmOuts += predictedValues () } ) // The last blmOut should match the linear mapper's output assert(Stats.aboutEq(blmOuts.last.collect()(0), linearOut, 1e-4)) } }
Example 135
Source File: PoolingSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.images import breeze.linalg.{DenseVector, sum} import keystoneml.nodes._ import org.scalatest.FunSuite import keystoneml.pipelines.Logging import keystoneml.utils.{ChannelMajorArrayVectorizedImage, ImageMetadata} class PoolingSuite extends FunSuite with Logging { test("pooling") { val imgArr = (0 until 4).flatMap { x => (0 until 4).flatMap { y => (0 until 1).map { c => (c + x * 1 + y * 4 * 1).toDouble } } }.toArray val image = new ChannelMajorArrayVectorizedImage(imgArr, ImageMetadata(4, 4, 1)) val pooling = new Pooler(2, 2, x => x, x => x.max) val poolImage = pooling(image) assert(poolImage.get(0, 0, 0) === 5.0) assert(poolImage.get(0, 1, 0) === 7.0) assert(poolImage.get(1, 0, 0) === 13.0) assert(poolImage.get(1, 1, 0) === 15.0) } test("pooling odd") { val hogImgSize = 14 val convSizes = List(1, 2, 3, 4, 6, 8) convSizes.foreach { convSize => val convResSize = hogImgSize - convSize + 1 val imgArr = (0 until convResSize).flatMap { x => (0 until convResSize).flatMap { y => (0 until 1000).map { c => (c + x * 1 + y * 4 * 1).toDouble } } }.toArray val image = new ChannelMajorArrayVectorizedImage( imgArr, ImageMetadata(convResSize, convResSize, 1000)) val poolSizeReqd = math.ceil(convResSize / 2.0).toInt // We want poolSize to be even !! val poolSize = (math.ceil(poolSizeReqd / 2.0) * 2).toInt // overlap as little as possible val poolStride = convResSize - poolSize println(s"VALUES: $convSize $convResSize $poolSizeReqd $poolSize $poolStride") def summ(x: DenseVector[Double]): Double = sum(x) val pooling = new Pooler(poolStride, poolSize, identity, summ) val poolImage = pooling(image) } } }
Example 136
Source File: ClassLabelIndicatorsSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.util import breeze.linalg.DenseVector import org.scalatest.FunSuite class ClassLabelIndicatorsSuite extends FunSuite { test("single label indicators") { intercept[AssertionError] { val zerolabels = ClassLabelIndicatorsFromIntLabels(0) } intercept[AssertionError] { val onelabel = ClassLabelIndicatorsFromIntLabels(1) } val fivelabel = ClassLabelIndicatorsFromIntLabels(5) assert(fivelabel(2) === DenseVector(-1.0,-1.0,1.0,-1.0,-1.0)) intercept[RuntimeException] { fivelabel(5) } } test("multiple label indicators without validation") { intercept[AssertionError] { val zerolabels = ClassLabelIndicatorsFromIntArrayLabels(0) } intercept[AssertionError] { val onelabel = ClassLabelIndicatorsFromIntArrayLabels(1) } val fivelabel = ClassLabelIndicatorsFromIntArrayLabels(5) assert(fivelabel(Array(2,1)) === DenseVector(-1.0,1.0,1.0,-1.0,-1.0)) intercept[IndexOutOfBoundsException] { fivelabel(Array(4,6)) } assert(fivelabel(Array(-1,2)) === DenseVector(-1.0,-1.0,1.0,-1.0,1.0), "In the unchecked case, we should get weird behavior.") } test("multiple label indicators with validation") { intercept[AssertionError] { val zerolabels = ClassLabelIndicatorsFromIntArrayLabels(0, true) } intercept[AssertionError] { val onelabel = ClassLabelIndicatorsFromIntArrayLabels(1, true) } val fivelabel = ClassLabelIndicatorsFromIntArrayLabels(5, true) assert(fivelabel(Array(2,1)) === DenseVector(-1.0,1.0,1.0,-1.0,-1.0)) intercept[RuntimeException] { fivelabel(Array(4,6)) } intercept[RuntimeException] { fivelabel(Array(-1,2)) } } }
Example 137
Source File: TopKClassifierSuite.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.util import breeze.linalg.DenseVector import org.apache.spark.SparkContext import org.scalatest.FunSuite import keystoneml.workflow.PipelineContext class TopKClassifierSuite extends FunSuite with PipelineContext { test("top k classifier, k <= vector size") { sc = new SparkContext("local", "test") assert(TopKClassifier(2).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3)) assert(TopKClassifier(4).apply(DenseVector(Double.MinValue, Double.MaxValue, 12.0, 11.0, 10.0)) === Array(1, 2, 3, 4)) assert(TopKClassifier(3).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) } test("top k classifier, k > vector size") { sc = new SparkContext("local", "test") assert(TopKClassifier(5).apply(DenseVector(-10.0, 42.4, -43.0, 23.0)) === Array(1, 3, 0, 2)) assert(TopKClassifier(2).apply(DenseVector(Double.MinValue)) === Array(0)) assert(TopKClassifier(20).apply(DenseVector(3.0, -23.2, 2.99)) === Array(0, 2, 1)) } }
Example 138
Source File: Dense.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.ml.keras import breeze.linalg.{DenseMatrix, DenseVector} import ucar.nc2.{Variable, Group} class Dense(inputDim: Int, outputDim: Int) extends Functor{ override def functorName = "Dense" override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { val z = data * w for (i <- 0 until data.rows){ z(i, ::) :+= b.t } z } private val w = DenseMatrix.zeros[Float](inputDim, outputDim) private val b = DenseVector.zeros[Float](outputDim) def h5load(weight: Variable, bias: Variable): Unit = { val weightData = weight.read val weightIndex = weightData.getIndex val biasData = bias.read val biasIndex = biasData.getIndex for(y <- 0 until inputDim) for(x <- 0 until outputDim){ w(y, x) = weightData.getFloat(weightIndex.set(y, x)) if(y == 0) b(x) = biasData.getFloat(biasIndex.set(x)) } } override def toString: String = "Dense: {inputDim: " + inputDim + ", outputDim: " + outputDim + "}" def head: String = w(0 until 2, ::).toString } object Dense{ def apply(inputDim:Int, outputDim:Int) = new Dense(inputDim, outputDim) def apply(configs: Map[String, Any], weightGroups: Group): Dense = { val layerName = configs("name").toString val params = weightGroups.findGroup(layerName) val weightNames = params.findAttribute("weight_names") val weight = params.findVariable(weightNames.getStringValue(0)) val bias = params.findVariable(weightNames.getStringValue(1)) val dims = weight.getDimensions if(dims.size != 2){ throw new IllegalArgumentException("invalid dimension for Dense class") } val d = new Dense(dims.get(0).getLength, dims.get(1).getLength) d.h5load(weight, bias) d } }
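A minimal sketch of applying the layer on its own; a freshly constructed Dense (before h5load fills w and b) maps any input to zeros, which makes it easy to check shapes:

import breeze.linalg.DenseMatrix

val dense = Dense(inputDim = 3, outputDim = 2)
// Five 3-dimensional inputs, one per row; the result is a 5 x 2 matrix (all zeros until weights are loaded).
val out = dense.convert(DenseMatrix.ones[Float](5, 3))
println(dense)   // Dense: {inputDim: 3, outputDim: 2}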
Example 139
Source File: Embedding.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.ml.keras import breeze.linalg.{DenseMatrix, DenseVector} import ucar.nc2.{Variable, Group} class Embedding(vocabulary: Int, outDim: Int) extends Functor{ override def functorName = "Embedding" override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { val arrayOfId = data.reshape(data.size, 1) val length = arrayOfId.size val z = DenseMatrix.zeros[Float](length, outDim) for(i <- 0 until length){ z(i, ::) := w(arrayOfId(i, 0).asInstanceOf[Int]).t } z } private val w = new Array[DenseVector[Float]](vocabulary).map(_ => DenseVector.zeros[Float](outDim)) def h5load(weight: Variable):Unit = { val weightData = weight.read val weightIndex = weightData.getIndex for(y <- 0 until vocabulary) for(x <- 0 until outDim) w(y)(x) = weightData.getFloat(weightIndex.set(y, x)) } } object Embedding{ def apply(vocabulary: Int, outDim: Int) = new Embedding(vocabulary, outDim) def apply(configs: Map[String, Any], weightGroups: Group): Embedding = { val layerName = configs("name").toString val params = weightGroups.findGroup(layerName) val weightNames = params.findAttribute("weight_names") val weight = params.findVariable(weightNames.getStringValue(0)) val dims = weight.getDimensions if(dims.size != 2){ throw new IllegalArgumentException("Invalid dimension for Embedding class") } val e = new Embedding(dims.get(0).getLength, dims.get(1).getLength) e.h5load(weight) e } }
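A minimal usage sketch: convert treats the input matrix as word ids and returns one embedding row per id (all zeros until h5load populates the lookup table):

import breeze.linalg.DenseMatrix

val embedding = Embedding(vocabulary = 10, outDim = 4)
// Three word ids passed as a 1 x 3 Float matrix; the result is a 3 x 4 matrix, one row per id.
val ids = DenseMatrix((0.0f, 3.0f, 7.0f))
val vectors = embedding.convert(ids)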
Example 140
Source File: Convolution1D.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.ml.keras import breeze.linalg.{DenseMatrix, DenseVector} import ucar.nc2.{Variable, Group} // Convolution operator for filtering neighborhoods of one-dimensional inputs. class Convolution1D(outCh: Int, width: Int, inputDim: Int, padding: Boolean) extends Functor{ override def functorName = "Convolution1D" override final def convert(data: DenseMatrix[Float]): DenseMatrix[Float] = { val work = im2col(data) * w for (i <- 0 until work.rows) work(i, ::) :+= b.t work } private val w = DenseMatrix.zeros[Float](width * inputDim, outCh) private val b = DenseVector.zeros[Float](outCh) private val paddingRow: Int = if (padding) { (width - 1) / 2 } else { 0 } private def im2col(x: DenseMatrix[Float]): DenseMatrix[Float] = { val inputSize = width * inputDim val work = DenseMatrix.zeros[Float](x.rows, inputSize) val x1 = x.rows for(k1 <- 0 until x1) for(d2 <- 0 until width) for(d1 <- 0 until inputDim) { val i1 = k1 - paddingRow + d2 val j1 = d1 + d2 * inputDim if (i1 >= 0 & i1 < x1) work(k1, j1) = x(i1, d1) else work(k1, j1) = 0.0.toFloat } work } private def h5load(weight: Variable, bias: Variable): Unit = { val weightData = weight.read val weightIndex = weightData.getIndex val biasData = bias.read val biasIndex = biasData.getIndex for(i <- 0 until width) for(j <- 0 until inputDim) for(x <- 0 until outCh){ val y = i * inputDim + j w(y, x) = weightData.getFloat(weightIndex.set(i, 0, j, x)) if(y == 0) b(x) = biasData.getFloat(biasIndex.set(x)) } } override def toString: String = "Convolution1D: {outCh: " + outCh + ", width: " + width + ", inputDim: " + inputDim + ", padding" + padding + "}" } object Convolution1D{ def apply(outCh: Int, width: Int, inputDim: Int, padding: Boolean) = new Convolution1D(outCh, width, inputDim, padding) def apply(configs: Map[String, Any], weightGroups: Group): Convolution1D = { val layerName = configs("name").toString val params = weightGroups.findGroup(layerName) val weightNames = params.findAttribute("weight_names") val borderMode = configs("border_mode").toString match { case "same" => true case _ => false } val weight = params.findVariable(weightNames.getStringValue(0)) val bias = params.findVariable(weightNames.getStringValue(1)) val dims = weight.getDimensions if(dims.size != 4){ throw new IllegalArgumentException("invalid dimension for Convolution1D class") } val c = new Convolution1D(dims.get(3).getLength, dims.get(0).getLength, dims.get(2).getLength, borderMode) c.h5load(weight,bias) c } }
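A minimal sketch of the layer applied to a toy sequence; with padding enabled the output keeps the number of input rows, one row per time step and one column per output channel:

import breeze.linalg.DenseMatrix

// A sequence of 10 steps with 4 input channels, convolved with a width-3 filter bank of 8 output channels.
val conv = Convolution1D(outCh = 8, width = 3, inputDim = 4, padding = true)
val sequence = DenseMatrix.ones[Float](10, 4)
val features = conv.convert(sequence)   // 10 x 8, all zeros until the weights are loaded from HDF5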
Example 141
Source File: Scaling.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.util import breeze.linalg.DenseVector import breeze.numerics.sqrt import org.apache.spark.ml.feature.LabeledPoint import org.apache.spark.ml.linalg.Vectors import org.apache.spark.rdd.RDD private[ml] trait Scaling { def scale(data: RDD[LabeledPoint]) = { val x = data.map(x => DenseVector(x.features.toArray)).cache() val y = data.map(_.label) val n = x.count().toDouble val mean = x.reduce(_ + _) / n val centered = x.map(_ - mean).cache() val variance = centered.map(xx => xx *:* xx).reduce(_ + _) / n x.unpersist() val varianceNoZeroes = variance.map(v => if (v > 0d) v else 1d) val scaled = centered.map(_ /:/ sqrt(varianceNoZeroes)).map(_.toArray).map(Vectors.dense).zip(y).map { case(f, y) => LabeledPoint(y, f) }.cache() if (scaled.count() > 0) // ensure scaled is materialized centered.unpersist() scaled } }
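Because Scaling is a private[ml] trait, anything that mixes it in has to live under the org.apache.spark.ml package; a minimal local sketch under that assumption (the demo object is hypothetical):

package org.apache.spark.ml.commons.util

import org.apache.spark.SparkContext
import org.apache.spark.ml.feature.LabeledPoint
import org.apache.spark.ml.linalg.Vectors

// Standardizes the features of a toy RDD to zero mean and unit variance.
object ScalingDemo extends Scaling {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext("local[2]", "scaling-demo")
    val data = sc.parallelize(Seq(
      LabeledPoint(1.0, Vectors.dense(1.0, 10.0)),
      LabeledPoint(0.0, Vectors.dense(3.0, 30.0)),
      LabeledPoint(1.0, Vectors.dense(5.0, 50.0))))
    scale(data).collect().foreach(println)
    sc.stop()
  }
}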
Example 142
Source File: RBFKernelTest.scala From spark-gp with Apache License 2.0 | 5 votes |
package org.apache.spark.ml.commons.kernel import breeze.linalg.{DenseMatrix, DenseVector, all} import breeze.numerics.abs import org.apache.spark.ml.linalg.Vectors import org.scalatest.FunSuite class RBFKernelTest extends FunSuite { test("Calling `trainingKernel` before `setTrainingVectors` " + "yields `TrainingVectorsNotInitializedException") { val rbf = new RBFKernel() assertThrows[TrainingVectorsNotInitializedException] { rbf.trainingKernel() } } test("Calling `derivative` before `setTrainingVectors` " + "yields `TrainingVectorsNotInitializedException") { val rbf = new RBFKernel() assertThrows[TrainingVectorsNotInitializedException] { rbf.trainingKernelAndDerivative() } } private val dataset = Array(Array(1d, 2d), Array(2d, 3d), Array(5d, 7d)).map(Vectors.dense) test("being called after `setTrainingVector`," + " `trainingKernel` should return the correct kernel matrix") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset) val correctKernelMatrix = DenseMatrix((1.000000e+00, 6.737947e-03, 3.053624e-45), (6.737947e-03, 1.000000e+00, 7.187782e-28), (3.053624e-45, 7.187782e-28, 1.000000e+00)) assert(all(abs(rbf.trainingKernel() - correctKernelMatrix) <:< 1e-4)) } private def computationalDerivative(sigma: Double, h: Double) = { val rbfLeft = new RBFKernel(sigma - h) val rbfRight = new RBFKernel(sigma + h) rbfLeft.setTrainingVectors(dataset) rbfRight.setTrainingVectors(dataset) (rbfRight.trainingKernel() - rbfLeft.trainingKernel()) / (2 * h) } test("being called after `setTrainingVector`," + " `derivative` should return the correct kernel matrix derivative") { val rbf = new RBFKernel(0.2) rbf.setTrainingVectors(dataset) val analytical = rbf.trainingKernelAndDerivative()._2(0) val computational = computationalDerivative(0.2, 1e-3) assert(all(abs(analytical - computational) <:< 1e-3)) } test("crossKernel returns correct kernel") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset.drop(1)) val crossKernel = rbf.crossKernel(dataset.take(1)) val correctCrossKernel = DenseMatrix((6.737947e-03, 3.053624e-45)) assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4)) } test("crossKernel returns correct kernel if called on a single vector") { val rbf = new RBFKernel(math.sqrt(0.2)) rbf.setTrainingVectors(dataset.drop(1)) val crossKernel = rbf.crossKernel(dataset(0)) val correctCrossKernel = DenseVector(6.737947e-03, 3.053624e-45).t assert(all(abs(crossKernel - correctCrossKernel) <:< 1e-4)) } }
Example 143
Source File: Burden.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.assoc import breeze.linalg.DenseVector import breeze.stats.distributions.{Gaussian, StudentsT} import org.dizhang.seqspark.stat.HypoTest.{NullModel => NM} import org.dizhang.seqspark.stat.{Resampling, ScoreTest, WaldTest} import org.dizhang.seqspark.util.General._ import scala.language.existentials @SerialVersionUID(7727280001L) trait Burden extends AssocMethod { def nullModel: NM def x: Encode.Fixed def result: AssocMethod.Result } object Burden { def apply(nullModel: NM, x: Encode.Coding): Burden with AssocMethod.AnalyticTest = { nullModel match { case nm: NM.Fitted => AnalyticScoreTest(nm, x.asInstanceOf[Encode.Fixed]) case _ => AnalyticWaldTest(nullModel, x.asInstanceOf[Encode.Fixed]) } } def apply(ref: Double, min: Int, max: Int, nullModel: NM.Fitted, x: Encode.Coding): ResamplingTest = { ResamplingTest(ref, min, max, nullModel, x.asInstanceOf[Encode.Fixed]) } def getStatistic(nm: NM.Fitted, x: Encode.Coding): Double = { val st = ScoreTest(nm, x.asInstanceOf[Encode.Fixed].coding) st.score(0)/st.variance(0,0).sqrt } def getStatistic(nm: NM, x: DenseVector[Double]): Double = { val wt = WaldTest(nm, x) (wt.beta /:/ wt.std).apply(1) } @SerialVersionUID(7727280101L) final case class AnalyticScoreTest(nullModel: NM.Fitted, x: Encode.Fixed) extends Burden with AssocMethod.AnalyticTest { def geno = x.coding //val scoreTest = ScoreTest(nullModel, geno) val statistic = getStatistic(nullModel, x) val pValue = { val dis = new Gaussian(0.0, 1.0) Some(1.0 - dis.cdf(statistic)) } def result: AssocMethod.BurdenAnalytic = { AssocMethod.BurdenAnalytic(x.vars, statistic, pValue, "test=score") } } case class AnalyticWaldTest(nullModel: NM, x: Encode.Fixed) extends Burden with AssocMethod.AnalyticTest { def geno = x.coding private val wt = WaldTest(nullModel, x.coding) val statistic = getStatistic(nullModel, geno) val pValue = { val dis = new StudentsT(nullModel.dof - 1) Some(1.0 - dis.cdf(statistic)) } def result = { AssocMethod.BurdenAnalytic(x.vars, statistic, pValue, s"test=wald;beta=${wt.beta(1)};betaStd=${wt.std(1)}") } } @SerialVersionUID(7727280201L) final case class ResamplingTest(refStatistic: Double, min: Int, max: Int, nullModel: NM.Fitted, x: Encode.Fixed) extends Burden with AssocMethod.ResamplingTest { def pCount = Resampling.Simple(refStatistic, min, max, nullModel, x, getStatistic).pCount def result: AssocMethod.BurdenResampling = { AssocMethod.BurdenResampling(x.vars, refStatistic, pCount) } } }
Example 144
Source File: HypoTest.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.stat import breeze.linalg.{DenseMatrix, DenseVector, inv, *} import breeze.stats.{mean, variance} ) extends NullModel { def dof = y.length - xs.cols + 1 def residuals = y - estimates val invInfo = inv(xs.t * (xs(::, *) *:* b) * a) } def apply(y: DenseVector[Double], x: Option[DenseMatrix[Double]], fit: Boolean, binary: Boolean): NullModel = { x match { case Some(dm) => apply(y, dm, fit, binary) case None => apply(y, fit, binary) } } def apply(y: DenseVector[Double], fit: Boolean, binary: Boolean): NullModel = { if (fit) { Fit(y, binary) } else { Simple(y, binary) } } def apply(reg: Regression): NullModel = { val y = reg.responses reg match { case lr: LogisticRegression => Fitted(y, reg.estimates, reg.xs, 1.0, lr.residualsVariance, binary = true) case lr: LinearRegression => Fitted(y, reg.estimates, reg.xs, lr.residualsVariance, DenseVector.ones[Double](y.length), binary = false) } } def apply(y: DenseVector[Double], x: DenseMatrix[Double], fit: Boolean, binary: Boolean): NullModel = { if (! fit) { Mutiple(y, x, binary) } else if (binary) { val reg = LogisticRegression(y, x) Fitted(y, reg.estimates, reg.xs, 1.0, reg.residualsVariance, binary) } else { val reg = LinearRegression(y, x) Fitted(y, reg.estimates, reg.xs, reg.residualsVariance, DenseVector.ones[Double](y.length), binary) } } def Fit(y: DenseVector[Double], x: DenseMatrix[Double], binary: Boolean): Fitted = { apply(y, x, fit = true, binary).asInstanceOf[Fitted] } def Fit(y: DenseVector[Double], binary: Boolean): Fitted = { val my = DenseVector.fill(y.length)(mean(y)) val residuals = y - my val xs = DenseMatrix.ones[Double](y.length, 1) val invInfo = DenseMatrix.fill(1,1)(1.0/y.length) val a = if (binary) 1.0 else variance(residuals) val b = if (binary) my.map(e => e * (1 - e)) else DenseVector.ones[Double](y.length) Fitted(y, my, xs, a, b, binary) } } }
Example 145
Source File: LinearCombinationChiSquare.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.stat import breeze.linalg.{DenseVector, sum} import org.dizhang.seqspark.stat.LinearCombinationChiSquare._ @SerialVersionUID(7778520001L) trait LinearCombinationChiSquare extends Serializable { def lambda: DenseVector[Double] def nonCentrality: DenseVector[Double] def degreeOfFreedom: DenseVector[Double] def cdf(cutoff: Double): CDF val meanLambda: Double = sum(lambda) val size: Int = lambda.length } object LinearCombinationChiSquare { @SerialVersionUID(7778550101L) trait CDF extends Serializable { def pvalue: Double def ifault: Int def trace: Array[Double] } }
Example 146
Source File: Resampling.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.stat import breeze.linalg.{DenseVector, shuffle} import breeze.stats.distributions.Bernoulli import org.dizhang.seqspark.assoc.Encode import org.dizhang.seqspark.ds.SemiGroup.PairInt import org.dizhang.seqspark.stat.HypoTest.NullModel import scala.language.existentials def makeNewNullModel: NullModel.Fitted = { val newY = makeNewY() val cols = nullModel.xs.cols NullModel( newY, nullModel.xs(::, 1 until cols), fit = true, binary = nullModel.binary ).asInstanceOf[NullModel.Fitted] } }
Example 147
Source File: ScoreTest.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.stat import breeze.linalg.{*, CSCMatrix, DenseMatrix, DenseVector, SparseVector} import org.dizhang.seqspark.stat.HypoTest.NullModel.{Fitted => SNM} import org.dizhang.seqspark.util.General._ object ScoreTest { def apply(nm: SNM, x: CSCMatrix[Double]): ScoreTest = { Sparse(nm, x) } def apply(nm: SNM, x: DenseMatrix[Double]): ScoreTest = { Dense(nm, x) } def apply(nm: SNM, x: DenseVector[Double]): ScoreTest = { Dense(nm, DenseVector.horzcat(x)) } def apply(nm: SNM, x: SparseVector[Double]): ScoreTest = { Sparse(nm, SparseVector.horzcat(x)) } def apply(nm: SNM, x1: DenseMatrix[Double], x2: CSCMatrix[Double]): ScoreTest = { Mixed(nm, x1, x2) } case class Sparse(nm: SNM, x: CSCMatrix[Double]) extends ScoreTest { val score = (nm.residuals.toDenseMatrix * x).toDenseVector / nm.a lazy val variance = { val c = nm.xs val IccInv = nm.invInfo * nm.a val Igg = (colMultiply(x, nm.b).t * x).toDense val Icg = (c(::, *) *:* nm.b).t * x val Igc = Icg.t (Igg - Igc * IccInv * Icg) / nm.a } } case class Dense(nm: SNM, x: DenseMatrix[Double]) extends ScoreTest { val score = x.t * nm.residuals / nm.a lazy val variance = { val c = nm.xs val IccInv = nm.invInfo * nm.a val Igg = (x(::, *) *:* nm.b).t * x val Icg = (c(::, *) *:* nm.b).t * x val Igc = Icg.t (Igg - Igc * IccInv * Icg)/nm.a } } case class Mixed(nm: SNM, x1: DenseMatrix[Double], x2: CSCMatrix[Double]) extends ScoreTest { private val dense = Dense(nm, x1) private val sparse = Sparse(nm, x2) val score = DenseVector.vertcat(dense.score, sparse.score) lazy val variance = { val v1 = dense.variance val v4 = sparse.variance val v2 = { val c = nm.xs val IccInv = nm.invInfo * nm.a val Igg = (x1(::, *) *:* nm.b).t * x2 val Icg = (c(::, *) *:* nm.b).t * x2 val Igc = x1.t * (c(::, *) *:* nm.b).t (Igg - Igc * IccInv * Icg) / nm.a } val v3 = v2.t val v12 = DenseMatrix.horzcat(v1, v2) val v34 = DenseMatrix.horzcat(v3, v4) DenseMatrix.vertcat(v12, v34) } } case class Mock(score: DenseVector[Double], variance: DenseMatrix[Double]) extends ScoreTest } @SerialVersionUID(7778780001L) sealed trait ScoreTest extends HypoTest { def score: DenseVector[Double] def variance: DenseMatrix[Double] }
Example 148
Source File: Kinship.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.stat import org.apache.spark.rdd.RDD import org.dizhang.seqspark.ds._ import breeze.linalg.{DenseVector, SparseVector, Vector} import org.apache.spark.SparkContext import scala.collection.mutable.ArrayBuffer def removeNums(size: Int, nums: IndexedSeq[Int]): IndexedSeq[Int] = { var j: Int = 0 var i: Int = 0 val res = ArrayBuffer[Int]() while (i < size) { if (j >= nums.length) { res.+=(i) } else if (i == nums(j)) { j += 1 } else { res.+=(i) } i += 1 } res.toIndexedSeq } }
Example 149
Source File: WaldTest.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.stat import breeze.linalg.{DenseMatrix, DenseVector, diag, inv} import breeze.numerics.sqrt import breeze.stats.distributions.StudentsT import org.dizhang.seqspark.stat.HypoTest.NullModel import org.dizhang.seqspark.stat.HypoTest.NullModel._ trait WaldTest { def nm: NullModel def x: DenseVector[Double] def reg: Regression = { nm match { case Simple(y, b) => if (b) LogisticRegression(y, x.toDenseMatrix.t) else LinearRegression(y, x.toDenseMatrix.t) case Mutiple(y, c, b) => if (b) LogisticRegression(y, DenseMatrix.horzcat(x.toDenseMatrix.t, c)) else LinearRegression(y, DenseMatrix.horzcat(x.toDenseMatrix.t, c)) case Fitted(y, _, xs, _, _, b) => if (b) LogisticRegression(y, DenseMatrix.horzcat(x.toDenseMatrix.t, xs(::, 1 until xs.cols))) else LinearRegression(y, DenseMatrix.horzcat(x.toDenseMatrix.t, xs(::, 1 until xs.cols))) } } def beta: DenseVector[Double] = reg.coefficients def std: DenseVector[Double] = { sqrt(diag(inv(reg.information))) } def dof: Int = nm.dof - 1 def t: DenseVector[Double] = beta /:/ std def pValue(oneSided: Boolean = true): DenseVector[Double] = { val dis = new StudentsT(dof) if (oneSided) { t.map(c => 1.0 - dis.cdf(c)) } else { t.map(c => (1.0 - dis.cdf(math.abs(c))) * 2.0) } } } object WaldTest { def apply(nm: NullModel, x: DenseVector[Double]): WaldTest = { Default(nm, x) } case class Default(nm: NullModel, x: DenseVector[Double]) extends WaldTest }
Example 150
Source File: IntegrateSpec.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.numerics import breeze.linalg.DenseVector import breeze.numerics._ import breeze.stats.distributions._ import org.scalatest.{FlatSpec, Matchers} class IntegrateSpec extends FlatSpec with Matchers { def f1(input: DenseVector[Double]): DenseVector[Double] = { val dis = new ChiSquared(1.0) val dis2 = new ChiSquared(14.0) input.map(x => dis.pdf(x) * dis2.cdf(x)) } def sinx(input: DenseVector[Double]): DenseVector[Double] = { sin(input) } def x2(input: DenseVector[Double]): DenseVector[Double] = { pow(input, 2.0) } def time[R](block: => R)(tag: String): R = { val t0 = System.nanoTime() val result = block // call-by-name val t1 = System.nanoTime() println(s"$tag Elapsed time: " + (t1 - t0)/1e6 + "ms") result } "A Integrate" should "be well" in { time{ //val res1 = Integrate(f1, 0.0, 40.0) //println(s"chisq df=1 pdf|0,1: $res1") }("Chisq") //val res2 = Integrate(sinx, 0.0, 1.0) //val res3 = Integrate(x2, 0.0, 1.0) //println(s"sin(x)|0,1: $res2") //println(s"x^2|0,1: $res3") } }
Example 151
Source File: Qk21Spec.scala From seqspark with Apache License 2.0 | 5 votes |
package org.dizhang.seqspark.numerics import breeze.linalg.DenseVector import breeze.numerics.pow import breeze.stats.distributions.ChiSquared import org.scalatest.{FlatSpec, Matchers} class Qk21Spec extends FlatSpec with Matchers { val chisq = ChiSquared(1.0) def f(input: DenseVector[Double]): DenseVector[Double] = { input.map(x => chisq.pdf(x)) } "A Qk21" should "behave well" in { //val res = Qk21(f, 0.0, 1.0) //println(res) } }
Example 152
Source File: VisualLogger.scala From basel-face-pipeline with Apache License 2.0 | 5 votes |
package registration.utils import java.awt.Color import breeze.linalg.DenseVector import scalismo.geometry._3D import scalismo.mesh.TriangleMesh import scalismo.statisticalmodel.StatisticalMeshModel import scalismo.ui.api._ object VisualLogger { var ui : Option[ScalismoUI] = None//Some(ScalismoUI("Visual Logger")) val modelGroup = ui.map(_.createGroup("Model")) var modelView : Option[StatisticalMeshModelViewControls] = None val targetGroup = ui.map(_.createGroup("Target")) var targetMeshView : Option[TriangleMeshView] = None def showTargetMesh(targetMesh : TriangleMesh[_3D]) : Unit = { remove(targetMeshView) targetMeshView = show(VisualLogger.targetGroup, targetMesh, "target") targetMeshView.map(_.color = Color.RED) } def showStatisticalShapeModel(ssm : StatisticalMeshModel) : Unit = { removeModel(modelView) modelView = show(modelGroup, ssm, "gpmodel") modelView.map(_.meshView.opacity = 0.7) } def updateModelView(coeffs : DenseVector[Double]) : Unit = { if (modelView.isDefined) { modelView.get.shapeModelTransformationView.shapeTransformationView.coefficients = coeffs } } private def show[A](group : Option[Group], t : A, name : String)(implicit sic : ShowInScene[A]): Option[sic.View] = { for { ui <- ui g <- group } yield { ui.show(g, t, name) } } def remove[V <: ObjectView](view : Option[V]): Unit = { view.foreach(_.remove()) } def removeModel(view : Option[StatisticalMeshModelViewControls]): Unit = { for {v <- view} { v.meshView.remove() v.shapeModelTransformationView.remove() } } }
Example 153
Source File: PrepareReferenceLandmarks.scala From basel-face-pipeline with Apache License 2.0 | 5 votes |
package preprocessing import breeze.linalg.{DenseMatrix, DenseVector} import ch.unibas.cs.gravis.facepipeline.BU3DDataProvider import ch.unibas.cs.gravis.facepipeline.BU3DDataProvider.Expressions import ch.unibas.cs.gravis.facepipeline.{DataProvider, PipelineStep} import scalismo.faces.io.TLMSLandmarksIO import scalismo.statisticalmodel.MultivariateNormalDistribution object PrepareReferenceLandmarks { def main(args: Array[String]): Unit = { scalismo.initialize() PrepareReferenceLandmarks(BU3DDataProvider).run() } } case class PrepareReferenceLandmarks(dataProvider : DataProvider) extends PipelineStep { override def run(): Unit = { scalismo.initialize() val rawRefLmsFile = (dataProvider.repositoryRoot / "data" / "incoming" / "reference" / "landmarks" / "mean2012_l7_bfm_nomouth.tlms").jfile val referenceLandmarksTLMS = TLMSLandmarksIO.read3D(rawRefLmsFile).get val referenceLandmarks = for (lmTlms <- referenceLandmarksTLMS if lmTlms.visible) yield { val lm = lmTlms.toLandmark val noiseVariance = lm.id.trim match { case lmid if lmid.contains("eyebrow") => 3.0 case lmid if lmid.contains("eye.bottom") => 3.0 case lmid if lmid.contains("eye.top") => 3.0 case _ => 1.0 } lm.copy(uncertainty = Some(MultivariateNormalDistribution(DenseVector.zeros[Double](3), DenseMatrix.eye[Double](3) * noiseVariance))) } // Transfer the reference landmarks to all the expressions and save them. for (expression <- Expressions.expressionModelTypes()) { val neutralRef = dataProvider.incoming.reference.loadMesh(dataProvider.Neutral).get val expressionRef = dataProvider.registration.loadPriorModel(expression).get.referenceMesh val expressionLms = for (lm <- referenceLandmarks) yield { val id = neutralRef.pointSet.findClosestPoint(lm.point).id lm.copy(point = expressionRef.pointSet.point(id)) } dataProvider.incoming.reference.saveLandmarks(expression, expressionLms) } } }
Example 154
Source File: LshTable.scala From lsh-scala with Apache License 2.0 | 5 votes |
package io.krom.lsh import breeze.linalg.DenseVector abstract class LshTable(prefix: Option[String] = None) { def put(hash: String, label: String, point: DenseVector[Double]) def get(hash: String): List[(String, String, DenseVector[Double])] def update(hash: String, label: String, point: DenseVector[Double]) protected def createKey(hash: String): String = { prefix match { case None => hash case Some(p) => p + ":" + hash } } }
Example 155
Source File: InMemoryLshTable.scala From lsh-scala with Apache License 2.0 | 5 votes |
package io.krom.lsh import breeze.linalg.DenseVector import collection.mutable.HashMap import collection.mutable.HashSet import scala.collection.mutable class InMemoryLshTable(prefix: Option[String] = None) extends LshTable(prefix) { private val index = new HashMap[String, HashSet[String]]() private val table = new HashMap[String, (String, String, DenseVector[Double])]() override def put(hash: String, label: String, point: DenseVector[Double]) = { val key = createKey(hash) val value = (label, key, point) if (!index.keySet.contains(key)) index(key) = new HashSet[String]() index(key) += label table(label) = value } override def update(hash: String, label: String, point: DenseVector[Double]) = { val key = createKey(hash) val (_, oldKey, _) = table(label) val newValue = (label, key, point) table(label) = newValue if (key != oldKey) { index(oldKey) -= label if (!index.keySet.contains(key)) index(key) = new mutable.HashSet[String]() index(key) += label } } override def get(hash: String): List[(String, String, DenseVector[Double])] = { val key = createKey(hash) val items = if (index.keySet.contains(key)) index(key) else new HashSet() (for { item <- items } yield table(item)).toList } } object InMemoryLshTable { def createTables(numTables: Int, prefix: Option[String] = None): IndexedSeq[LshTable] = { for { _ <- 1 to numTables } yield new InMemoryLshTable(prefix) } }
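A minimal round-trip sketch of the in-memory table; get returns the stored triple with the optional prefix baked into the key, and update moves an existing label to a new bucket:

import breeze.linalg.DenseVector

val table = new InMemoryLshTable(Some("demo"))
table.put("hash01", "pointA", DenseVector(0.1, 0.2))
table.get("hash01")    // List(("pointA", "demo:hash01", DenseVector(0.1, 0.2)))
table.update("hash02", "pointA", DenseVector(0.3, 0.4))
table.get("hash01")    // List()
table.get("hash02")    // List(("pointA", "demo:hash02", DenseVector(0.3, 0.4)))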
Example 156
Source File: RedisLshTable.scala From lsh-scala with Apache License 2.0 | 5 votes |
package io.krom.lsh import breeze.linalg.DenseVector import com.lambdaworks.jacks.JacksMapper import com.redis.RedisClient import scala.collection.immutable.HashMap class RedisLshTable(redisdb: RedisClient, prefix: Option[String] = None) extends LshTable(prefix) { override def put(hash: String, label: String, point: DenseVector[Double]): Unit = { val key = createKey(hash) val value = (label, key, point.toArray) redisdb.pipeline { pipe => pipe.sadd(key, label) pipe.set(label, JacksMapper.writeValueAsString(value)) } } override def update(hash: String, label: String, point: DenseVector[Double]): Unit = { val key = createKey(hash) val item = redisdb.get(label) match { case None => return case Some(x:String) => JacksMapper.readValue[(String, String, Array[Double])](x) } val oldKey = item._2 val value = (label, key, point.toArray) redisdb.pipeline { pipe => pipe.set(label, JacksMapper.writeValueAsString(value)) if (key != oldKey) pipe.srem(oldKey, label) pipe.sadd(key, label) } } override def get(hash: String): List[(String, String, DenseVector[Double])] = { val key = createKey(hash) val items = redisdb.smembers(key) val itemDetails = redisdb.pipeline { pipe => for { item <- items.get if item.isDefined } pipe.get(item.get) } for { item <- itemDetails.get newItem = item match { case Some(x:String) => Some(JacksMapper.readValue[(String, String, Array[Double])](x)) case None => None } if newItem.isDefined } yield ( newItem.get._1, newItem.get._2, DenseVector(newItem.get._3) ) } } object RedisLshTable { def createTables(numTables: Int, redisConf: HashMap[String, String], prefix: Option[String] = None): IndexedSeq[LshTable] = { val redisHost = if (redisConf.contains("host")) redisConf("host") else "localhost" val redisPort = if (redisConf.contains("port")) Integer.parseInt(redisConf("port")) else 6379 for { redisDb <- 0 until numTables } yield new RedisLshTable(new RedisClient(redisHost, redisPort, redisDb), prefix) } }
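A sketch of wiring up the Redis-backed tables, assuming a Redis server is reachable on localhost; each table gets its own Redis database index (0 until numTables):

import scala.collection.immutable.HashMap

val tables = RedisLshTable.createTables(
  numTables = 4,
  redisConf = HashMap("host" -> "localhost", "port" -> "6379"),
  prefix = Some("demo"))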
Example 157
Source File: InMemoryLshTableSpec.scala From lsh-scala with Apache License 2.0 | 5 votes |
package io.krom.lsh import breeze.linalg.DenseVector import org.scalatest.FunSpec import org.scalatest.Matchers._ class InMemoryLshTableSpec extends FunSpec { describe("put without prefix") { it("should return the value just added") { val testPoint1 = DenseVector(0.1, 0.2) val testLabel1 = "point1" val testKey = "testhashkey" val table = new InMemoryLshTable() table.put(testKey, testLabel1, testPoint1) table.get(testKey).length should equal (1) table.get(testKey)(0) should equal (testLabel1, testKey, testPoint1) } it("should return multiple results when more than one value is added") { val testPoint1 = DenseVector(0.1, 0.2) val testLabel1 = "point1" val testPoint2 = DenseVector(0.3, 0.4) val testLabel2 = "point2" val testKey = "testhashkey" val table = new InMemoryLshTable() table.put(testKey, testLabel1, testPoint1) table.put(testKey, testLabel2, testPoint2) table.get(testKey).length should equal (2) val data = table.get(testKey).sortBy(_._1) data(0) should equal (testLabel1, testKey, testPoint1) data(1) should equal (testLabel2, testKey, testPoint2) } } describe("put with prefix") { it("should return the value just added") { val testPoint1 = DenseVector(0.1, 0.2) val testLabel1 = "point1" val testKey = "testhashkey" val testPrefix = "testprefix" val table = new InMemoryLshTable(Some(testPrefix)) table.put(testKey, testLabel1, testPoint1) table.get(testKey).length should equal (1) table.get(testKey)(0) should equal (testLabel1, testPrefix + ":" + testKey, testPoint1) } it("should return multiple results when more than one value is added") { val testPoint1 = DenseVector(0.1, 0.2) val testLabel1 = "point1" val testPoint2 = DenseVector(0.3, 0.4) val testLabel2 = "point2" val testKey = "testhashkey" val testPrefix = "testPrefix" val table = new InMemoryLshTable(Some(testPrefix)) table.put(testKey, testLabel1, testPoint1) table.put(testKey, testLabel2, testPoint2) table.get(testKey).length should equal (2) val data = table.get(testKey).sortBy(_._1) data(0) should equal (testLabel1, testPrefix + ":" + testKey, testPoint1) data(1) should equal (testLabel2, testPrefix + ":" + testKey, testPoint2) } } describe("update") { it("should change the value previously stored") { val testPoint = DenseVector(0.1, 0.2) val testUpdatedPoint = DenseVector(0.3, 0.4) val testKey1 = "testkey1" val testKey2 = "testkey2" val testPrefix = "testPrefix" val testLabel = "testData" val table = new InMemoryLshTable(Some(testPrefix)) table.put(testKey1, testLabel, testPoint) table.get(testKey1).length should equal (1) table.get(testKey1)(0) should equal (testLabel, testPrefix + ":" + testKey1, testPoint) table.update(testKey2, testLabel, testUpdatedPoint) table.get(testKey1).length should equal (0) table.get(testKey2).length should equal (1) table.get(testKey2)(0) should equal (testLabel, testPrefix + ":" + testKey2, testUpdatedPoint) } } }
Example 158
Source File: DistanceFunctionSpec.scala From lsh-scala with Apache License 2.0 | 5 votes |
package io.krom.lsh import breeze.linalg.DenseVector import org.scalatest.{Matchers, FunSpec} import DistanceFunction._ class DistanceFunctionSpec extends FunSpec with Matchers { describe("calculating Euclidean distance score") { it("should equal 1 over 1 plus the square root of the sum of the squares of the sides") { val point1 = DenseVector[Double](1.0, 0.0) val point2 = DenseVector[Double](0.0, 1.0) val point3 = DenseVector[Double](3.0, 0.0) euclideanDistance(point1, point1) should equal (1.0) euclideanDistance(point1, point2) should equal (1.0 / (1.0 + Math.sqrt(2.0))) euclideanDistance(point1, point3) should equal (1.0 / (1.0 + Math.sqrt(4.0))) euclideanDistance(point2, point3) should equal (1.0 / (1.0 + Math.sqrt(10.0))) } } describe("calculating Cosine distance score") { it("should equal 1 minus the cosine of the angle between the vectors") { val point1 = DenseVector(1.0, 0.0) val point2 = DenseVector(0.0, 1.0) val point3 = DenseVector(3.0, 0.0) val point4 = DenseVector(2.0, 3.0) val point5 = DenseVector(1.0, 1.5) val point6 = DenseVector(6.0, 9.0) cosineDistance(point1, point1) should equal (1.0) cosineDistance(point1, point2) should equal (0.0) cosineDistance(point2, point1) should equal (0.0) cosineDistance(point1, point3) should equal (1.0) cosineDistance(point4, point5) should equal (1.0) cosineDistance(point4, point6) should equal (1.0) val point7 = DenseVector(-1.0, 0.0) val point8 = DenseVector(0.0, -1.0) cosineDistance(point1, point7) should equal (1.0) cosineDistance(point1, point8) should equal (0.0) cosineDistance(point7, point8) should equal (0.0) val point9 = DenseVector(0.0, 0.0) cosineDistance(point9, point1).isNaN should be (true) } } }
Example 159
Source File: SparkHdfsLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkHdfsLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val tok = new java.util.StringTokenizer(line, " ") var y = tok.nextToken.toDouble var x = new Array[Double](D) var i = 0 while (i < D) { x(i) = tok.nextToken.toDouble; i += 1 } DataPoint(new DenseVector(x), y) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 2) { System.err.println("Usage: SparkHdfsLR <file> <iters>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkHdfsLR") .getOrCreate() val inputPath = args(0) val lines = spark.read.textFile(inputPath).rdd val points = lines.map(parsePoint).cache() val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
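Each input record is expected to carry the label first, followed by the D = 10 feature values, space separated; a sketch of parsing one made-up line:

// Hypothetical record: y, then 10 feature values.
val line = "1.0 0.5 -0.2 0.3 0.8 -1.1 0.0 0.7 0.9 -0.4 0.6"
val point = SparkHdfsLR.parsePoint(line)
// point.y == 1.0 and point.x is the DenseVector of the remaining 10 values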
Example 160
Source File: LocalLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- data) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 161
Source File: SparkKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import breeze.linalg.{squaredDistance, DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkKMeans { def parseVector(line: String): Vector[Double] = { DenseVector(line.split(' ').map(_.toDouble)) } def closestPoint(p: Vector[Double], centers: Array[Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 0 until centers.length) { val tempDist = squaredDistance(p, centers(i)) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { if (args.length < 3) { System.err.println("Usage: SparkKMeans <file> <k> <convergeDist>") System.exit(1) } showWarning() val spark = SparkSession .builder .appName("SparkKMeans") .getOrCreate() val lines = spark.read.textFile(args(0)).rdd val data = lines.map(parseVector _).cache() val K = args(1).toInt val convergeDist = args(2).toDouble val kPoints = data.takeSample(withReplacement = false, K, 42) var tempDist = 1.0 while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val pointStats = closest.reduceByKey{case ((p1, c1), (p2, c2)) => (p1 + p2, c1 + c2)} val newPoints = pointStats.map {pair => (pair._1, pair._2._1 * (1.0 / pair._2._2))}.collectAsMap() tempDist = 0.0 for (i <- 0 until K) { tempDist += squaredDistance(kPoints(i), newPoints(i)) } for (newP <- newPoints) { kPoints(newP._1) = newP._2 } println("Finished iteration (delta = " + tempDist + ")") } println("Final centers:") kPoints.foreach(println) spark.stop() } } // scalastyle:on println
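Both helpers are plain functions and can be exercised without a cluster; a small sketch:

import breeze.linalg.{DenseVector, Vector}

val p: Vector[Double] = SparkKMeans.parseVector("1.0 2.0 3.0")
val centers: Array[Vector[Double]] = Array(
  DenseVector(0.0, 0.0, 0.0),
  DenseVector(1.0, 2.0, 2.5))
SparkKMeans.closestPoint(p, centers)   // 1, since the second center is nearer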
Example 162
Source File: LocalFileLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import breeze.linalg.{DenseVector, Vector} object LocalFileLR { val D = 10 // Number of dimensions val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def parsePoint(line: String): DataPoint = { val nums = line.split(' ').map(_.toDouble) DataPoint(new DenseVector(nums.slice(1, D + 1)), nums(0)) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val lines = scala.io.Source.fromFile(args(0)).getLines().toArray val points = lines.map(parsePoint _) val ITERATIONS = args(1).toInt // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) var gradient = DenseVector.zeros[Double](D) for (p <- points) { val scale = (1 / (1 + math.exp(-p.y * (w.dot(p.x)))) - 1) * p.y gradient += p.x * scale } w -= gradient } println("Final w: " + w) } } // scalastyle:on println
Example 163
Source File: SparkLR.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.math.exp import breeze.linalg.{DenseVector, Vector} import org.apache.spark.sql.SparkSession object SparkLR { val N = 10000 // Number of data points val D = 10 // Number of dimensions val R = 0.7 // Scaling factor val ITERATIONS = 5 val rand = new Random(42) case class DataPoint(x: Vector[Double], y: Double) def generateData: Array[DataPoint] = { def generatePoint(i: Int): DataPoint = { val y = if (i % 2 == 0) -1 else 1 val x = DenseVector.fill(D) {rand.nextGaussian + y * R} DataPoint(x, y) } Array.tabulate(N)(generatePoint) } def showWarning() { System.err.println( """WARN: This is a naive implementation of Logistic Regression and is given as an example! |Please use org.apache.spark.ml.classification.LogisticRegression |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val spark = SparkSession .builder .appName("SparkLR") .getOrCreate() val numSlices = if (args.length > 0) args(0).toInt else 2 val points = spark.sparkContext.parallelize(generateData, numSlices).cache() // Initialize w to a random value var w = DenseVector.fill(D) {2 * rand.nextDouble - 1} println("Initial w: " + w) for (i <- 1 to ITERATIONS) { println("On iteration " + i) val gradient = points.map { p => p.x * (1 / (1 + exp(-p.y * (w.dot(p.x)))) - 1) * p.y }.reduce(_ + _) w -= gradient } println("Final w: " + w) spark.stop() } } // scalastyle:on println
Example 164
Source File: LocalKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 165
Source File: PosteriorLandmarkingInteractor.scala From scalismo-ui with GNU General Public License v3.0 | 5 votes |
package scalismo.ui.control.interactor.landmark.complex.posterior import breeze.linalg.DenseVector import scalismo.geometry._ import scalismo.statisticalmodel.MultivariateNormalDistribution import scalismo.ui.control.interactor.landmark.complex.ComplexLandmarkingInteractor import scalismo.ui.control.interactor.landmark.complex.ComplexLandmarkingInteractor.Delegate import scalismo.ui.model._ trait PosteriorLandmarkingInteractor extends ComplexLandmarkingInteractor[PosteriorLandmarkingInteractor] { private lazy val nodeVisibility = frame.sceneControl.nodeVisibility def previewNode: TriangleMeshNode def sourceGpNode: TransformationNode[DiscreteLowRankGpPointTransformation] def previewGpNode: TransformationNode[DiscreteLowRankGpPointTransformation] def targetUncertaintyGroup: GroupNode def targetGroupNode: GroupNode def inversePoseTransform: PointTransformation override protected def initialDelegate: Delegate[PosteriorLandmarkingInteractor] = { PosteriorReadyForCreating.enter() } def updatePreview(modelLm: LandmarkNode, targetLm: LandmarkNode, mousePosition: Point3D): Unit = { targetUncertaintyGroup.genericTransformations.foreach(_.remove()) targetUncertaintyGroup.genericTransformations.add((_: Point[_3D]) => mousePosition, "mousePosition") val lmPointAndId = { previewNode.source.pointSet.findClosestPoint(modelLm.source.point) } val uncertaintyMean = DenseVector(0.0, 0.0, 0.0) val uncertaintyCovModelLm = modelLm.uncertainty.value.toMultivariateNormalDistribution.cov val uncertaintyCovTargetLm = targetLm.uncertainty.value.toMultivariateNormalDistribution.cov val lmUncertainty = MultivariateNormalDistribution(uncertaintyMean, uncertaintyCovModelLm + uncertaintyCovTargetLm) // Here, we need to (inverse) transform the mouse position in order to feed an non-rotated deformation vector to the regression val coeffs = sourceGpNode.transformation.gp.coefficients( IndexedSeq((lmPointAndId.point, inversePoseTransform(mousePosition) - lmPointAndId.point, lmUncertainty)) ) previewGpNode.transformation = sourceGpNode.transformation.copy(coeffs) } def showPreview(): Unit = { nodeVisibility.setVisibility(previewNode, frame.perspective.viewports, show = true) } def hidePreview(): Unit = { nodeVisibility.setVisibility(previewNode, frame.perspective.viewports, show = false) } def initialize(): Unit = { previewNode.pickable.value = false hidePreview() } }
Example 166
Source File: LowRankGpPointTransformation.scala From scalismo-ui with GNU General Public License v3.0 | 5 votes |
package scalismo.ui.model import breeze.linalg.DenseVector import scalismo.common.{DiscreteDomain, NearestNeighborInterpolator} import scalismo.geometry.{_3D, EuclideanVector, Point} import scalismo.statisticalmodel.{DiscreteLowRankGaussianProcess, LowRankGaussianProcess} // This used to be a case class, but since it is extended by the discrete version, it can no longer be. // Therefore, the copy methods have to be defined manually. class LowRankGpPointTransformation protected (val gp: LowRankGaussianProcess[_3D, EuclideanVector[_3D]], val coefficients: DenseVector[Double]) extends PointTransformation { private lazy val vectorField = gp.instance(coefficients) override def apply(point: Point[_3D]): Point[_3D] = { point + vectorField(point) } def copy(coefficients: DenseVector[Double]): LowRankGpPointTransformation = new LowRankGpPointTransformation(gp, coefficients) } object LowRankGpPointTransformation { def apply(gp: LowRankGaussianProcess[_3D, EuclideanVector[_3D]], coefficients: DenseVector[Double]): LowRankGpPointTransformation = new LowRankGpPointTransformation(gp, coefficients) def apply(gp: LowRankGaussianProcess[_3D, EuclideanVector[_3D]]): LowRankGpPointTransformation = apply(gp, DenseVector.zeros[Double](gp.rank)) } class DiscreteLowRankGpPointTransformation private ( val dgp: DiscreteLowRankGaussianProcess[_3D, DiscreteDomain[_3D], EuclideanVector[_3D]], gp: LowRankGaussianProcess[_3D, EuclideanVector[_3D]], coefficients: DenseVector[Double] ) extends LowRankGpPointTransformation(gp, coefficients) { protected def this(dgp: DiscreteLowRankGaussianProcess[_3D, DiscreteDomain[_3D], EuclideanVector[_3D]], coefficients: DenseVector[Double]) = { this(dgp, dgp.interpolate(NearestNeighborInterpolator[_3D, EuclideanVector[_3D]]()), coefficients) } // no need to re-interpolate if the gp didn't change override def copy(coefficients: DenseVector[Double]): DiscreteLowRankGpPointTransformation = new DiscreteLowRankGpPointTransformation(dgp, gp, coefficients) } object DiscreteLowRankGpPointTransformation { def apply( dgp: DiscreteLowRankGaussianProcess[_3D, DiscreteDomain[_3D], EuclideanVector[_3D]] ): DiscreteLowRankGpPointTransformation = apply(dgp, DenseVector.zeros[Double](dgp.rank)) def apply(dgp: DiscreteLowRankGaussianProcess[_3D, DiscreteDomain[_3D], EuclideanVector[_3D]], coefficients: DenseVector[Double]): DiscreteLowRankGpPointTransformation = new DiscreteLowRankGpPointTransformation(dgp, coefficients) }
Example 167
Source File: QuadraticRenyiEntropy.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.prototype import breeze.linalg.DenseVector import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.rdd.RDD import io.github.mandar2812.dynaml.kernels.DensityKernel override def entropy(data: List[DenseVector[Double]]): Double = { val dim = data.head.length val root_two: breeze.linalg.Vector[Double] = DenseVector.fill(dim, sqrt(2)) val product = for(i <- data.view; j <- data.view) yield (i, j) -1*log_e(product.map((couple) => { val point1: DenseVector[Double] = couple._1 / sqrt(2.0) val point2: DenseVector[Double] = couple._2 / sqrt(2.0) density.eval(point1 - point2) }).sum) } override def entropy[K](data: RDD[(K, LabeledPoint)]): Double = { val dim = data.first()._2.features.size -1*log_e(data.cartesian(data).map((couple) =>{ val point1: DenseVector[Double] = DenseVector(couple._1._2.features.toArray) / sqrt(2.0) val point2: DenseVector[Double] = DenseVector(couple._2._2.features.toArray) / sqrt(2.0) density.eval(point1 - point2) }).reduce((a,b) => a + b)) } def entropyDifference(entropy: Double, data: List[DenseVector[Double]], add: DenseVector[Double], remove: DenseVector[Double]): Double = { val dim = data.head.length val expEntropy = math.exp(-1.0*entropy) val product1 = for(i <- data.view) yield (remove, i) val subtractEnt = 2*product1.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - density.eval(DenseVector.zeros(dim)) val product2 = for(i <- data.view) yield (add, i) val addEnt = 2*product2.map((couple) => { density.eval((couple._1 - couple._2) / sqrt(2.0)) }).sum - 2*density.eval((add - remove) / sqrt(2.0)) + density.eval(DenseVector.zeros(dim)) -1.0*log_e(expEntropy + addEnt - subtractEnt) - entropy } }
Example 168
Source File: Metrics.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.evaluation import breeze.linalg.DenseVector import org.apache.spark.rdd.RDD trait Metrics[P] { protected val scoresAndLabels: List[(P, P)] protected var name = "Value" def print(): Unit def generatePlots(): Unit = {} def kpi(): DenseVector[P] def setName(n: String): this.type = { name = n this } } object Metrics{ def apply(task: String) (scoresAndLabels: List[(Double, Double)], length: Int, logFlag: Boolean = false) : Metrics[Double] = task match { case "regression" => new RegressionMetrics(scoresAndLabels, length) case "classification" => new BinaryClassificationMetrics(scoresAndLabels, length, logFlag) } } object MetricsSpark { def apply(task: String) (scoresAndLabels: RDD[(Double, Double)], length: Long, minmax: (Double, Double)) : Metrics[Double] = task match { case "regression" => new RegressionMetricsSpark(scoresAndLabels, length) case "classification" => new BinaryClassificationMetricsSpark(scoresAndLabels, length, minmax) } }
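A small sketch of the factory in regression mode on a toy list of (prediction, target) pairs; kpi() returns the headline measures as a DenseVector:

val scoresAndLabels = List((1.1, 1.0), (1.9, 2.0), (3.2, 3.0))
val metrics = Metrics("regression")(scoresAndLabels, scoresAndLabels.length)
  .setName("toy target")
metrics.print()            // prints the regression performance measures
val kpis = metrics.kpi()   // DenseVector of the aggregated error metrics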
Example 169
Source File: RegressionMetricsSpark.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.evaluation import breeze.linalg.DenseVector import io.github.mandar2812.dynaml.graphics.charts.Highcharts._ import org.apache.log4j.{Priority, Logger} import org.apache.spark.Accumulator import org.apache.spark.broadcast.Broadcast import org.apache.spark.rdd.RDD import scalax.chart.module.ChartFactories.{XYBarChart, XYLineChart, XYAreaChart} histogram(residuals, numBins = 20) title("Histogram of Regression Residuals") } } object RegressionMetricsSpark { def computeKPIs(scoresAndLabels: RDD[(Double, Double)], size: Long) : (Double, Double, Double, Double) = { val mean: Accumulator[Double] = scoresAndLabels.context.accumulator(0.0, "mean") val err:DenseVector[Double] = scoresAndLabels.map((sc) => { val diff = sc._1 - sc._2 mean += sc._2 val difflog = math.pow(math.log(1 + math.abs(sc._1)) - math.log(math.abs(sc._2) + 1), 2) DenseVector(math.abs(diff), math.pow(diff, 2.0), difflog) }).reduce((a,b) => a+b) val SS_res = err(1) val mu: Broadcast[Double] = scoresAndLabels.context.broadcast(mean.value/size.toDouble) val SS_tot = scoresAndLabels.map((sc) => math.pow(sc._2 - mu.value, 2.0)).sum() val rmse = math.sqrt(SS_res/size.toDouble) val mae = err(0)/size.toDouble val rsq = if(1/SS_tot != Double.NaN) 1 - (SS_res/SS_tot) else 0.0 val rmsle = err(2)/size.toDouble (mae, rmse, rsq, rmsle) } }
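The companion's computeKPIs can be driven from a local SparkContext; a sketch on toy data (local master assumed):

import org.apache.spark.SparkContext

val sc = new SparkContext("local[2]", "regression-kpis")
val scoresAndLabels = sc.parallelize(Seq((1.1, 1.0), (1.8, 2.0), (3.3, 3.0)))
// (mean absolute error, root mean squared error, R^2, RMSLE)
val (mae, rmse, rsq, rmsle) = RegressionMetricsSpark.computeKPIs(scoresAndLabels, 3L)
sc.stop()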
Example 170
Source File: VectorIIDProbit.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability

import breeze.linalg.{DenseMatrix, DenseVector, diag}
import breeze.stats.distributions.Gaussian

  override def hessian(y: DenseVector[Double],
                       f: DenseVector[Double]): DenseMatrix[Double] = {
    diag(DenseVector((y.toArray zip f.toArray).map((couple) => {
      val n = standardGaussian.pdf(couple._2)
      val product = couple._1*couple._2
      val l = h(product)
      -1.0*(n*n)/(l*l) - product*n/l
    })))
  }

  override def gaussianExpectation(normalDistParams: (DenseVector[Double],
    DenseVector[Double])): DenseVector[Double] = {
    DenseVector((normalDistParams._1.toArray zip normalDistParams._2.toArray).map((couple) => {
      val gamma = math.sqrt(1.0 + couple._2)
      standardGaussian.pdf(couple._1/gamma)
    }))
  }
}
Example 171
Source File: StudentsTRV.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.stats.distributions.{ContinuousDistr, Moments, StudentsT}
import io.github.mandar2812.dynaml.algebra.{PartitionedPSDMatrix, PartitionedVector}
import io.github.mandar2812.dynaml.analysis.{PartitionedVectorField, VectorField}
import io.github.mandar2812.dynaml.probability.distributions._
import spire.implicits._
import spire.algebra.Field

abstract class AbstractStudentsTRandVar[
  T, V, Distr <: ContinuousDistr[T] with Moments[T, V] with HasErrorBars[T]](mu: Double)
  extends ContinuousRVWithDistr[T, Distr]

case class StudentsTRV(mu: Double, mean: Double, sigma: Double)
  extends AbstractStudentsTRandVar[Double, Double, UnivariateStudentsT](mu) {

  override val underlyingDist = UnivariateStudentsT(mu, mean, sigma)
}

case class MultStudentsTRV(
  mu: Double, mean: DenseVector[Double],
  covariance: DenseMatrix[Double])(
  implicit ev: Field[DenseVector[Double]])
  extends AbstractStudentsTRandVar[DenseVector[Double], DenseMatrix[Double], MultivariateStudentsT](mu) {

  override val underlyingDist: MultivariateStudentsT = MultivariateStudentsT(mu, mean, covariance)
}

object MultStudentsTRV {

  def apply(num_dim: Int)(mu: Double, mean: DenseVector[Double], covariance: DenseMatrix[Double]) = {
    assert(
      num_dim == mean.length,
      "Number of dimensions of vector space must match the number of elements of mean")

    implicit val ev = VectorField(num_dim)

    new MultStudentsTRV(mu, mean, covariance)
  }
}

case class MultStudentsTPRV(
  mu: Double,
  mean: PartitionedVector,
  covariance: PartitionedPSDMatrix)(
  implicit ev: Field[PartitionedVector])
  extends AbstractStudentsTRandVar[PartitionedVector, PartitionedPSDMatrix, BlockedMultivariateStudentsT](mu) {

  override val underlyingDist: BlockedMultivariateStudentsT = BlockedMultivariateStudentsT(mu, mean, covariance)
}

object MultStudentsTPRV {

  def apply(num_dim: Long, nE: Int)(mu: Double, mean: PartitionedVector, covariance: PartitionedPSDMatrix) = {
    assert(
      num_dim == mean.rows,
      "Number of dimensions of vector space must match the number of elements of mean")

    implicit val ev = PartitionedVectorField(num_dim, nE)

    new MultStudentsTPRV(mu, mean, covariance)
  }
}

case class MatrixTRV(
  mu: Double, m: DenseMatrix[Double],
  u: DenseMatrix[Double],
  v: DenseMatrix[Double])
  extends AbstractStudentsTRandVar[DenseMatrix[Double], (DenseMatrix[Double], DenseMatrix[Double]), MatrixT](mu) {

  override val underlyingDist = MatrixT(mu, m, v, u)
}
Example 172
Source File: VectorIIDSigmoid.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability

import breeze.linalg.{DenseMatrix, DenseVector, diag}
import breeze.numerics.sigmoid

  override def hessian(y: DenseVector[Double],
                       f: DenseVector[Double]): DenseMatrix[Double] = {
    diag(DenseVector((y.toArray zip f.toArray).map((couple) => {
      val pi = sigmoid(couple._1*couple._2)
      -1.0*pi*(1.0 - pi)
    })))
  }

  override def gaussianExpectation(normalDistParams: (DenseVector[Double],
    DenseVector[Double])): DenseVector[Double] = {
    DenseVector((normalDistParams._1.toArray zip normalDistParams._2.toArray).map((couple) => {
      val gamma = math.sqrt(1.0 + (math.Pi*couple._2/8.0))
      sigmoid(couple._1/gamma)
    }))
  }
}
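The gaussianExpectation above uses the well-known probit-style approximation E[sigmoid(x)] ~ sigmoid(mu / sqrt(1 + pi*sigma^2/8)) for x ~ N(mu, sigma^2). A standalone Breeze sketch of that formula, illustrative only and not the DynaML class itself:

import breeze.linalg.DenseVector
import breeze.numerics.sigmoid

// Elementwise approximation of E[sigmoid(x)] for x ~ N(mu_i, sigmaSq_i).
def sigmoidExpectation(mu: DenseVector[Double], sigmaSq: DenseVector[Double]): DenseVector[Double] =
  DenseVector(mu.toArray.zip(sigmaSq.toArray).map { case (m, s2) =>
    sigmoid(m / math.sqrt(1.0 + math.Pi * s2 / 8.0))
  })

val approx = sigmoidExpectation(DenseVector(0.0, 1.0, -2.0), DenseVector(0.5, 1.0, 2.0))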
Example 173
Source File: BlockedMultivariateStudentsT.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability.distributions

import breeze.linalg.DenseVector
import breeze.numerics._
import math.Pi
import breeze.stats.distributions._
import io.github.mandar2812.dynaml.algebra._
import io.github.mandar2812.dynaml.algebra.PartitionedMatrixOps._
import io.github.mandar2812.dynaml.algebra.PartitionedMatrixSolvers._
import io.github.mandar2812.dynaml.probability.RandomVariable
import spire.implicits._

import scala.runtime.ScalaRunTime

case class BlockedMultivariateStudentsT(
  mu: Double,
  mean: PartitionedVector,
  covariance: PartitionedPSDMatrix)(implicit rand: RandBasis = Rand)
  extends AbstractContinuousDistr[PartitionedVector]
    with Moments[PartitionedVector, PartitionedPSDMatrix]
    with HasErrorBars[PartitionedVector] {

  require(mu > 2.0, "Degrees of freedom must be greater than 2.0, for a multivariate t distribution to be defined")

  private val chisq = new ChiSquared(mu)

  def draw() = {
    val w = math.sqrt(mu/chisq.draw())
    val nE: Int = if(mean.rowBlocks > 1L) mean(0L to 0L)._data.head._2.length else mean.rows.toInt
    val z: PartitionedVector = PartitionedVector.rand(mean.rows, nE, RandomVariable(new StudentsT(mu)))*w
    val m: PartitionedVector = root * z
    m + mean
  }

  private lazy val root: LowerTriPartitionedMatrix = bcholesky(covariance)

  override def toString() = ScalaRunTime._toString(this)

  override def unnormalizedLogPdf(t: PartitionedVector) = {
    val centered: PartitionedVector = t - mean
    val z: PartitionedVector = root \ centered
    val slv: PartitionedVector = root.t \ z

    -0.5*(mu+mean.rows)*log(1.0 + ((slv dot centered) / mu))
  }

  override lazy val logNormalizer = {
    // determinant of the cholesky decomp is the sqrt of the determinant of the cov matrix
    // this is the log det of the cholesky decomp
    val det = bsum(blog(bdiag(root)))
    ((mean.rows/2) * (log(mu) + log(Pi))) + 0.5*det + lgamma(mu/2.0) - lgamma((mu+mean.rows)/2.0)
  }

  def variance = new PartitionedPSDMatrix(
    covariance._underlyingdata.map(c => (c._1, c._2*(mu/(mu-2.0)))),
    covariance.rows, covariance.cols, covariance.rowBlocks, covariance.colBlocks)

  def mode: PartitionedVector = mean

  //TODO: Check and correct calculation of entropy for Mult Students T
  lazy val entropy = {
    bsum(blog(bdiag(root))) +
      (mean.rows/2.0)*log(mu*Pi) + lbeta(mean.rows/2.0, mu/2.0) - lgamma(mean.rows/2.0) +
      (digamma((mu+mean.rows)/2.0) - digamma(mu/2.0))*(mu+mean.rows)/2.0
  }

  override def confidenceInterval(s: Double) = {
    val signFlag = if(s < 0) -1.0 else 1.0
    val nE: Int = if(mean.rowBlocks > 1L) mean(0L to 0L)._data.head._2.length else mean.rows.toInt
    val ones = PartitionedVector.ones(mean.rows, nE)
    val multiplier = signFlag*s
    val bar: PartitionedVector = root*(ones*(multiplier*math.sqrt(mu/(mu-2.0))))

    (mean - bar, mean + bar)
  }
}
Example 174
Source File: MixtureDistribution.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.probability.distributions

import breeze.linalg.{DenseVector, sum}
import breeze.stats.distributions.{ContinuousDistr, Moments, Multinomial}
import spire.algebra.VectorSpace

class MixtureWithConfBars[I, V](
  distributions: Seq[ContinuousDistr[I] with Moments[I, V] with HasErrorBars[I]],
  probabilities: Multinomial[DenseVector[Double], Int])(
  implicit vI: VectorSpace[I, Double])
  extends MixtureDistribution[I](distributions, probabilities)
    with HasErrorBars[I] {

  private val weightsArr = probabilities.params.toArray

  override def confidenceInterval(s: Double) =
    distributions.zip(weightsArr).map(c => {
      val (lower, upper) = c._1.confidenceInterval(s)
      (vI.timesr(lower, c._2), vI.timesr(upper, c._2))
    }).reduce((a,b) =>
      (vI.plus(a._1, b._1), vI.plus(a._2, b._2))
    )

  def mean = distributions.zip(weightsArr)
    .map(c => vI.timesr(c._1.mean, c._2))
    .reduce((a,b) => vI.plus(a,b))
}

object MixtureWithConfBars {

  def apply[I, V](
    distributions: Seq[ContinuousDistr[I] with Moments[I, V] with HasErrorBars[I]],
    weights: DenseVector[Double])(
    implicit vI: VectorSpace[I, Double]): MixtureWithConfBars[I, V] =
    new MixtureWithConfBars(distributions, new Multinomial[DenseVector[Double], Int](weights))
}
Example 175
Source File: SparkBlockedVector.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.algebra

import breeze.linalg.{DenseVector, NumericOps}
import org.apache.spark.rdd.RDD
import org.apache.spark.storage.StorageLevel

import scala.collection.immutable.NumericRange

  def vertcat(vectors: SparkBlockedVector*): SparkBlockedVector = {
    //sanity check
    assert(vectors.map(_.colBlocks).distinct.length == 1,
      "In case of vertical concatenation of matrices their columns sizes must be equal")

    val sizes = vectors.map(_.rowBlocks)
    new SparkBlockedVector(vectors.zipWithIndex.map(couple => {
      val offset = sizes.slice(0, couple._2).sum
      couple._1._data.map(c => (c._1+offset, c._2))
    }).reduceLeft((a,b) => a.union(b)))
  }
}
Example 176
Source File: MinMaxAccumulator.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.utils

import breeze.linalg.DenseVector
import org.apache.spark.AccumulatorParam

object MinMaxAccumulator extends AccumulatorParam[DenseVector[Double]] {
  def zero(initialValue: DenseVector[Double]): DenseVector[Double] = {
    DenseVector(Double.MaxValue, Double.MinValue)
  }

  def addInPlace(v1: DenseVector[Double], v2: DenseVector[Double]): DenseVector[Double] = {
    v1(0) = math.min(v1(0), v2(0))
    v1(1) = math.max(v1(1), v2(1))
    v1
  }
}
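The accumulator above merges per-partition (min, max) pairs encoded as two-element vectors. Outside of Spark the same merge can be exercised with a plain fold, which is a quick way to sanity-check the logic; note that AccumulatorParam itself is a legacy Spark 1.x API. The data below is made up for illustration.

import breeze.linalg.DenseVector

val values = Seq(3.0, -1.5, 7.2, 0.0)

// Encode each value as DenseVector(min, max) and merge with addInPlace.
val minMax = values
  .map(v => DenseVector(v, v))
  .foldLeft(MinMaxAccumulator.zero(DenseVector(0.0, 0.0)))(MinMaxAccumulator.addInPlace)

// minMax(0) == -1.5 and minMax(1) == 7.2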
Example 177
Source File: PCAScaler.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.utils

import breeze.linalg.eig.Eig
import breeze.linalg.{DenseMatrix, DenseVector, eig}
import io.github.mandar2812.dynaml.pipes.{ReversibleScaler, Scaler}

case class PCAScaler(
  center: DenseVector[Double],
  eigenvalues: DenseVector[Double],
  eigenvectors: DenseMatrix[Double]) extends ReversibleScaler[DenseVector[Double]] {

  self =>

  override val i = Scaler((data: DenseVector[Double]) => (eigenvectors*data)+center)

  override def run(data: DenseVector[Double]) = eigenvectors.t*(data-center)

  def apply(r: Range): CompressedPCAScaler = CompressedPCAScaler(
    r, self.center, self.eigenvalues, self.eigenvectors)
}

case class CompressedPCAScaler(
  r: Range,
  center: DenseVector[Double],
  eigenvalues: DenseVector[Double],
  eigenvectors: DenseMatrix[Double]) extends Scaler[DenseVector[Double]] {

  override def run(data: DenseVector[Double]) = {
    val projections = eigenvectors.t*(data-center)
    projections(r)
  }
}
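A minimal sketch of how such a scaler could be fitted by hand: estimate the sample mean and covariance of a toy data set, take its symmetric eigendecomposition with Breeze's eigSym, and use the result to build the PCAScaler above. The data and the inverse-transform call are illustrative assumptions, not taken from DynaML.

import breeze.linalg.{DenseMatrix, DenseVector, eigSym}

val data = Seq(
  DenseVector(2.5, 2.4), DenseVector(0.5, 0.7),
  DenseVector(2.2, 2.9), DenseVector(1.9, 2.2))

// Sample mean and covariance of the toy data.
val center = data.reduce(_ + _) / data.length.toDouble
val centered = data.map(_ - center)
val cov = centered.map(x => x * x.t).reduce(_ + _) / (data.length - 1).toDouble

// The symmetric eigendecomposition supplies the principal axes.
val es = eigSym(cov)
val scaler = PCAScaler(center, es.eigenvalues, es.eigenvectors)

val projected = scaler.run(DenseVector(2.0, 2.0))  // coordinates in the eigenbasis
val restored = scaler.i.run(projected)             // maps back close to the original point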
Example 178
Source File: MetaModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.ensemble

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.modelpipe.ModelPipe
import io.github.mandar2812.dynaml.models.Model
import io.github.mandar2812.dynaml.models.gp.GPRegression
import io.github.mandar2812.dynaml.models.neuralnets.FeedForwardNetwork

abstract class MetaModel[
  D, D1,
  BaseModel <: Model[D1, DenseVector[Double], Double],
  Pipe <: ModelPipe[D, D1, DenseVector[Double], Double, BaseModel]
](num: Long, data: D, networks: Pipe*)
  extends Model[D, DenseVector[Double], Double] {

  override protected val g = data

  val baseNetworks: List[BaseModel] =
    networks.toList.map(_(g))
}
Example 179
Source File: NeuralLayer.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.neuralnets

import breeze.linalg.{DenseMatrix, DenseVector}
import io.github.mandar2812.dynaml.pipes.{DataPipe, MetaPipe}

class NeuralLayerFactory[P, I, J](
  metaLocalField: MetaPipe[P, I, J],
  val activationFunc: Activation[J]) extends DataPipe[P, NeuralLayer[P, I, J]] {

  override def run(params: P) = NeuralLayer(metaLocalField, activationFunc)(params)
}

class Vec2VecLayerFactory(act: Activation[DenseVector[Double]])(inDim: Int, outDim: Int)
  extends NeuralLayerFactory[
    (DenseMatrix[Double], DenseVector[Double]),
    DenseVector[Double], DenseVector[Double]](
    MetaPipe((p: (DenseMatrix[Double], DenseVector[Double])) => (x: DenseVector[Double]) => p._1*x + p._2),
    act) {

  override def run(params: (DenseMatrix[Double], DenseVector[Double])) = {
    require(
      params._1.cols == inDim &&
        params._1.rows == outDim &&
        params._2.length == outDim,
      "Weight matrix and bias vector sizes must be consistent for a Vector to Vector layer")
    super.run(params)
  }
}
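Stripped of the factory and pipe machinery, the local computation a vector-to-vector layer performs is an affine map followed by an elementwise activation. A Breeze-only sketch of that computation; tanh is used here as a stand-in activation and is not the DynaML Activation type.

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.numerics.tanh

// weights: outDim x inDim, bias: outDim.
def affineLayer(weights: DenseMatrix[Double], bias: DenseVector[Double])
               (x: DenseVector[Double]): DenseVector[Double] =
  tanh(weights * x + bias)

val layer = affineLayer(DenseMatrix((0.5, -0.2), (0.1, 0.3)), DenseVector(0.0, 0.1)) _
val out = layer(DenseVector(1.0, 2.0))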
Example 180
Source File: FeedForwardNetwork.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.neuralnets

import breeze.linalg.DenseVector
import com.tinkerpop.blueprints.Graph
import com.tinkerpop.frames.FramedGraph
import io.github.mandar2812.dynaml.graph.FFNeuralGraph
import io.github.mandar2812.dynaml.optimization.BackPropagation
import io.github.mandar2812.dynaml.pipes.DataPipe

  def test(d: D): Stream[(DenseVector[Double], DenseVector[Double])] = {
    val (procInputs, _) = dataAsStream(d)
      .map(c =>
        (c._1.toArray.toList.map(i => List(i)), c._2.toArray.toList.map(i => List(i))))
      .reduce((c1,c2) =>
        (c1._1.zip(c2._1).map(c => c._1++c._2), c1._2.zip(c2._2).map(c => c._1++c._2)))

    val predictedOutputBuffer = params.predictBatch(procInputs)

    //dataAsStream(d).map(rec => (feedForward(rec._1), rec._2))
    dataAsStream(d).map(_._2).zipWithIndex.map(c =>
      (DenseVector.tabulate[Double](outputDimensions)(dim => predictedOutputBuffer(dim)(c._2)),
        c._1)
    )
  }
}
Example 181
Source File: CommitteeNetwork.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.neuralnets

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.graph.FFNeuralGraph
import io.github.mandar2812.dynaml.models.LinearModel
import io.github.mandar2812.dynaml.optimization.{BackPropagation, CommitteeModelSolver, RegularizedOptimizer}
import io.github.mandar2812.dynaml.pipes.DataPipe

  def test(d: D): Stream[(DenseVector[Double], DenseVector[Double])] = {
    val (procInputs, _) = dataAsStream(d)
      .map(c =>
        (c._1.toArray.toList.map(i => List(i)), c._2.toArray.toList.map(i => List(i))))
      .reduce((c1,c2) =>
        (c1._1.zip(c2._1).map(c => c._1++c._2), c1._2.zip(c2._2).map(c => c._1++c._2)))

    val committeepredictions = baseNetworks.map(network => {
      network.predictBatch(procInputs)
    })

    dataAsStream(d).map(_._2).zipWithIndex.map(c => {
      val votes = DenseVector.tabulate[Double](baseNetworks.length)(Ndim =>
        committeepredictions(Ndim)(0)(c._2))

      val prediction: Double = params dot votes

      (DenseVector(prediction), c._1)
    })
  }
}
Example 182
Source File: LSSVMCommittee.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.svm

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.modelpipe.DLSSVMPipe
import io.github.mandar2812.dynaml.models.ensemble.CommitteeModel
import io.github.mandar2812.dynaml.optimization._
import org.apache.spark.rdd.RDD

class LSSVMCommittee(num: Long,
                     data: RDD[(DenseVector[Double], Double)],
                     pipes: DLSSVMPipe[RDD[(DenseVector[Double], Double)]]*)
  extends CommitteeModel[
    RDD[(DenseVector[Double], Double)],
    Stream[(DenseVector[Double], Double)],
    DLSSVM, DLSSVMPipe[RDD[(DenseVector[Double], Double)]]](num, data, pipes:_*) {

  override protected val optimizer: RegularizedOptimizer[
    DenseVector[Double], DenseVector[Double],
    Double, RDD[(DenseVector[Double], Double)]] = new RDDCommitteeSolver

  var modelTuners: List[ModelTuner[DLSSVM, DLSSVM]] =
    baseNetworks.map(m => new GridSearch[DLSSVM](m).setGridSize(10).setStepSize(0.1))

  override def learn(): Unit = {
    //First tune and learn the base SVM models
    (baseNetworks zip modelTuners).foreach(modelCouple => {
      val (_, conf) = modelCouple._2.optimize(modelCouple._1.getState, Map())
      modelCouple._1.setState(conf)
      modelCouple._1.learn()
    })
    //Now learn the committee weights
    val fMap = featureMap
    params = optimizer.optimize(
      num_points,
      g.map(patternCouple => (fMap(patternCouple._1), patternCouple._2)),
      initParams())
  }
}
Example 183
Source File: StudentTProcessMixture.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.stp

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.algebra.{PartitionedPSDMatrix, PartitionedVector}
import io.github.mandar2812.dynaml.analysis.InnerProductPV
import io.github.mandar2812.dynaml.models.GenContinuousMixtureModel
import io.github.mandar2812.dynaml.probability.MultStudentsTPRV
import io.github.mandar2812.dynaml.probability.distributions.BlockedMultivariateStudentsT
import spire.algebra.VectorSpace

import scala.reflect.ClassTag

class StudentTProcessMixture[T, I: ClassTag](
  override val component_processes: Seq[AbstractSTPRegressionModel[T, I]],
  override val weights: DenseVector[Double])
  extends GenContinuousMixtureModel[
    T, I, Double, PartitionedVector,
    PartitionedPSDMatrix, BlockedMultivariateStudentsT,
    MultStudentsTPRV, AbstractSTPRegressionModel[T, I]](component_processes, weights) {

  protected val blockSize: Int = component_processes.head._blockSize

  override protected def toStream(y: PartitionedVector): Stream[Double] = y.toStream

  override protected def getVectorSpace(num_dim: Int): VectorSpace[PartitionedVector, Double] =
    InnerProductPV(num_dim, blockSize)
}
Example 184
Source File: StudentTRegression.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.stp

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.evaluation.RegressionMetrics
import io.github.mandar2812.dynaml.kernels.{DiracKernel, LocalScalarKernel}
import io.github.mandar2812.dynaml.pipes.{DataPipe, StreamDataPipe}

  override def energy(h: Map[String, Double],
                      options: Map[String, String]): Double =
    validationSet.length match {
      case 0 => super.energy(h, options)
      case _ =>
        // Calculate regression metrics on validation set
        // Return some function of kpi as energy
        setState(h)
        val resultsToScores = DataPipe(
          (res: Seq[(DenseVector[Double], Double, Double, Double, Double)]) =>
            res.map(i => (i._3, i._2)).toStream)

        (resultsToScores > processTargets > scoresToEnergy) run this.test(validationSet)
    }
}
Example 185
Source File: MVTMixture.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.stp

import breeze.linalg.{DenseMatrix, DenseVector}
import io.github.mandar2812.dynaml.analysis.MatrixVectorSpace
import io.github.mandar2812.dynaml.models.GenContinuousMixtureModel
import io.github.mandar2812.dynaml.probability.MatrixTRV
import io.github.mandar2812.dynaml.probability.distributions.MatrixT
import spire.algebra.VectorSpace

import scala.reflect.ClassTag

class MVTMixture[T, I: ClassTag](
  override val component_processes: Seq[MVStudentsTModel[T, I]],
  override val weights: DenseVector[Double])
  extends GenContinuousMixtureModel[
    T, I, DenseVector[Double], DenseMatrix[Double],
    (DenseMatrix[Double], DenseMatrix[Double]),
    MatrixT, MatrixTRV,
    MVStudentsTModel[T, I]](component_processes, weights) {

  val num_outputs: Int = component_processes.head.num_outputs

  override protected def toStream(y: DenseMatrix[Double]): Stream[DenseVector[Double]] =
    (0 until y.rows).toStream.map(index => y(index,::).t)

  override protected def getVectorSpace(num_dim: Int): VectorSpace[DenseMatrix[Double], Double] =
    MatrixVectorSpace(num_dim, num_outputs)
}
Example 186
Source File: SparkGLM.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.lm

import breeze.linalg.{DenseMatrix, DenseVector}
import io.github.mandar2812.dynaml.optimization.{RegularizedLSSolver, RegularizedOptimizer}
import org.apache.spark.rdd.RDD

  override def prepareData(d: RDD[(DenseVector[Double], Double)]) = {

    val phi = featureMap
    val mapFunc = (xy: (DenseVector[Double], Double)) => {
      val phiX = DenseVector(phi(xy._1).toArray ++ Array(1.0))
      val phiY = phiX*xy._2
      (phiX*phiX.t, phiY)
    }

    d.mapPartitions((partition) => {
      Iterator(partition.map(mapFunc).reduce((a,b) => (a._1+b._1, a._2+b._2)))
    }).reduce((a,b) => (a._1+b._1, a._2+b._2))
  }
}
Example 187
Source File: SparkLogisticGLM.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.lm

//Breeze Imports
import breeze.linalg.DenseVector
import breeze.numerics.sigmoid
import breeze.stats.distributions.Gaussian
import io.github.mandar2812.dynaml.optimization.ProbitGradient
import org.apache.spark.mllib.linalg.Vectors

//DynaML Imports
import io.github.mandar2812.dynaml.optimization.{
  GradientDescentSpark, LogisticGradient,
  RegularizedOptimizer, SquaredL2Updater}

//Spark Imports
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD

class SparkProbitGLM(
  data: RDD[(DenseVector[Double], Double)], numPoints: Long,
  map: (DenseVector[Double]) => DenseVector[Double] = identity[DenseVector[Double]])
  extends SparkLogisticGLM(data, numPoints, map) {

  private val standardGaussian = new Gaussian(0, 1.0)

  override val h: (Double) => Double = (x: Double) => standardGaussian.cdf(x)

  override protected val optimizer: RegularizedOptimizer[
    DenseVector[Double], DenseVector[Double],
    Double, RDD[LabeledPoint]] = new GradientDescentSpark(new ProbitGradient, new SquaredL2Updater)
}
Example 188
Source File: LogisticGLM.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.lm

import breeze.linalg.DenseVector
import breeze.numerics._
import breeze.stats.distributions.Gaussian
import io.github.mandar2812.dynaml.optimization._

class ProbitGLM(
  data: Stream[(DenseVector[Double], Double)],
  numPoints: Int,
  map: (DenseVector[Double]) => DenseVector[Double] = identity[DenseVector[Double]])
  extends LogisticGLM(data, numPoints, map) {

  private val standardGaussian = new Gaussian(0, 1.0)

  override val h = (x: Double) => standardGaussian.cdf(x)

  override protected val optimizer =
    new GradientDescent(new ProbitGradient, new SquaredL2Updater)
}
Example 189
Source File: RegularizedGLM.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.lm

import breeze.linalg.{DenseMatrix, DenseVector}
import io.github.mandar2812.dynaml.optimization.{GloballyOptimizable, RegularizedLSSolver, RegularizedOptimizer}

class RegularizedGLM(
  data: Stream[(DenseVector[Double], Double)],
  numPoints: Int,
  map: (DenseVector[Double]) => DenseVector[Double] = identity[DenseVector[Double]])
  extends GeneralizedLinearModel[(DenseMatrix[Double], DenseVector[Double])](data, numPoints, map)
    with GloballyOptimizable {

  override val task = "regression"

  override protected val optimizer: RegularizedOptimizer[
    DenseVector[Double], DenseVector[Double],
    Double, (DenseMatrix[Double], DenseVector[Double])] = new RegularizedLSSolver

  override def prepareData(d: Stream[(DenseVector[Double], Double)]) = {

    val designMatrix = DenseMatrix.vertcat[Double](
      d.map(point => DenseVector(featureMap(point._1).toArray ++ Array(1.0)).toDenseMatrix):_*
    )

    val responseVector = DenseVector.vertcat(
      d.map(p => DenseVector(p._2)):_*
    )

    (designMatrix.t*designMatrix, designMatrix.t*responseVector)
  }
}
Example 190
Source File: GPNarModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.kernels.LocalScalarKernel
import io.github.mandar2812.dynaml.pipes.DataPipe

import scala.annotation.tailrec
import scala.collection.mutable.{MutableList => ML}

class GPNarModel(order: Int,
                 cov: LocalScalarKernel[DenseVector[Double]],
                 nL: LocalScalarKernel[DenseVector[Double]],
                 trainingdata: Seq[(DenseVector[Double], Double)],
                 meanFunc: DataPipe[DenseVector[Double], Double] = DataPipe(_ => 0.0))
  extends GPRegression(cov, nL, trainingdata, meanFunc) {

  val modelOrder = order

  def modelPredictedOutput(n: Int)(input: DenseVector[Double]): Seq[(Double, Double, Double)] = {
    assert(modelOrder == input.length, "Model order must be equal to dimension of input")

    @tailrec
    def predictAheadRec(num: Int,
                        features: DenseVector[Double],
                        predictions: ML[(Double, Double, Double)]): Seq[(Double, Double, Double)] =
      num match {
        case 0 => predictions.toSeq
        case _ =>
          val pred: (DenseVector[Double], Double, Double, Double) =
            predictionWithErrorBars[Seq[DenseVector[Double]]](Seq(features), 2).head

          val newFeatures = DenseVector(features(1 until modelOrder).toArray ++ Array(pred._2))
          predictAheadRec(num-1, newFeatures, predictions.+=:((pred._2, pred._3, pred._4)))
      }

    predictAheadRec(n, input, ML())
  }
}
Example 191
Source File: MOGPRegressionModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp

import breeze.linalg.{DenseMatrix, DenseVector}
import io.github.mandar2812.dynaml.kernels.LocalScalarKernel
import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2}
import io.github.mandar2812.dynaml.probability.distributions.MatrixNormal
import org.apache.log4j.Logger

  override def dataAsSeq(data: Stream[(I, DenseVector[Double])]): Seq[((I, Int), Double)] =
    data.map((patternAndLabel) =>
      patternAndLabel._2.mapPairs((i, label) =>
        ((patternAndLabel._1, i), label)
      ).toArray.toSeq).reduceLeft((s1, s2) => s1 ++ s2)
}

class KroneckerMOGPModel[I](
  covFunc: LocalScalarKernel[I], noiseCovFunc: LocalScalarKernel[I], coRegCov: LocalScalarKernel[Int],
  data: Stream[(I, DenseVector[Double])], num: Int, numOutputs: Int,
  meanFunc: DataPipe[(I, Int), Double] = DataPipe((_: (I, Int)) => 0.0))
  extends MOGPRegressionModel[I](covFunc:*coRegCov, noiseCovFunc:* coRegCov, data, num, numOutputs, meanFunc) {

  val (covFPipe, noiseCovPipe, coRegCovPipe) = (covFunc.asPipe, noiseCovFunc.asPipe, coRegCov.asPipe)

  override def energy(h: Map[String, Double], options: Map[String, String]): Double = {
    setState(h)
    val (features, targets) = data.unzip

    val covMatrix: DenseMatrix[Double] = covFunc
      .buildKernelMatrix(features, features.length)
      .getKernelMatrix()

    val noiseMatrix: DenseMatrix[Double] = noiseCovFunc
      .buildKernelMatrix(features, features.length)
      .getKernelMatrix()

    val colCovMatrix = coRegCov
      .buildKernelMatrix(0 until noutputs, noutputs)
      .getKernelMatrix()

    val meanMat: DenseMatrix[Double] = DenseMatrix.vertcat(
      features.map(instance =>
        DenseVector.tabulate[Double](noutputs)(o => mean((instance, o))).asDenseMatrix):_*
    )

    val mvn = MatrixNormal(meanMat, covMatrix+noiseMatrix, colCovMatrix)
    -mvn.logPdf(DenseMatrix.vertcat(targets.map(_.asDenseMatrix):_*))
  }
}
Example 192
Source File: GPNarXModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp

import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.kernels.LocalScalarKernel
import io.github.mandar2812.dynaml.pipes.DataPipe

class GPNarXModel(
  order: Int,
  ex: Int,
  cov: LocalScalarKernel[DenseVector[Double]],
  nL: LocalScalarKernel[DenseVector[Double]],
  trainingdata: Seq[(DenseVector[Double], Double)],
  meanFunc: DataPipe[DenseVector[Double], Double] = DataPipe(_ => 0.0))
  extends GPRegression(cov, nL, trainingdata, meanFunc) {

  val modelOrder = order

  val exogenousInputs = ex
}
Example 193
Source File: GPBasisFuncRegressionModel.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.models.gp

import breeze.linalg.{DenseMatrix, DenseVector, cholesky, trace, inv}
import breeze.numerics.{log, sqrt}
import io.github.mandar2812.dynaml.algebra._
import io.github.mandar2812.dynaml.analysis._
import io.github.mandar2812.dynaml.algebra.PartitionedMatrixOps._
import io.github.mandar2812.dynaml.algebra.PartitionedMatrixSolvers._
import io.github.mandar2812.dynaml.kernels._
import io.github.mandar2812.dynaml.models.{ContinuousProcessModel, SecondOrderProcessModel}
import io.github.mandar2812.dynaml.optimization.GloballyOptWithGrad
import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2}
import io.github.mandar2812.dynaml.probability.{MultGaussianPRV, MultGaussianRV}
import org.apache.log4j.Logger

import scala.reflect.ClassTag

abstract class GPBasisFuncRegressionModel[T, I: ClassTag](
  cov: LocalScalarKernel[I], n: LocalScalarKernel[I],
  data: T, num: Int,
  basisFunc: DataPipe[I, DenseVector[Double]],
  basis_param_prior: MultGaussianRV)
  extends AbstractGPRegressionModel[T, I](cov, n, data, num) {

  val MultGaussianRV(b, covB) = basis_param_prior

  implicit val vf = VectorField(b.length)

  private lazy val lowB = cholesky(covB)

  private lazy val covBsolveb = lowB.t \ (lowB \ b)

  private lazy val h: PartitionedMatrix = PartitionedMatrix.horzcat(_blockSize)(trainingData.map(basisFunc(_)):_*)

  override val mean: DataPipe[I, Double] =
    basisFunc > DataPipe((h: DenseVector[Double]) => h.t * b)

  private val basisFeatureMap: DataPipe[I, DenseVector[Double]] =
    basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x)

  val feature_map_cov = CovarianceFunction(basisFunc > DataPipe((x: DenseVector[Double]) => lowB*x))

  override protected def getTrainKernelMatrix[U <: Seq[I]] = {
    SVMKernel.buildPartitionedKernelMatrix(trainingData,
      trainingData.length, _blockSize, _blockSize,
      (x: I, y: I) => {
        covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y) + noiseModel.evaluate(x, y)
      })
  }

  override protected def getCrossKernelMatrix[U <: Seq[I]](test: U) =
    SVMKernel.crossPartitonedKernelMatrix(
      trainingData, test, _blockSize, _blockSize,
      (x: I, y: I) => {
        covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)
      })

  override protected def getTestKernelMatrix[U <: Seq[I]](test: U) =
    SVMKernel.buildPartitionedKernelMatrix(
      test, test.length.toLong, _blockSize, _blockSize,
      (x: I, y: I) => {
        covariance.evaluate(x, y) + feature_map_cov.evaluate(x, y)
      })
}
Example 194
Source File: GLMPipe.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.modelpipe

import breeze.linalg.{DenseMatrix, DenseVector}
import io.github.mandar2812.dynaml.algebra.PartitionedPSDMatrix
import io.github.mandar2812.dynaml.kernels.LocalScalarKernel
import io.github.mandar2812.dynaml.models.lm.{GeneralizedLeastSquaresModel, GeneralizedLinearModel, GenericGLM, SparkGLM}
import io.github.mandar2812.dynaml.pipes.{DataPipe, DataPipe2, DataPipe3}
import org.apache.spark.rdd.RDD

object SparkGLMPipe2 extends DataPipe2[
  RDD[(DenseVector[Double], Double)],
  DataPipe[DenseVector[Double], DenseVector[Double]],
  GenericGLM[
    RDD[(DenseVector[Double], Double)],
    (DenseMatrix[Double], DenseVector[Double])]] {

  override def run(
    data1: RDD[(DenseVector[Double], Double)],
    data2: DataPipe[DenseVector[Double], DenseVector[Double]]) = {
    val length = data1.count()
    new SparkGLM(data1, length, data2.run)
  }
}
Example 195
Source File: DLSSVMPipe.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.modelpipe

import io.github.mandar2812.dynaml.pipes.DataPipe
import breeze.linalg.DenseVector
import io.github.mandar2812.dynaml.kernels.LocalScalarKernel
import io.github.mandar2812.dynaml.models.svm.DLSSVM

class DLSSVMPipe[Source](
  pre: DataPipe[Source, Stream[(DenseVector[Double], Double)]],
  cov: LocalScalarKernel[DenseVector[Double]],
  task: String = "regression")
  extends ModelPipe[Source, Stream[(DenseVector[Double], Double)], DenseVector[Double], Double, DLSSVM] {

  override val preProcess = pre

  override def run(data: Source) = {
    val training = preProcess(data)
    new DLSSVM(training, training.length, cov, task)
  }
}
Example 196
Source File: MixturePipe.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.modelpipe

import breeze.linalg.{DenseMatrix, DenseVector}
import breeze.stats.distributions.{ContinuousDistr, Moments}
import io.github.mandar2812.dynaml.algebra.{PartitionedPSDMatrix, PartitionedVector}
import io.github.mandar2812.dynaml.models.gp.AbstractGPRegressionModel
import io.github.mandar2812.dynaml.models.stp.{AbstractSTPRegressionModel, MVStudentsTModel}
import io.github.mandar2812.dynaml.models.{
  ContinuousProcessModel, GenContinuousMixtureModel,
  SecondOrderProcessModel, StochasticProcessMixtureModel}
import io.github.mandar2812.dynaml.optimization.GloballyOptimizable
import io.github.mandar2812.dynaml.pipes.DataPipe2
import io.github.mandar2812.dynaml.probability.{ContinuousRVWithDistr, MatrixTRV, MultGaussianPRV, MultStudentsTPRV}
import io.github.mandar2812.dynaml.probability.distributions.{
  BlockedMultiVariateGaussian, BlockedMultivariateStudentsT,
  HasErrorBars, MatrixT}

import scala.reflect.ClassTag

abstract class MixturePipe[
  T, I: ClassTag, Y, YDomain, YDomainVar,
  BaseDistr <: ContinuousDistr[YDomain]
    with Moments[YDomain, YDomainVar]
    with HasErrorBars[YDomain],
  W1 <: ContinuousRVWithDistr[YDomain, BaseDistr],
  BaseProcess <: ContinuousProcessModel[T, I, Y, W1]
    with SecondOrderProcessModel[T, I, Y, Double, DenseMatrix[Double], W1]
    with GloballyOptimizable]
  extends DataPipe2[
    Seq[BaseProcess], DenseVector[Double],
    GenContinuousMixtureModel[T, I, Y, YDomain, YDomainVar, BaseDistr, W1, BaseProcess]]

class GPMixturePipe[T, I: ClassTag]
  extends MixturePipe[
    T, I, Double, PartitionedVector, PartitionedPSDMatrix,
    BlockedMultiVariateGaussian, MultGaussianPRV,
    AbstractGPRegressionModel[T, I]] {

  override def run(
    models: Seq[AbstractGPRegressionModel[T, I]],
    weights: DenseVector[Double]) =
    StochasticProcessMixtureModel(models, weights)
}

class StudentTMixturePipe[T, I: ClassTag]
  extends MixturePipe[
    T, I, Double, PartitionedVector, PartitionedPSDMatrix,
    BlockedMultivariateStudentsT, MultStudentsTPRV,
    AbstractSTPRegressionModel[T, I]] {

  override def run(
    models: Seq[AbstractSTPRegressionModel[T, I]],
    weights: DenseVector[Double]) =
    StochasticProcessMixtureModel(models, weights)
}

class MVStudentsTMixturePipe[T, I: ClassTag]
  extends MixturePipe[
    T, I, DenseVector[Double], DenseMatrix[Double],
    (DenseMatrix[Double], DenseMatrix[Double]),
    MatrixT, MatrixTRV,
    MVStudentsTModel[T, I]] {

  override def run(
    models: Seq[MVStudentsTModel[T, I]],
    weights: DenseVector[Double]) =
    StochasticProcessMixtureModel(models, weights)
}
Example 197
Source File: RegularizedLSSolver.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.{DenseMatrix, DenseVector}

  override def optimize(nPoints: Long,
                        ParamOutEdges: (DenseMatrix[Double], DenseVector[Double]),
                        initialP: DenseVector[Double]): DenseVector[Double] = {
    val (designMatrix, labels) = ParamOutEdges

    val smoother = DenseMatrix.tabulate[Double](initialP.length, initialP.length)((i,j) => {
      if(i != j) 0.0 else if(i < initialP.length-1) regParam else 1.0
    })

    //Construct matrix A and b block by block
    val A = designMatrix + smoother
    val b = labels

    A\b
  }
}
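In isolation, the solver above just solves a smoothed normal-equations system (X^T X + S) w = X^T y, where S puts the regularisation parameter on every diagonal entry except the trailing bias term, which receives 1.0 in the listing. A Breeze-only sketch of that step, with regParam passed explicitly instead of being read from the enclosing optimizer:

import breeze.linalg._

def ridgeSolve(xtx: DenseMatrix[Double],
               xty: DenseVector[Double],
               regParam: Double): DenseVector[Double] = {
  val dim = xty.length
  // Regularise every weight except the trailing bias entry, as in the listing above.
  val smoother = DenseMatrix.tabulate[Double](dim, dim)((i, j) =>
    if (i != j) 0.0 else if (i < dim - 1) regParam else 1.0)
  (xtx + smoother) \ xty
}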
Example 198
Source File: LaplacePosteriorMode.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.{DenseMatrix, DenseVector, cholesky, inv}
import breeze.numerics.sqrt
import io.github.mandar2812.dynaml.DynaMLPipe._
import io.github.mandar2812.dynaml.pipes.DataPipe
import io.github.mandar2812.dynaml.probability.Likelihood

  override def optimize(nPoints: Long,
                        ParamOutEdges: (DenseMatrix[Double], DenseVector[Double]),
                        initialP: DenseVector[Double]): DenseVector[Double] =
    LaplacePosteriorMode.run(
      nPoints, ParamOutEdges,
      this.likelihood, initialP,
      this.numIterations,
      identityPipe[(DenseMatrix[Double], DenseVector[Double])])
}

object LaplacePosteriorMode {

  def run[T](nPoints: Long, data: T,
             likelihood: Likelihood[
               DenseVector[Double], DenseVector[Double],
               DenseMatrix[Double], (DenseVector[Double], DenseVector[Double])],
             initialP: DenseVector[Double], numIterations: Int,
             transform: DataPipe[T, (DenseMatrix[Double], DenseVector[Double])]): DenseVector[Double] = {

    val (kMat, y) = transform(data)
    var mode = initialP

    var b = DenseVector.zeros[Double](y.length)
    var a = DenseVector.zeros[Double](y.length)
    val id = DenseMatrix.eye[Double](y.length)

    (1 to numIterations).foreach{ iter =>
      val wMat = likelihood.hessian(y, mode) * -1.0
      val wMatsq = sqrt(wMat)

      val L = cholesky(id + wMatsq*kMat*wMatsq)
      b = wMat*mode + likelihood.gradient(y, mode)
      val buff1 = wMatsq*kMat*b
      val buff2 = inv(L)*buff1

      a = b - inv(wMatsq*L.t)*buff2
      mode = kMat*a
    }

    mode
  }
}
Example 199
Source File: QuasiNewtonOptimizer.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.{DenseMatrix, DenseVector, inv}
import io.github.mandar2812.dynaml.pipes.DataPipe
import org.apache.log4j.Logger
import spire.implicits._

  override def optimize(
    nPoints: Long,
    ParamOutEdges: Stream[(DenseVector[Double], Double)],
    initialP: DenseVector[Double]): DenseVector[Double] =
    QuasiNewtonOptimizer.run(
      nPoints, this.regParam, this.numIterations, updater, gradient,
      this.stepSize, initialP, ParamOutEdges,
      DataPipe(identity[Stream[(DenseVector[Double], Double)]] _)
    )
}

object QuasiNewtonOptimizer {

  private val logger = Logger.getLogger(this.getClass)

  def run[T](
    nPoints: Long, regParam: Double, numIterations: Int,
    updater: HessianUpdater, gradient: Gradient, stepSize: Double,
    initial: DenseVector[Double], POutEdges: T,
    transform: DataPipe[T, Stream[(DenseVector[Double], Double)]],
    logging: Boolean = true, logging_rate: Int = 100): DenseVector[Double] = {

    var oldW: DenseVector[Double] = initial
    var newW = oldW

    val hessian = transform(POutEdges)
      .map(_._1)
      .map(x => DenseVector(x.toArray ++ Array(1.0)))
      .map(x => x*x.t)
      .reduce((x: DenseMatrix[Double], y: DenseMatrix[Double]) => x + y)

    var regInvHessian = inv(hessian + DenseMatrix.eye[Double](initial.length)*regParam)
    var oldCumGradient = DenseVector.zeros[Double](initial.length)

    println("Performing Quasi-Newton Optimization")

    cfor(1)(iter => iter < numIterations, iter => iter + 1)(iter => {
      val cumGradient: DenseVector[Double] = DenseVector.zeros(initial.length)
      var cumLoss: Double = 0.0

      transform(POutEdges).foreach(ed => {
        val x = DenseVector(ed._1.toArray ++ Array(1.0))
        val y = ed._2
        cumLoss += gradient.compute(x, y, oldW, cumGradient)
      })

      if(logging && iter % logging_rate == 0)
        RegularizedOptimizer.prettyPrint(iter, cumLoss/nPoints.toDouble)

      //Find the search direction p = inv(H)*grad(J)
      //perform update x_new = x + step*p
      val searchDirection = regInvHessian*cumGradient*(-1.0)
      newW = updater.compute(oldW, searchDirection, stepSize, iter, regParam)._1

      regInvHessian = updater.hessianUpdate(regInvHessian, newW-oldW, cumGradient-oldCumGradient)
      oldW = newW
      oldCumGradient = cumGradient
    })

    newW
  }
}
Example 200
Source File: CommitteeModelSolver.scala From DynaML with Apache License 2.0 | 5 votes |
package io.github.mandar2812.dynaml.optimization

import breeze.linalg.{DenseMatrix, DenseVector, inv}
import org.apache.spark.rdd.RDD

  override def optimize(nPoints: Long,
                        ParamOutEdges: RDD[(DenseVector[Double], Double)],
                        initialP: DenseVector[Double]): DenseVector[Double] = {
    val sumMat = ParamOutEdges.map(couple => {
      val diff = couple._1 - DenseVector.fill[Double](couple._1.length)(couple._2)
      diff * diff.t
    }).reduce((mat1, mat2) => mat1+mat2)

    sumMat :/= nPoints.toDouble
    val ones = DenseVector.ones[Double](initialP.length)
    val invMat = inv(sumMat + DenseMatrix.eye[Double](initialP.length)*regParam)
    val ans: DenseVector[Double] = invMat*ones
    val Z: Double = ones dot ans
    ans/Z
  }
}
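The optimize method above is a weighted committee solve: invert a regularised error matrix, multiply by a vector of ones, and rescale so the weights sum to one. A Breeze-only sketch of that final step, with the error matrix supplied directly instead of being accumulated from an RDD:

import breeze.linalg.{DenseMatrix, DenseVector, inv}

def committeeWeights(errorMat: DenseMatrix[Double], regParam: Double): DenseVector[Double] = {
  val dim = errorMat.rows
  val ones = DenseVector.ones[Double](dim)
  // w = (S + lambda*I)^(-1) * 1, normalised to sum to one.
  val weights = inv(errorMat + DenseMatrix.eye[Double](dim) * regParam) * ones
  weights / (ones dot weights)
}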