breeze.stats.distributions.Gaussian Scala Examples

The following examples show how to use breeze.stats.distributions.Gaussian. Each example notes the source file, project, and license it was taken from.
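Before diving in, here is a minimal sketch of the core API the examples rely on. Note that Gaussian is parameterised by mean and standard deviation (not variance), and that, depending on the Breeze version, an implicit RandBasis may need to be in scope:

import breeze.stats.distributions.Gaussian

object GaussianBasics extends App {
  // Parameters are the mean and *standard deviation*, not the variance.
  val g = Gaussian(mu = 0.0, sigma = 1.0)

  val x: Double = g.draw()                  // a single sample
  val xs: IndexedSeq[Double] = g.sample(5)  // five samples

  println(g.pdf(0.0))   // density at 0, about 0.3989
  println(g.cdf(1.96))  // about 0.975
}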
Example 1
Source File: generalDLM.scala    From DynaML   with Apache License 2.0
package io.github.mandar2812.dynaml.models.lm

import java.io.{File, PrintWriter}
import java.lang.Math.sqrt

import breeze.stats.distributions.Gaussian

import scalaz.Scalaz._


object generalDLM {
  type Loglikelihood = Double
  type Observation = Double
  type State = Double
  type Time = Double

  case class Data(time: Time, observation: Observation, state: Option[State]) {
    override def toString = state match {
      case Some(x) => s"$time, $observation, $x"
      case None => s"$time, $observation"
    }
  }
  case class Parameters(a: Double, b: Double, l1: Double, l2: Double) {
    override def toString = s"$a, $b, $l1, $l2" 
  }

  def simulate(p: Parameters): Stream[Data] = {
    val stateSpace = unfold(Gaussian(p.l1, sqrt(p.l2)).draw)(x =>
      Some(x, x + Gaussian(0, sqrt(p.b)).draw)
    )
    stateSpace.zipWithIndex map { case (x, t) => 
      Data(t, x + Gaussian(0, sqrt(p.a)).draw, Some(x)) }
  }

  val runSimulateFirstOrder = {
    val p = Parameters(3.0, 0.5, 0.0, 10.0)
    // simulate 16 different realisations of 100 observations, representing 16 stations
    val observations = (1 to 16) map (id => (id, simulate(p).take(100).toVector))

    val pw = new PrintWriter(new File("data/firstOrderdlmRes.csv"))
    pw.write(
      observations.
        flatMap{ case (id, data) =>
          data map (x => id + ", " + x.toString)}.
        mkString("\n"))
    pw.close()
  }
} 
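The unfold used in simulate comes from the Scalaz import. A Scalaz-free sketch of the same state stream, using only the standard library and reusing the snippet's Parameters:

import java.lang.Math.sqrt
import breeze.stats.distributions.Gaussian

// Emits x0, x0 + n1, x0 + n1 + n2, ... exactly like the unfold above.
def stateSpaceStdlib(p: Parameters): Stream[Double] =
  Stream.iterate(Gaussian(p.l1, sqrt(p.l2)).draw)(x => x + Gaussian(0, sqrt(p.b)).draw)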
Example 2
Source File: StatisticalVolumeModelTests.scala    From scalismo   with Apache License 2.0
package scalismo.statisticalmodel.experimental

import java.io.File
import java.net.URLDecoder

import breeze.linalg.DenseVector
import breeze.stats.distributions.Gaussian
import scalismo.ScalismoTestSuite
import scalismo.geometry.{_3D, Point}
import scalismo.io.StatismoIO
import scalismo.registration.{RigidTransformation, RigidTransformationSpace}
import scalismo.utils.Random

class StatisticalVolumeModelTests extends ScalismoTestSuite {

  implicit val random = Random(42)

  implicit def doubleToFloat(d: Double): Float = d.toFloat

  describe("A statistical Volume mesh model") {

    def compareModels(oldModel: StatisticalVolumeMeshModel, newModel: StatisticalVolumeMeshModel): Unit = {

      for (i <- 0 until 10) {
        val standardNormal = Gaussian(0, 1)(random.breezeRandBasis)
        val coeffsData = standardNormal.sample(oldModel.rank)
        val coeffs = DenseVector(coeffsData.toArray)
        val inst = oldModel.instance(coeffs)
        val instNew = newModel.instance(coeffs)
        inst.pointSet.points
          .zip(instNew.pointSet.points)
          .foreach {
            case (pt1, pt2) =>
              (pt1.toVector - pt2.toVector).norm should be(0.0 +- (0.1))
          }
      }
    }

    it("can be transformed forth and back and yield the same deformations") {
      val path = getClass.getResource("/TetraMeshModel2.h5").getPath
      val model = StatismoIO.readStatismoVolumeMeshModel(new File(URLDecoder.decode(path))).get

      val parameterVector = DenseVector[Double](1.5, 1.0, 3.5, Math.PI, -Math.PI / 2.0, -Math.PI)
      val rigidTransform = RigidTransformationSpace[_3D]().transformForParameters(parameterVector)
      val inverseTransform = rigidTransform.inverse.asInstanceOf[RigidTransformation[_3D]]
      val transformedModel = model.transform(rigidTransform)
      val newModel = transformedModel.transform(inverseTransform)
      compareModels(model, newModel)
    }

    it("can change the mean shape and still yield the same shape space") {

      val path = getClass.getResource("/TetraMeshModel2.h5").getPath
      val model = StatismoIO.readStatismoVolumeMeshModel(new File(URLDecoder.decode(path))).get

      val newMesh = model.sample

      def t(pt: Point[_3D]): Point[_3D] = {
        val ptId = model.referenceVolumeMesh.pointSet.findClosestPoint(pt).id
        newMesh.pointSet.point(ptId)
      }

      val newModel = model.changeReference(t)

      compareModels(model, newModel)
    }

  }
} 
Example 3
Source File: RandomTests.scala    From scalismo   with Apache License 2.0
package scalismo.utils

import breeze.stats.distributions.{Gaussian, Uniform}
import scalismo.ScalismoTestSuite

class RandomTests extends ScalismoTestSuite {

  describe("A random source") {

    it("should yield a deterministic sequence when correctly seeded") {

      def randomNumbersSeeded() = {
        val r = Random(42)
        val standardNormal = Gaussian(0, 1)(r.breezeRandBasis)
        val uniform = Uniform(0, 1)(r.breezeRandBasis)
        (r.scalaRandom.nextInt, standardNormal.draw(), uniform.draw())
      }
      randomNumbersSeeded() should equal(randomNumbersSeeded())

    }

    it("should yield a random sequence when the predefined implicit is imported") {

      def randomNumbersNotSeeded() = {
        import scalismo.utils.Random.implicits._
        val r = implicitly[Random]
        val standardNormal = Gaussian(0, 1)(r.breezeRandBasis)
        val uniform = Uniform(0, 1)(r.breezeRandBasis)
        (r.scalaRandom.nextInt, standardNormal.draw(), uniform.draw())
      }
      randomNumbersNotSeeded() should not equal (randomNumbersNotSeeded())
    }
  }
} 
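The Random(42) wrapper above is scalismo-specific. With plain Breeze, the same determinism comes from passing an explicitly seeded RandBasis, the pattern Examples 6, 11 and 20 below also use:

import breeze.stats.distributions.{Gaussian, RandBasis, ThreadLocalRandomGenerator}
import org.apache.commons.math3.random.MersenneTwister

val rb: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(42)))
val g = Gaussian(0.0, 1.0)(rb)  // deterministic draws for a fixed seed
println(g.sample(3))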
Example 4
Source File: ExpectedImprovement.scala    From pravda-ml   with Apache License 2.0
package com.linkedin.photon.ml.hyperparameter.criteria

import breeze.linalg.DenseVector
import breeze.numerics.sqrt
import breeze.stats.distributions.Gaussian
import com.linkedin.photon.ml.hyperparameter.estimators.PredictionTransformation


// Note: this excerpt omits the enclosing class; in photon-ml the method
// belongs to an expected-improvement transformation carrying the incumbent
// best evaluation and a standard normal, both reconstructed here.
class ExpectedImprovement(bestEvaluation: Double) extends PredictionTransformation {

  private val standardNormal = Gaussian(0, 1)

  def apply(
      predictiveMeans: DenseVector[Double],
      predictiveVariances: DenseVector[Double]): DenseVector[Double] = {

    val std = sqrt(predictiveVariances)

    // PBO Eq. 1
    val gamma = - (predictiveMeans - bestEvaluation) / std

    // Eq. 2
    std :* ((gamma :* gamma.map(standardNormal.cdf)) + gamma.map(standardNormal.pdf))
  }
} 
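For a single point, the vectorised transformation above reduces to the usual expected-improvement formula; a scalar sketch (best is the incumbent minimum):

import breeze.stats.distributions.Gaussian

def expectedImprovement(mean: Double, variance: Double, best: Double): Double = {
  val std = math.sqrt(variance)
  val gamma = (best - mean) / std          // same sign convention as above
  val n = Gaussian(0, 1)
  std * (gamma * n.cdf(gamma) + n.pdf(gamma))
}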
Example 5
Source File: Arms.scala    From banditsbook-scala   with MIT License
package com.github.everpeace.banditsbook.arm

import breeze.stats.distributions.{Bernoulli, Gaussian}

trait Arms {

  def BernoulliArm(p: Double): Arm[Double] = Bernoulli.distribution(p).map {
    case true  => 1.0d
    case false => 0.0d
  }

  def NormalArm(μ: Double, σ: Double): Arm[Double]  = Gaussian.distribution(μ -> σ)

  def AdversarialArm(start: Int, activeStart: Int, activeEnd: Int) = new Arm[Double] {
    @volatile var t = start
    override def draw(): Double = {
      t += 1
      t match {
        case _t if _t < activeStart => 0.0d
        case _t if activeStart <= _t && _t <= activeEnd => 1.0
        case _t => 0.0d
      }
    }
  }
} 
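A quick usage sketch of these arms, assuming (as in the banditsbook repo) that Arm[A] is an alias for Breeze's Rand[A]:

object ArmsDemo extends Arms {
  def main(args: Array[String]): Unit = {
    val arm = NormalArm(0.0, 1.0)
    println(Seq.fill(5)(arm.draw()))  // five pulls from the N(0, 1) arm
  }
}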
Example 6
Source File: RandNLATest.scala    From spectrallda-tensorspark   with Apache License 2.0
package edu.uci.eecs.spectralLDA.utils

import breeze.linalg._
import breeze.linalg.qr.QR
import breeze.stats.distributions.{Gaussian, RandBasis, ThreadLocalRandomGenerator, Uniform}
import edu.uci.eecs.spectralLDA.testharness.Context
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.SparkContext
import org.scalatest._


class RandNLATest extends FlatSpec with Matchers {

  private val sc: SparkContext = Context.getSparkContext

  "M2 sketching" should "be correct" in {
    val a1 = SparseVector(DenseVector.rand[Double](100).toArray)
    val a2 = SparseVector(DenseVector.rand[Double](100).toArray)
    val a3 = SparseVector(DenseVector.rand[Double](100).toArray)

    val docs = Seq((1000L, a1), (1001L, a2), (1002L, a3))
    val docsRDD = sc.parallelize(docs)

    // Random Gaussian matrix
    val g = DenseMatrix.rand[Double](100, 50, Gaussian(mu = 0.0, sigma = 1.0))

    val result = DenseMatrix.zeros[Double](100, 50)
    docsRDD
      .flatMap {
        case (id: Long, w: SparseVector[Double]) => RandNLA.accumulate_M_mul_S(g, w, sum(w))
      }
      .reduceByKey(_ + _)
      .collect
      .foreach {
        case (r: Int, a: DenseVector[Double]) => result(r, ::) := a.t
      }

    val m2 = docsRDD
      .map {
        case (id: Long, w: SparseVector[Double]) =>
          val l = sum(w)
          (w * w.t - diag(w)) / (l * (l - 1.0))
      }
      .reduce(_ + _)
    val expectedResult = m2 * g

    val diff: DenseMatrix[Double] = result - expectedResult
    val normDiff: Double = norm(norm(diff(::, *)).toDenseVector)
    normDiff should be <= 1e-8
  }

  "Randomised Power Iteration method" should "be approximately correct" in {
    implicit val randBasis: RandBasis =
      new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(234787)))

    val n = 100
    val k = 5

    val alpha: DenseVector[Double] = DenseVector[Double](25.0, 20.0, 15.0, 10.0, 5.0)
    val beta: DenseMatrix[Double] = DenseMatrix.rand(n, k, Uniform(0.0, 1.0))

    val norms = norm(beta(::, *)).toDenseVector
    for (j <- 0 until k) {
      beta(::, j) /= norms(j)
    }

    val a: DenseMatrix[Double] = beta * diag(alpha) * beta.t
    val sigma: DenseMatrix[Double] = DenseMatrix.rand(n, k, Gaussian(mu = 0.0, sigma = 1.0))
    val y = a * sigma
    val QR(q: DenseMatrix[Double], _) = qr.reduced(y)

    val (s: DenseVector[Double], u: DenseMatrix[Double]) = RandNLA.decomp2(a * q, q)

    val diff_a = u * diag(s) * u.t - a
    val norm_diff_a = norm(norm(diff_a(::, *)).toDenseVector)

    norm_diff_a should be <= 1e-8
  }
} 
Example 7
Source File: sbt-test-test.scala    From scala-course   with GNU General Public License v3.0
import org.scalatest.funsuite.AnyFunSuite

// Using the FunSuite style here, though other styles are possible.

class SetSuite extends AnyFunSuite {

  test("An empty Set should have size 0") {
    assert(Set.empty.size == 0)
  }

  test("A Gaussian sample of length 10 should have length 10") {
    import breeze.stats.distributions.Gaussian
    val x = Gaussian(2.0,4.0).sample(10)
    assert(x.length === 10)
  }

  test("Cats map merge") {
    import cats.instances.all._
    import cats.syntax.semigroup._
    val m1 = Map("a"->1,"b"->2)
    val m2 = Map("b"->2,"c"->1)
    val m3 = m1 |+| m2
    val m4 = Map("b" -> 4, "c" -> 1, "a" -> 1)
    assert(m3 === m4)
  }

}


// eof 
Example 8
Source File: AutoEncoderSpec.scala    From DynaML   with Apache License 2.0
package io.github.mandar2812.dynaml.models.neuralnets

import breeze.linalg.{DenseVector, sum}
import breeze.stats.distributions.{Gaussian, Uniform}
import io.github.mandar2812.dynaml.evaluation.MultiRegressionMetrics
import io.github.mandar2812.dynaml.pipes.DataPipe
import io.github.mandar2812.dynaml.probability.RandomVariable
import spire.implicits._

import org.scalatest.{FlatSpec, Matchers}

class AutoEncoderSpec extends FlatSpec with Matchers {

  // Note: the enclosing spec class was missing from this excerpt and has been
  // reconstructed; the test is marked `ignore` in the original.
  ignore should "be able to learn a continuous, " +
    "invertible identity map x = g(h(x))" in {

    val uni = new Uniform(-math.Pi, math.Pi)
    val theta = RandomVariable(new Uniform(-math.Pi, math.Pi))
    val circleTransform = DataPipe((t: Double) => (math.cos(t), math.sin(t)))
    val rvOnCircle = theta > circleTransform
    //Create synthetic data set of x,y values

    val noise = new Gaussian(0.0, 0.02)

    val numPoints:Int = 4000
    val epsilon = 0.05

    val data = (1 to numPoints).map(_ => {
      val sample = rvOnCircle.draw
      val features = DenseVector(sample._1, sample._2)
      val augFeatures = DenseVector(
        math.pow(0.85*features(1), 2) + noise.draw,
        math.pow(0.45*features(0), 3) + noise.draw,
        math.pow(features(0)+0.85*features(1), 3) + noise.draw,
        math.pow(features(0)-0.5*features(1), 2) + noise.draw,
        math.pow(features(0)+features(1), 3) + noise.draw,
        math.pow(features(0)-features(1), 2) + noise.draw,
        math.pow(features(0)+0.4*features(1), 2) + noise.draw,
        math.pow(features(0)+0.5*features(1), 3) + noise.draw)

      augFeatures
    })

    val (trainingData, testData) = (data.take(3000), data.takeRight(1000))

    val enc = GenericAutoEncoder(List(8, 4, 4, 8), List(VectorTansig, VectorTansig, VectorTansig))

    //BackPropagation.rho = 0.5

    enc.optimizer.setRegParam(0.0001).setStepSize(0.1).setNumIterations(1000).momentum_(0.5)

    enc.learn(trainingData.toStream)

    val metrics = new MultiRegressionMetrics(
      testData.map(c => (enc.i(enc.f(c)), c)).toList,
      testData.length)

    println("Corr: "+metrics.corr)
    assert(sum(metrics.mae)/metrics.corr.length <= epsilon)

  }

} 
Example 9
Source File: NeuralNetSpec.scala    From DynaML   with Apache License 2.0
package io.github.mandar2812.dynaml.models.neuralnets

import breeze.linalg.{DenseVector, sum}
import breeze.stats.distributions.{Gaussian, Uniform}
import io.github.mandar2812.dynaml.DynaMLPipe
import io.github.mandar2812.dynaml.evaluation.MultiRegressionMetrics
import io.github.mandar2812.dynaml.graph.FFNeuralGraph
import org.scalatest.{FlatSpec, Matchers}


class NeuralNetSpec extends FlatSpec with Matchers {

  "A feed-forward neural network" should "be able to learn non-linear functions "+
    "on a compact domain" in {
    val uni = new Uniform(0.0, 1.0)
    //Create synthetic data set of x,y values
    //x is sampled in unit hypercube, y = w.x + noise
    val noise = new Gaussian(0.0, 0.002)
    val uniH = new Uniform(0.0, 1.0)


    val numPoints:Int = 5000

    val data = (1 to numPoints).map(_ => {
      val features = DenseVector.tabulate[Double](4)(_ => uniH.draw)

      val (x,y,u,v) = (features(0), features(1), features(2), features(3))

      val target = DenseVector(
        1.0 + x*x + y*y*y + v*u*v + v*u + noise.draw,
        1.0 + x*u + u*y*y + v*v*v + u*u*u + noise.draw)

      (features, target)
    })

    val (trainingData, testData) = (data.take(4000), data.takeRight(1000))

    val epsilon = 0.85

    val model = new FeedForwardNetwork[
      Stream[(DenseVector[Double], DenseVector[Double])]](
      trainingData.toStream,
      FFNeuralGraph(4, 2, 0, List("logsig", "linear"), List(10), biasFlag = true))(
      DynaMLPipe.identityPipe[Stream[(DenseVector[Double], DenseVector[Double])]])

    model.setLearningRate(1.0)
      .setRegParam(0.01)
      .setMomentum(0.8)
      .setMaxIterations(150)
      .learn()

    val res = model.test(testData.toStream)

    val metrics = new MultiRegressionMetrics(res.toList, res.length)
    //println(metrics.Rsq)
    assert(sum(metrics.corr)/metrics.Rsq.length >= epsilon)
  }
} 
Example 10
Source File: mcmc.scala    From DynaML   with Apache License 2.0
package io.github.mandar2812.dynaml.optimization

import java.io.{File, PrintWriter}
import breeze.numerics.log
import breeze.stats.distributions.{Gaussian, Uniform}
import io.github.mandar2812.dynaml.models.lm.KFilter
import io.github.mandar2812.dynaml.models.lm.generalDLM._
import KFilter._
import scalaz.Scalaz._

object mcmc {
  case class MetropolisState(params: Parameters, accepted: Int, ll: Loglikelihood)

    
  def metropolisIters(
    initParams: Parameters,
    likelihood: Parameters => Loglikelihood,
    perturb: Parameters => Parameters): Stream[MetropolisState] = {

    val initState = MetropolisState(initParams, 0, likelihood(initParams))
    // `metropolisStep` performs one accept/reject move; it is not part of this
    // excerpt (see the sketch after this example).
    unfold(initState)(metropolisStep(likelihood, perturb))
  }

  def main(args: Array[String]): Unit = {
    val n = 10000
    val p = Parameters(3.0, 0.5, 0.0, 10.0)
    val observations = simulate(p).take(100).toVector

    val iters = metropolisIters(p, filterll(observations), perturb(0.1)).take(n)
    println(s"Accepted: ${iters.last.accepted.toDouble/n}")

    // write the parameters to file
    val pw = new PrintWriter(new File("data/mcmcOutRes.csv"))
    pw.write(iters.map(_.params).mkString("\n"))
    pw.close()
  }
} 
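metropolisStep itself is not part of the excerpt. A minimal sketch of one accept/reject move with the shape unfold expects, assuming log-likelihoods and a symmetric proposal perturb:

import breeze.numerics.log
import breeze.stats.distributions.Uniform

// Hypothetical reconstruction: propose, accept with probability exp(ll' - ll),
// and emit (current, next) so unfold can continue the chain.
def metropolisStep(
  likelihood: Parameters => Loglikelihood,
  perturb: Parameters => Parameters)(
  s: MetropolisState): Option[(MetropolisState, MetropolisState)] = {

  val proposed = perturb(s.params)
  val llProposed = likelihood(proposed)
  val next =
    if (log(Uniform(0, 1).draw) < llProposed - s.ll)
      MetropolisState(proposed, s.accepted + 1, llProposed)
    else s
  Some((next, next))
}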
Example 11
Source File: ApproximatePCA.scala    From keystone   with Apache License 2.0
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.numerics._
import breeze.stats._
import breeze.stats.distributions.{Gaussian, ThreadLocalRandomGenerator, RandBasis}
import com.github.fommil.netlib.LAPACK._
import edu.berkeley.cs.amplab.mlmatrix.util.QRUtils
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.rdd.RDD
import org.netlib.util.intW
import keystoneml.pipelines.Logging
import keystoneml.workflow.Estimator


// Note: only the randomised range finder from the original ApproximatePCA
// estimator is shown; it is wrapped in an object here so the excerpt compiles.
object ApproximatePCA {

  def approximateQ(A: DenseMatrix[Double], l: Int, q: Int, seed: Int = 0): DenseMatrix[Double] = {
    val d = A.cols

    val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed)))
    val omega = DenseMatrix.rand(d, l, Gaussian(0,1)(randBasis)) //cpu: d*l, mem: d*l
    val y0 = A*omega //cpu: n*d*l, mem: n*l

    var Q = QRUtils.qrQR(y0)._1 //cpu: n*l^2

    for (i <- 1 to q) {
      val YHat = Q.t * A //cpu: l*n*d, mem: l*d
      val Qh = QRUtils.qrQR(YHat.t)._1 //cpu: d*l^2, mem: d*l

      val Yj = A * Qh //cpu: n*d*l, mem: n*l
      Q = QRUtils.qrQR(Yj)._1 //cpu:  n*l^2, mem: n*l
    }

    Q
  }
} 
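A quick sanity check for approximateQ (hypothetical usage): the returned Q should have orthonormal columns, so Q.t * Q is close to the identity:

import breeze.linalg._

val A = DenseMatrix.rand[Double](200, 50)
val Q = ApproximatePCA.approximateQ(A, l = 10, q = 2)
println(norm((Q.t * Q - DenseMatrix.eye[Double](10)).toDenseVector))  // ~0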
Example 12
Source File: LogisticGLM.scala    From DynaML   with Apache License 2.0
package io.github.mandar2812.dynaml.models.lm

import breeze.linalg.DenseVector
import breeze.numerics._
import breeze.stats.distributions.Gaussian
import io.github.mandar2812.dynaml.optimization._


class ProbitGLM(data: Stream[(DenseVector[Double], Double)],
                numPoints: Int,
                map: (DenseVector[Double]) => DenseVector[Double] =
                identity[DenseVector[Double]])
  extends LogisticGLM(data, numPoints, map) {

  private val standardGaussian = new Gaussian(0, 1.0)

  override val h = (x: Double) =>
    standardGaussian.cdf(x)

  override protected val optimizer =
    new GradientDescent(
      new ProbitGradient,
      new SquaredL2Updater)

} 
Example 13
Source File: SparkLogisticGLM.scala    From DynaML   with Apache License 2.0
package io.github.mandar2812.dynaml.models.lm

//Breeze Imports
import breeze.linalg.DenseVector
import breeze.numerics.sigmoid
import breeze.stats.distributions.Gaussian
import io.github.mandar2812.dynaml.optimization.ProbitGradient
import org.apache.spark.mllib.linalg.Vectors
//DynaML Imports
import io.github.mandar2812.dynaml.optimization.{
  GradientDescentSpark, LogisticGradient,
  RegularizedOptimizer, SquaredL2Updater}
//Spark Imports
import org.apache.spark.mllib.regression.LabeledPoint
import org.apache.spark.rdd.RDD


class SparkProbitGLM(
  data: RDD[(DenseVector[Double], Double)], numPoints: Long,
  map: (DenseVector[Double]) => DenseVector[Double] =
  identity[DenseVector[Double]]) extends SparkLogisticGLM(data, numPoints, map) {

  private val standardGaussian = new Gaussian(0, 1.0)

  override val h: (Double) => Double = (x: Double) => standardGaussian.cdf(x)

  override protected val optimizer: RegularizedOptimizer[
    DenseVector[Double], DenseVector[Double],
    Double, RDD[LabeledPoint]] = new GradientDescentSpark(new ProbitGradient, new SquaredL2Updater)

} 
Example 14
Source File: VectorIIDProbit.scala    From DynaML   with Apache License 2.0
package io.github.mandar2812.dynaml.probability

import breeze.linalg.{DenseMatrix, DenseVector, diag}
import breeze.stats.distributions.Gaussian


// Note: this excerpt omits DynaML's enclosing VectorIIDProbit class. The
// declaration and the members used below (`standardGaussian`, the probit link
// `h`) are restored here; the parent trait is elided, so `override` is
// dropped from the method signatures.
class VectorIIDProbit {

  private val standardGaussian = new Gaussian(0, 1.0)

  // Probit link: the standard Gaussian CDF.
  val h: (Double) => Double = (x: Double) => standardGaussian.cdf(x)

  def hessian(y: DenseVector[Double],
                       f: DenseVector[Double]): DenseMatrix[Double] = {
    diag(DenseVector((y.toArray zip f.toArray).map((couple) => {
      val n = standardGaussian.pdf(couple._2)
      val product = couple._1*couple._2
      val l = h(product)
      -1.0*(n*n)/(l*l) - product*n/l
    })))
  }

  def gaussianExpectation(normalDistParams: (DenseVector[Double],
    DenseVector[Double])): DenseVector[Double] = {
    DenseVector((normalDistParams._1.toArray zip normalDistParams._2.toArray).map((couple) => {
      val gamma = math.sqrt(1.0 + couple._2)
      standardGaussian.pdf(couple._1/gamma)
    }))
  }
} 
Example 15
Source File: Burden.scala    From seqspark   with Apache License 2.0
package org.dizhang.seqspark.assoc

import breeze.linalg.DenseVector
import breeze.stats.distributions.{Gaussian, StudentsT}
import org.dizhang.seqspark.stat.HypoTest.{NullModel => NM}
import org.dizhang.seqspark.stat.{Resampling, ScoreTest, WaldTest}
import org.dizhang.seqspark.util.General._

import scala.language.existentials


@SerialVersionUID(7727280001L)
trait Burden extends AssocMethod {
  def nullModel: NM
  def x: Encode.Fixed
  def result: AssocMethod.Result
}

object Burden {

  def apply(nullModel: NM,
            x: Encode.Coding): Burden with AssocMethod.AnalyticTest = {
    nullModel match {
      case nm: NM.Fitted =>
        AnalyticScoreTest(nm, x.asInstanceOf[Encode.Fixed])
      case _ =>
        AnalyticWaldTest(nullModel, x.asInstanceOf[Encode.Fixed])
    }
  }

  def apply(ref: Double, min: Int, max: Int,
            nullModel: NM.Fitted,
            x: Encode.Coding): ResamplingTest = {
    ResamplingTest(ref, min, max, nullModel, x.asInstanceOf[Encode.Fixed])
  }

  def getStatistic(nm: NM.Fitted, x: Encode.Coding): Double = {
    val st = ScoreTest(nm, x.asInstanceOf[Encode.Fixed].coding)
    st.score(0)/st.variance(0,0).sqrt
  }

  def getStatistic(nm: NM, x: DenseVector[Double]): Double = {
    val wt = WaldTest(nm, x)
    (wt.beta /:/ wt.std).apply(1)
  }

  @SerialVersionUID(7727280101L)
  final case class AnalyticScoreTest(nullModel: NM.Fitted,
                                     x: Encode.Fixed)
    extends Burden with AssocMethod.AnalyticTest
  {
    def geno = x.coding
    //val scoreTest = ScoreTest(nullModel, geno)
    val statistic = getStatistic(nullModel, x)
    val pValue = {
      val dis = new Gaussian(0.0, 1.0)
      Some(1.0 - dis.cdf(statistic))
    }

    def result: AssocMethod.BurdenAnalytic = {
      AssocMethod.BurdenAnalytic(x.vars, statistic, pValue, "test=score")
    }

  }
  case class AnalyticWaldTest(nullModel: NM,
                              x: Encode.Fixed) extends Burden with AssocMethod.AnalyticTest {
    def geno = x.coding
    private val wt = WaldTest(nullModel, x.coding)
    val statistic = getStatistic(nullModel, geno)
    val pValue = {
      val dis = new StudentsT(nullModel.dof - 1)
      Some(1.0 - dis.cdf(statistic))
    }
    def result = {
      AssocMethod.BurdenAnalytic(x.vars, statistic, pValue, s"test=wald;beta=${wt.beta(1)};betaStd=${wt.std(1)}")
    }
  }

  @SerialVersionUID(7727280201L)
  final case class ResamplingTest(refStatistic: Double,
                                  min: Int,
                                  max: Int,
                                  nullModel: NM.Fitted,
                                  x: Encode.Fixed)
    extends Burden with AssocMethod.ResamplingTest
  {
    def pCount = Resampling.Simple(refStatistic, min, max, nullModel, x, getStatistic).pCount
    def result: AssocMethod.BurdenResampling = {
      AssocMethod.BurdenResampling(x.vars, refStatistic, pCount)
    }
  }
} 
Example 16
Source File: SNV.scala    From seqspark   with Apache License 2.0
package org.dizhang.seqspark.assoc

import breeze.stats.distributions.Gaussian
import org.dizhang.seqspark.stat.HypoTest.{NullModel => NM}
import org.dizhang.seqspark.stat.{Resampling, ScoreTest, WaldTest}
import org.dizhang.seqspark.util.General._

import scala.language.existentials


trait SNV extends AssocMethod {
  def nullModel: NM
  def x: Encode.Common
  def result: AssocMethod.Result
}

object SNV {
  def apply(nullModel: NM,
            x: Encode.Coding): SNV with AssocMethod.AnalyticTest = {
    nullModel match {
      case nm: NM.Fitted =>
        AnalyticScoreTest(nm, x.asInstanceOf[Encode.Common])
      case _ =>
        AnalyticWaldTest(nullModel, x.asInstanceOf[Encode.Common])
    }
  }

  def apply(ref: Double, min: Int, max: Int,
            nullModel: NM.Fitted,
            x: Encode.Coding): ResamplingTest = {
    ResamplingTest(ref, min, max, nullModel, x.asInstanceOf[Encode.Common])
  }

  def getStatistic(nm: NM.Fitted, x: Encode.Coding): Double = {
    val st = ScoreTest(nm, x.asInstanceOf[Encode.Common].coding)
    math.abs(st.score(0)/st.variance(0,0).sqrt)
  }

  @SerialVersionUID(7727280101L)
  final case class AnalyticScoreTest(nullModel: NM.Fitted,
                                     x: Encode.Common)
    extends SNV with AssocMethod.AnalyticTest
  {
    //val scoreTest = ScoreTest(nullModel, x.coding)
    val statistic = getStatistic(nullModel, x)
    val pValue = {
      val dis = new Gaussian(0.0, 1.0)
      Some((1.0 - dis.cdf(statistic)) * 2)
    }

    def result: AssocMethod.BurdenAnalytic = {
      AssocMethod.BurdenAnalytic(x.vars, statistic, pValue, "test=score")
    }
  }

  case class AnalyticWaldTest(nullModel: NM,
                              x: Encode.Common)
    extends SNV with AssocMethod.AnalyticTest
  {
    private val wt = WaldTest(nullModel, x.coding.toDenseVector)
    val statistic = wt.beta(1) / wt.std(1)
    val pValue = Some(wt.pValue(oneSided = false).apply(1))
    def result = {
      AssocMethod.BurdenAnalytic(x.vars, statistic, pValue, s"test=wald;beta=${wt.beta(1)};betaStd=${wt.std(1)}")
    }
  }

  @SerialVersionUID(7727280201L)
  final case class ResamplingTest(refStatistic: Double,
                                  min: Int,
                                  max: Int,
                                  nullModel: NM.Fitted,
                                  x: Encode.Common)
    extends SNV with AssocMethod.ResamplingTest
  {
    def pCount = Resampling.Simple(refStatistic, min, max, nullModel, x, getStatistic).pCount
    def result: AssocMethod.BurdenResampling = {
      AssocMethod.BurdenResampling(x.vars, refStatistic, pCount)
    }
  }
} 
Example 17
Source File: IntegratorTest.scala    From spark-gp   with Apache License 2.0
package org.apache.spark.ml.commons.util

import breeze.numerics.{abs, sigmoid, sqrt}
import breeze.stats.distributions.{Gaussian, RandBasis}
import org.apache.commons.math3.random.MersenneTwister
import org.scalatest.FunSuite


class IntegratorTest extends FunSuite {

  test("testExpectedOfFunctionOfNormal") {
    val f = (x: Double) => sigmoid(x)
    val integrator = new Integrator(100)
    val mean = 0.5
    val variance = 3
    val sd = sqrt(variance)

    val testResult = integrator.expectedOfFunctionOfNormal(mean, variance, f)

    val gg = new Gaussian(mean, sd)(new RandBasis(new MersenneTwister()))
    val mcIters = 100000
    val values = gg.sample(mcIters).map(f)
    val mcResult = values.sum / mcIters
    val mcSD = sqrt(values.map(_ - mcResult).map(x => x * x).sum / mcIters) / sqrt(mcIters)
    assert(abs(mcResult - testResult) < 3 * mcSD)
  }

} 
Example 18
Source File: LinearDiscriminantAnalysisSuite.scala    From keystone   with Apache License 2.0
package keystoneml.nodes.learning

import breeze.linalg._
import breeze.stats.distributions.{Multinomial, Uniform, Gaussian}
import keystoneml.nodes.stats.StandardScaler
import org.apache.spark.SparkContext
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging
import keystoneml.utils.{TestUtils, MatrixUtils, Stats}
import keystoneml.workflow.PipelineContext

class LinearDiscriminantAnalysisSuite extends FunSuite with PipelineContext with Logging {
  test("Solve Linear Discriminant Analysis on the Iris Dataset") {
    sc = new SparkContext("local", "test")

    // Uses the Iris flower dataset
    val irisData = sc.parallelize(TestUtils.loadFile("iris.data"))
    val trainData = irisData.map(_.split(",").dropRight(1).map(_.toDouble)).map(new DenseVector(_))
    val features = new StandardScaler().fit(trainData).apply(trainData)
    val labels = irisData.map(_ match {
      case x if x.endsWith("Iris-setosa") => 1
      case x if x.endsWith("Iris-versicolor") => 2
      case x if x.endsWith("Iris-virginica") => 3
    })

    val lda = new LinearDiscriminantAnalysis(2)
    val out = lda.fit(features, labels)

    // Correct output taken from http://sebastianraschka.com/Articles/2014_python_lda.html#introduction
    logInfo(s"\n${out.x}")
    val majorVector = DenseVector(-0.1498, -0.1482, 0.8511, 0.4808)
    val minorVector = DenseVector(0.0095, 0.3272, -0.5748, 0.75)

    // Note that because eigenvectors can be reversed and still valid, we allow either direction
    assert(Stats.aboutEq(out.x(::, 0), majorVector, 1E-4) || Stats.aboutEq(out.x(::, 0), majorVector * -1.0, 1E-4))
    assert(Stats.aboutEq(out.x(::, 1), minorVector, 1E-4) || Stats.aboutEq(out.x(::, 1), minorVector * -1.0, 1E-4))
  }

  test("Check LDA output for a diagonal covariance") {
    sc = new SparkContext("local", "test")

    val matRows = 1000
    val matCols = 10
    val dimRed = 5

    // Generate a random Gaussian matrix.
    val gau = new Gaussian(0.0, 1.0)
    val randMatrix = new DenseMatrix(matRows, matCols, gau.sample(matRows*matCols).toArray)

    // Parallelize and estimate the LDA.
    val data = sc.parallelize(MatrixUtils.matrixToRowArray(randMatrix))
    val labels = data.map(x => Multinomial(DenseVector(0.2, 0.2, 0.2, 0.2, 0.2)).draw(): Int)
    val lda = new LinearDiscriminantAnalysis(dimRed).fit(data, labels)

    // Apply LDA to the input data.
    val redData = lda(data)
    val redMat = MatrixUtils.rowsToMatrix(redData.collect)

    // Compute its covariance.
    val redCov = cov(redMat)
    log.info(s"Covar\n$redCov")

    // The covariance of the dimensionality reduced matrix should be diagonal.
    for (
      x <- 0 until dimRed;
      y <- 0 until dimRed if x != y
    ) {
      assert(Stats.aboutEq(redCov(x,y), 0.0, 1e-6), s"LDA Matrix should be 0 off-diagonal. $x,$y = ${redCov(x,y)}")
    }
  }

} 
Example 19
Source File: EncEvalSuite.scala    From keystone   with Apache License 2.0
package keystoneml.utils.external

import java.io.File

import breeze.linalg._
import breeze.stats.distributions.Gaussian
import keystoneml.nodes.learning.GaussianMixtureModel
import keystoneml.nodes.learning.external.GaussianMixtureModelEstimator
import org.scalatest.FunSuite
import keystoneml.pipelines.Logging
import keystoneml.utils.{Stats, TestUtils}

class EncEvalSuite extends FunSuite with Logging {

  test("Load SIFT Descriptors and compute Fisher Vector Features") {

    val siftDescriptor = csvread(new File(TestUtils.getTestResourceFileName("images/feats.csv")))

    val gmmMeans = TestUtils.getTestResourceFileName("images/voc_codebook/means.csv")
    val gmmVars = TestUtils.getTestResourceFileName("images/voc_codebook/variances.csv")
    val gmmWeights = TestUtils.getTestResourceFileName("images/voc_codebook/priors")

    val gmm = GaussianMixtureModel.load(gmmMeans, gmmVars, gmmWeights)

    val nCenters = gmm.means.cols
    val nDim = gmm.means.rows

    val extLib = new EncEval

    val fisherVector = extLib.calcAndGetFVs(
      gmm.means.toArray.map(_.toFloat),
      nCenters,
      nDim,
      gmm.variances.toArray.map(_.toFloat),
      gmm.weights.toArray.map(_.toFloat),
      siftDescriptor.toArray.map(_.toFloat))

    log.info(s"Fisher Vector is ${fisherVector.sum}")
    assert(Stats.aboutEq(fisherVector.sum, 40.109097, 1e-4), "SUM of Fisher Vectors must match expected sum.")

  }

  test("Compute a GMM from scala") {
    val nsamps = 10000

    // Generate two gaussians.
    val x = Gaussian(-1.0, 0.5).samples.take(nsamps).toArray
    val y = Gaussian(5.0, 1.0).samples.take(nsamps).toArray

    val z = shuffle(x ++ y).map(x => DenseVector(x))

    // Compute a 1-d GMM.
    val extLib = new EncEval
    val gmm = new GaussianMixtureModelEstimator(2).fit(z)

    logInfo(s"GMM means: ${gmm.means.toArray.mkString(",")}")
    logInfo(s"GMM vars: ${gmm.variances.toArray.mkString(",")}")
    logInfo(s"GMM weights: ${gmm.weights.toArray.mkString(",")}")

    // The results should be close to the distribution we set up.
    assert(Stats.aboutEq(min(gmm.means), -1.0, 1e-1), "Smallest mean should be close to -1.0")
    assert(Stats.aboutEq(max(gmm.means), 5.0, 1e-1), "Largest mean should be close to 5.0")
    assert(Stats.aboutEq(math.sqrt(min(gmm.variances)), 0.5, 1e-1), "Smallest SD should be close to 0.5")
    assert(Stats.aboutEq(math.sqrt(max(gmm.variances)), 1.0, 1e-1), "Largest SD should be close to 1.0")
  }
} 
Example 20
Source File: TestUtils.scala    From keystone   with Apache License 2.0
package keystoneml.utils

import java.io.{FileReader, ByteArrayInputStream}
import breeze.linalg.DenseMatrix
import breeze.stats.distributions.{Gaussian, RandBasis, ThreadLocalRandomGenerator, Rand}
import edu.berkeley.cs.amplab.mlmatrix.RowPartitionedMatrix
import org.apache.commons.io.IOUtils
import org.apache.commons.math3.random.MersenneTwister
import org.apache.spark.SparkContext

import scala.io.Source
import scala.util.Random


// Note: the original `object TestUtils` wrapper and its `genData` helper are
// not part of this excerpt; a minimal reconstruction (with a hypothetical
// helper body) is sketched here so the generators below compile.
object TestUtils {

  def genData(x: Int, y: Int, z: Int): Array[Double] =
    Array.fill(x * y * z)(Rand.uniform.draw())

  def genChannelMajorArrayVectorizedImage(x: Int, y: Int, z: Int): ChannelMajorArrayVectorizedImage = {
    ChannelMajorArrayVectorizedImage(genData(x, y, z), ImageMetadata(x,y,z))
  }

  def genRowColumnMajorByteArrayVectorizedImage(x: Int, y: Int, z: Int): RowColumnMajorByteArrayVectorizedImage = {
    RowColumnMajorByteArrayVectorizedImage(genData(x,y,z).map(_.toByte), ImageMetadata(x,y,z))
  }

  def createRandomMatrix(
      sc: SparkContext,
      numRows: Int,
      numCols: Int,
      numParts: Int,
      seed: Int = 42): RowPartitionedMatrix = {

    val rowsPerPart = numRows / numParts
    val matrixParts = sc.parallelize(1 to numParts, numParts).mapPartitionsWithIndex { (index, part) =>
      val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed+index)))
      Iterator(DenseMatrix.rand(rowsPerPart, numCols, Gaussian(0.0, 1.0)(randBasis)))
    }
    RowPartitionedMatrix.fromMatrix(matrixParts.cache())
  }

  def createLocalRandomMatrix(numRows: Int, numCols: Int, seed: Int = 42): DenseMatrix[Double] = {
    val randBasis: RandBasis = new RandBasis(new ThreadLocalRandomGenerator(new MersenneTwister(seed)))
    DenseMatrix.rand(numRows, numCols, Gaussian(0.0, 1.0)(randBasis))
  }
}