org.apache.commons.math3.distribution.NormalDistribution Scala Examples

The following examples show how to use org.apache.commons.math3.distribution.NormalDistribution. Each example is taken from an open-source project; the header above the code gives the source file, the project it comes from, and its license.
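Before the project examples, here is a minimal, self-contained sketch of the API surface those examples rely on. The object name and the numeric values below are illustrative only (they are not taken from any of the projects); the calls shown are density, cumulativeProbability, and inverseCumulativeProbability.

import org.apache.commons.math3.distribution.NormalDistribution

object NormalDistributionBasics {
  def main(args: Array[String]): Unit = {
    // Standard normal: mean 0.0, standard deviation 1.0
    val standard = new NormalDistribution()
    // Two-sided 95% critical value, roughly 1.96
    val z95 = standard.inverseCumulativeProbability((1 + 0.95) / 2)

    // Normal with mean 5.0 and standard deviation 3.0, as used in the kernel density suites below
    val normal = new NormalDistribution(5.0, 3.0)
    val pdfAtMean = normal.density(5.0)              // probability density at x = 5.0
    val cdfAtSix = normal.cumulativeProbability(6.0) // P(X <= 6.0)

    println(s"z95=$z95 pdfAtMean=$pdfAtMean cdfAtSix=$cdfAtSix")
  }
}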
Example 1
Source File: KernelDensitySuite.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 2
Source File: SumEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
Example 3
Source File: GroupedCountEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 4
Source File: StudentTCacher.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}


private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100  // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
} 
Example 5
Source File: KernelDensitySuite.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {//核密度单样本
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    // math.abs returns the absolute value of its argument
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {//核密度多样本
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    // math.abs returns the absolute value of its argument
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 6
Source File: MeanEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 7
Source File: CountEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution


private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
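CountEvaluator above extrapolates a partially observed count: once a fraction p of the outputs has been merged, the observed sum is scaled to a mean of (sum + 1 - p) / p with variance (sum + 1) * (1 - p) / p^2, and a two-sided normal interval is placed around that mean. Below is a minimal standalone sketch of the same arithmetic; the counts and confidence level are illustrative.

import org.apache.commons.math3.distribution.NormalDistribution

object PartialCountSketch {
  def main(args: Array[String]): Unit = {
    // Illustrative values: 40 of 100 outputs merged, 1200 records counted so far
    val confidence = 0.95
    val p = 40.0 / 100
    val sum = 1200L

    val mean = (sum + 1 - p) / p
    val variance = (sum + 1) * (1 - p) / (p * p)
    val stdev = math.sqrt(variance)
    // Two-sided confidence factor, roughly 1.96 for 95% confidence
    val confFactor = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
    println(s"estimate $mean in [${mean - confFactor * stdev}, ${mean + confFactor * stdev}]")
  }
}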
Example 8
Source File: SumEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
Example 9
Source File: GroupedCountEvaluator.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 10
Source File: StudentTCacher.scala    From spark1.52   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}


private[spark] class StudentTCacher(confidence: Double) {
  val NORMAL_APPROX_SAMPLE_SIZE = 100  // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
} 
Example 11
Source File: CountEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution


private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 12
Source File: MeanEvaluator.scala    From Spark-2.3.1   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
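The comments in this version note that for large samples the normal distribution is a good approximation to the t-distribution, so the evaluator switches from TDistribution to NormalDistribution once more than 100 values have been merged. Below is a minimal standalone check of the two-sided factor inverseCumulativeProbability((1 + confidence) / 2); the sample sizes and confidence level are illustrative.

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

object ConfFactorSketch {
  def main(args: Array[String]): Unit = {
    val confidence = 0.95

    def confFactor(count: Long): Double =
      if (count > 100) {
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        new TDistribution((count - 1).toInt).inverseCumulativeProbability((1 + confidence) / 2)
      }

    // The t quantile for a small sample is noticeably wider than the normal value of about 1.96
    println(s"t factor (n=10): ${confFactor(10L)}")          // about 2.26
    println(s"normal factor (n=1000): ${confFactor(1000L)}") // about 1.96
  }
}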
Example 13
Source File: KernelDensitySuite.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 14
Source File: MeanEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 15
Source File: CountEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution


private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 16
Source File: SumEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
Example 17
Source File: GroupedCountEvaluator.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(key, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
} 
Example 18
Source File: StudentTCacher.scala    From BigDatalog   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}


private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100  // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
} 
Example 19
Source File: ZScoreArbiter.scala    From warp-core   with MIT License
package com.workday.warp.arbiters

import com.workday.telemetron.RequirementViolationException
import com.workday.warp.common.CoreWarpProperty._
import com.workday.warp.arbiters.traits.{ArbiterLike, CanReadHistory}
import com.workday.warp.persistence.TablesLike.TestExecutionRowLikeType
import com.workday.warp.persistence.Tables._
import com.workday.warp.utils.{AnnotationReader, Ballot}
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation
import org.pmw.tinylog.Logger


// The enclosing class declaration is omitted in this extract; it is restored here so the
// method reads in context, based on the imported ArbiterLike and CanReadHistory traits (assumed form).
class ZScoreArbiter extends CanReadHistory with ArbiterLike {

  def vote[T: TestExecutionRowLikeType](responseTimes: Iterable[Double],
           ballot: Ballot,
           testExecution: T,
           minimumHistoricalData: Int): Option[Throwable] = {

    // we don't have enough historical data yet
    if (responseTimes.size < minimumHistoricalData) {
      Logger.warn(s"not enough historical measurements for ${ballot.testId}. (found ${responseTimes.size}, we require " +
        s"$minimumHistoricalData.) percentile threshold processing will not continue.")
      None
    }
    else {
      val measuredResponseTime: Double = testExecution.responseTime
      val mean: Double = responseTimes.sum / responseTimes.size
      // make sure standard deviation is strictly positive
      val stdDev: Double = math.max((new StandardDeviation).evaluate(responseTimes.toArray, mean), Double.MinPositiveValue)
      // convert cdf value to a percentile
      val percentile: Double = 100 * new NormalDistribution(mean, stdDev).cumulativeProbability(measuredResponseTime)

      val percentileRequirement: Double = AnnotationReader.getZScoreRequirement(ballot.testId)
      // check that the percentile according to cumulative distribution function is less than the requirement
      if (percentile <= percentileRequirement) None
      else Option(new RequirementViolationException(
        s"${ballot.testId} failed requirement imposed by ${this.getClass.getName}. expected response time (measured " +
        s"$measuredResponseTime sec) percentile <= $percentileRequirement, but was $percentile")
      )
    }
  }
} 
Example 20
Source File: SumEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter


private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
                   (countEstimate * countEstimate * meanVar) +
                   (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
} 
Example 21
Source File: MeanEvaluator.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 22
Source File: EI.scala    From automl   with Apache License 2.0
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.spark.ml.linalg.{Vector, Vectors}


class EI(
          override val surrogate: Surrogate,
          val par: Double)
  extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)

    // Use the best seen observation as incumbent
    val eta: Double = surrogate.curBest._2
    //println(s"best seen result: $eta")

    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)
    //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]")

    if (s == 0) {
      // if std is zero, we have observed x on all instances
      // using a RF, std should be never exactly 0.0
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val z = (pred._1 - eta - par) / s
      val norm: NormalDistribution = new NormalDistribution
      val cdf: Double = norm.cumulativeProbability(z)
      val pdf: Double = norm.density(z)
      val ei = s * (z * cdf + pdf)
      //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf")
      (ei, Vectors.dense(new Array[Double](X.size)))
    }
  }
} 
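The compute method above evaluates the closed-form expected improvement s * (z * cdf + pdf) with z = (mean - eta - par) / s, where cdf and pdf come from the standard normal. Below is a minimal standalone sketch of that formula; the predicted mean, variance, incumbent eta, and margin par are made-up numbers.

import org.apache.commons.math3.distribution.NormalDistribution

object ExpectedImprovementSketch {
  def main(args: Array[String]): Unit = {
    // Illustrative inputs: surrogate prediction at a candidate point and the best value seen so far
    val (mean, variance) = (1.2, 0.25)
    val eta = 1.0
    val par = 0.0

    val s = math.sqrt(variance)
    val z = (mean - eta - par) / s
    val norm = new NormalDistribution()
    val ei = s * (z * norm.cumulativeProbability(z) + norm.density(z))
    println(s"expected improvement = $ei")
  }
}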
Example 23
package org.scalaml.spark.extensibility

import java.lang.Math._

import org.scalaml.Logging
import org.scalatest.{FlatSpec, Matchers}
import org.apache.commons.math3.distribution.NormalDistribution
import org.scalaml.spark.{DatasetGenerator, SessionLifeCycle}


final class KullbackLeiblerTest extends FlatSpec with Matchers with Logging {
  import DatasetGenerator._
  import SessionLifeCycle._
  protected[this] val name = "Spark/Kullback Leibler"

  implicit private val sessionLifeCycle = new SessionLifeCycle {}
  implicit private val sparkSession = sessionLifeCycle.sparkSession

  final val normalGenerator = new NormalDistribution
  final val NUM_DATA_POINTS = 5000

  final val normalData = toDSPairDouble(NUM_DATA_POINTS)((n: Int) => {
    val x = n.toDouble * 0.001
    (x, normalGenerator.density(x))
  })

  it should s"$name divergence using Normal distribution mu=0" in {
    show(s"$name divergence using Normal distribution mu=0")

    val mu = 0.0
    normalKL(mu) should be(0.0 +- 0.001)
  }

  it should s"$name divergence using Normal distribution mu=1.0" in {
    show(s"$name divergence using Normal distribution mu=1.0")

    val mu = 1.0
    normalKL(mu) should be(0.01 +- 0.01)
  }

  it should s"$name divergence using Normal distribution mu=2.0" in {
    show(s"$name divergence using Normal distribution mu=2.0")

    val mu = 2.0
    normalKL(mu) should be(-4.7 +- 0.2)
  }

  it should s"$name divergence using Normal distribution mu=3.0" in {
    show("$name divergence using Normal distribution mu=3.0")

    val mu = 3.0
    normalKL(mu) should be(-180.0 +- 2.0)
  }

  private def normalKL(mu: Double): Double = {
    import Math._

    val Inv2PI = 1.0 / sqrt(2.0 * PI)
    val pdf = (x: Double) => { val z = x - mu; Inv2PI * exp(-z * z) }

    val kullbackLeibler = KullbackLeibler(s"Normal mu=$mu", pdf)
    val klValue = kullbackLeibler.kl(normalData).head
    show(s"klValue for $mu $klValue")
    klValue
  }

  it should s"$name divergence using constant distribution" in {
    import Math._
    val kullbackLeibler = KullbackLeibler("Constant", (x: Double) => 2.0)
    val klValue = kullbackLeibler.kl(normalData).head
    klValue should be(-7028.0 +- 10.0)
  }

  it should s"$name formula" in {
    type DataSeq = Seq[(Double, Double)]
    val Eps = 1e-12
    val LogEps = log(Eps)
    def exec(xy: DataSeq, pdf: Double => Double): Double = {
      -xy./:(0.0) {
        case (s, (x, y)) => {
          val px = pdf(x)
          val z = if (abs(y) < Eps) px / Eps else px / y
          val t = if (z < Eps) LogEps else log(z)
          s + px * t
        }
      }
    }
    val h: Seq[(Double, Double)] = Seq.tabulate(1000)(
      (n: Int) => (n.toDouble * 0.001, normalGenerator.density(n.toDouble * 0.001))
    )
    val Inv2PI = 1.0 / sqrt(2.0 * PI)
    exec(h.iterator.toSeq, (x: Double) => Inv2PI * exp(-x * x)) should be(37.7 +- 0.1)
  }
}

// ---------------------------  EOF ----------------------------------------------- 
Example 24
Source File: BootstrapTest.scala    From Scala-for-Machine-Learning-Second-Edition   with MIT License
package org.scalaml.sampling

import org.apache.commons.math3.distribution.{NormalDistribution, RealDistribution}
import org.scalaml.Logging
import org.scalatest.{FlatSpec, Matchers}
import scala.collection.mutable.ArrayBuffer
import scala.util.Random



final class BootstrapTest extends FlatSpec with Matchers with Logging {
  protected val name = "Bootstrap sampling replicates"
  final val NumReplicates1 = 256
  final val NumReplicates2 = 1024
  final val NumDataPoints = 10000

  private def bootstrapEvaluation(
    dist: RealDistribution,
    random: Random,
    coefs: (Double, Double),
    numReplicates: Int
  ): (Double, Double) = {

    val input = (0 until NumDataPoints)./:(new ArrayBuffer[(Double, Double)])(
      ( buf, _ ) => {
        val (a, b) = coefs
        val x = a * random.nextDouble - b
        buf += ( (x, dist.density(x)) )
      }
      ).toVector

    // Bootstrap for the statistics
    val bootstrap = new Bootstrap(
      numReplicates,
      (x: Vector[Double]) => x.sum/x.length,
      input.map( _._2 ),
      (rLen: Int) => new Random( System.currentTimeMillis).nextInt(rLen)
    )
    (bootstrap.mean, bootstrap.error)
  }

  it should s"$name over a input with the distribution a*r + b $NumReplicates1 replicates" in {
    import Math._
    show(s"$name over a input with the distribution a*r + b $NumReplicates1 replicates")

    val (meanNormal, errorNormal) = bootstrapEvaluation(
      new NormalDistribution,
      new scala.util.Random,
      (5.0, 2.5),
      NumReplicates1
    )
    val expectedMean = 0.185
    show(s"$name meanNormal $meanNormal error $errorNormal")

    abs(expectedMean - meanNormal) < 0.05 should be (true)
    abs(errorNormal) < 0.05 should be (true)
  }

  it should s"$name over a input with the distribution a*r + b $NumReplicates2 replicates" in {
    import Math._
    show("$name over a input with the distribution a*r + b $NumReplicates2 replicates")

    val (meanNormal, errorNormal) = bootstrapEvaluation(
      new NormalDistribution,
      new scala.util.Random,
      (5.0, 2.5),
      NumReplicates2
    )
    val expectedMean = 0.185
    show(s"$name meanNormal $meanNormal error $errorNormal")

    abs(expectedMean - meanNormal) < 0.05 should be (true)
    abs(errorNormal) < 0.05 should be (true)
  }
}

// -----------------------------------  EOF ------------------------------------------- 
Example 25
Source File: KernelDensitySuite.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 26
Source File: MeanEvaluator.scala    From sparkoscope   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 27
Source File: MeanEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 28
Source File: CountEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution


private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 29
Source File: KernelDensitySuite.scala    From drizzle-spark   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 30
Source File: GroupedCountEvaluator.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap


private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T,Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T,Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
} 
Example 31
Source File: StudentTCacher.scala    From SparkCore   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}


private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100  // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
} 
Example 32
Source File: EasyPlot.scala    From spark-timeseries   with Apache License 2.0
package com.cloudera.sparkts

import org.apache.spark.mllib.linalg._
import breeze.plot._
import com.cloudera.sparkts.models.Autoregression

import org.apache.commons.math3.distribution.NormalDistribution

object EasyPlot {
  def ezplot(vec: Vector, style: Char): Figure = {
    val f = Figure()
    val p = f.subplot(0)
    p += plot((0 until vec.size).map(_.toDouble).toArray, vec.toArray, style = style)
    f
  }

  def ezplot(vec: Vector): Figure = ezplot(vec, '-')

  def ezplot(arr: Array[Double], style: Char): Figure = {
    val f = Figure()
    val p = f.subplot(0)
    p += plot(arr.indices.map(_.toDouble).toArray, arr, style = style)
    f
  }

  def ezplot(arr: Array[Double]): Figure = ezplot(arr, '-')

  def ezplot(vecs: Seq[Vector], style: Char): Figure = {
    val f = Figure()
    val p = f.subplot(0)
    val first = vecs.head
    vecs.foreach { vec =>
      p += plot((0 until first.size).map(_.toDouble).toArray, vec.toArray, style)
    }
    f
  }

  def ezplot(vecs: Seq[Vector]): Figure = ezplot(vecs, '-')

  
  def pacfPlot(data: Array[Double], maxLag: Int, conf: Double = 0.95): Figure = {
    // create AR(maxLag) model, retrieve coefficients and calculate confidence bound
    val model = Autoregression.fitModel(new DenseVector(data), maxLag)
    val pCorrs = model.coefficients // partial autocorrelations are the coefficients in AR(n) model
    val confVal = calcConfVal(conf, data.length)

    // Basic plot information
    val f = Figure()
    val p = f.subplot(0)
    p.title = "Partial autocorrelation function"
    p.xlabel = "Lag"
    p.ylabel = "Partial Autocorrelation"
    drawCorrPlot(pCorrs, confVal, p)
    f
  }

  private[sparkts] def calcConfVal(conf: Double, n: Int): Double = {
    val stdNormDist = new NormalDistribution(0, 1)
    val pVal = (1 - conf) / 2.0
    stdNormDist.inverseCumulativeProbability(1 - pVal) / Math.sqrt(n)
  }

  private[sparkts] def drawCorrPlot(corrs: Array[Double], confVal: Double, p: Plot): Unit = {
    // make decimal ticks visible
    p.setYAxisDecimalTickUnits()
    // plot correlations as vertical lines
    val verticalLines = corrs.zipWithIndex.map { case (corr, ix) =>
      (Array(ix.toDouble + 1, ix.toDouble + 1), Array(0, corr))
    }
    verticalLines.foreach { case (xs, ys) => p += plot(xs, ys) }
    // plot confidence intervals as horizontal lines
    val n = corrs.length
    Array(confVal, -1 * confVal).foreach { conf =>
      val xs = (0 to n).toArray.map(_.toDouble)
      val ys = Array.fill(n + 1)(conf)
      p += plot(xs, ys, '-', colorcode = "red")
    }
  }
} 
Example 33
Source File: StatsSuite.scala    From hail   with MIT License
package is.hail.stats

import breeze.linalg.DenseMatrix
import is.hail.TestUtils._
import is.hail.testUtils._
import is.hail.utils._
import is.hail.variant._
import is.hail.{HailSuite, TestUtils}
import org.apache.commons.math3.distribution.{ChiSquaredDistribution, NormalDistribution}
import org.testng.annotations.Test

class StatsSuite extends HailSuite {

  @Test def chiSquaredTailTest() {
    val chiSq1 = new ChiSquaredDistribution(1)
    assert(D_==(chiSquaredTail(1d,1), 1 - chiSq1.cumulativeProbability(1d)))
    assert(D_==(chiSquaredTail(5.52341d,1), 1 - chiSq1.cumulativeProbability(5.52341d)))

    val chiSq2 = new ChiSquaredDistribution(2)
    assert(D_==(chiSquaredTail(1, 2), 1 - chiSq2.cumulativeProbability(1)))
    assert(D_==(chiSquaredTail(5.52341, 2), 1 - chiSq2.cumulativeProbability(5.52341)))

    val chiSq5 = new ChiSquaredDistribution(5.2)
    assert(D_==(chiSquaredTail(1, 5.2), 1 - chiSq5.cumulativeProbability(1)))
    assert(D_==(chiSquaredTail(5.52341, 5.2), 1 - chiSq5.cumulativeProbability(5.52341)))

    assert(D_==(inverseChiSquaredTail(.1, 1.0), chiSq1.inverseCumulativeProbability(1 - .1)))
    assert(D_==(inverseChiSquaredTail(.0001, 1.0), chiSq1.inverseCumulativeProbability(1 - .0001)))

    val a = List(.0000000001, .5, .9999999999, 1.0)
    a.foreach(p => assert(D_==(chiSquaredTail(inverseChiSquaredTail(p, 1.0), 1.0), p)))

    // compare with R
    assert(math.abs(chiSquaredTail(400, 1) - 5.507248e-89) < 1e-93)
    assert(D_==(inverseChiSquaredTail(5.507248e-89, 1), 400))
  }

  @Test def normalTest() {
    val normalDist = new NormalDistribution()
    assert(D_==(pnorm(1), normalDist.cumulativeProbability(1)))
    assert(math.abs(pnorm(-10) - normalDist.cumulativeProbability(-10)) < 1e-10)
    assert(D_==(qnorm(.6), normalDist.inverseCumulativeProbability(.6)))
    assert(D_==(qnorm(.0001), normalDist.inverseCumulativeProbability(.0001)))

    val a = List(0.0, .0000000001, .5, .9999999999, 1.0)
    assert(a.forall(p => D_==(qnorm(pnorm(qnorm(p))), qnorm(p))))

    // compare with R
    assert(math.abs(pnorm(-20) - 2.753624e-89) < 1e-93)
    assert(D_==(qnorm(2.753624e-89), -20))
  }

  @Test def poissonTest() {
    // compare with R
    assert(D_==(dpois(5, 10), 0.03783327))
    assert(qpois(0.3, 10) == 8)
    assert(qpois(0.3, 10, lowerTail = false, logP = false) == 12)
    assert(D_==(ppois(5, 10), 0.06708596))
    assert(D_==(ppois(5, 10, lowerTail = false, logP = false), 0.932914))

    assert(qpois(ppois(5, 10), 10) == 5)
    assert(qpois(ppois(5, 10, lowerTail = false, logP = false), 10, lowerTail = false, logP = false) == 5)

    assert(ppois(30, 1, lowerTail = false, logP = false) > 0)
  }

  @Test def betaTest() {
    val tol = 1e-5

    assert(D_==(dbeta(.2 , 1, 3), 1.92, tol))
    assert(D_==(dbeta(0.70, 2, 10), 0.001515591, tol))
    assert(D_==(dbeta(.4, 5, 3), 0.96768, tol))
    assert(D_==(dbeta(.3, 7, 2), 0.0285768, tol))
    assert(D_==(dbeta(.8, 2, 2), .96, tol))
    assert(D_==(dbeta(.1, 3, 6), 0.9920232, tol))
    assert(D_==(dbeta(.6, 3, 4), 1.3824, tol))
    assert(D_==(dbeta(.1, 1, 1), 1, tol))
    assert(D_==(dbeta(.2, 4, 7), 1.761608, tol))
    assert(D_==(dbeta(.2, 1, 2), 1.6, tol))

  }

  @Test def entropyTest() {
    assert(D_==(entropy("accctg"), 1.79248, tolerance = 1e-5))
    assert(D_==(entropy(Array(2, 3, 4, 5, 6, 6, 4)), 2.23593, tolerance = 1e-5))

  }

} 
Example 34
Source File: KernelDensitySuite.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 35
Source File: MeanEvaluator.scala    From multi-tenancy-spark   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
          // For large n, the normal distribution is a good approximation to t-distribution
          new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
        } else {
          // t-distribution describes distribution of actual population mean
          // note that if this goes to 0, TDistribution will throw an exception.
          // Hence special casing 1 above.
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
        }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
} 
Example 36
Source File: KernelDensitySuite.scala    From iolap   with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
} 
Example 37
Source File: MeanEvaluator.scala    From iolap   with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter


private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}