org.apache.commons.math3.distribution.NormalDistribution Scala Examples
The following examples show how to use org.apache.commons.math3.distribution.NormalDistribution.
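Before the project examples, here is a minimal sketch of the core NormalDistribution API used throughout them (constructor with mean and standard deviation, density, cumulative probability, its inverse, and sampling). The printed values are illustrative only and are not taken from any of the projects below.

import org.apache.commons.math3.distribution.NormalDistribution

object NormalDistributionSketch {
  def main(args: Array[String]): Unit = {
    val standard = new NormalDistribution()          // mean 0.0, standard deviation 1.0
    val shifted  = new NormalDistribution(5.0, 3.0)  // mean 5.0, standard deviation 3.0

    // Probability density function evaluated at a point
    println(shifted.density(5.0))
    // Cumulative distribution function P(X <= x)
    println(standard.cumulativeProbability(1.96))
    // Quantile function (inverse CDF); ~1.96 for p = 0.975
    println(standard.inverseCumulativeProbability(0.975))
    // Draw pseudo-random samples from the distribution
    val draws: Array[Double] = standard.sample(3)
    println(draws.mkString(", "))
  }
}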
Example 1
Source File: KernelDensitySuite.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
}
Example 2
Source File: SumEvaluator.scala From iolap with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
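A pattern repeated throughout these Spark partial evaluators is new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2), which is the two-sided critical value of the standard normal (about 1.96 for 95% confidence). Below is a small standalone sketch of that calculation; the object and method names are illustrative, not part of any of the projects listed here.

import org.apache.commons.math3.distribution.NormalDistribution

object ConfidenceFactorSketch {
  // Two-sided critical value: the interval mean +/- z * stdev covers the target
  // with probability `confidence` under a normal approximation.
  def confFactor(confidence: Double): Double =
    new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)

  def main(args: Array[String]): Unit = {
    println(confFactor(0.95))  // ~1.96
    println(confFactor(0.99))  // ~2.576
  }
}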
Example 3
Source File: GroupedCountEvaluator.scala From iolap with Apache License 2.0

package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 4
Source File: StudentTCacher.scala From iolap with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100  // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
Example 5
Source File: KernelDensitySuite.scala From spark1.52 with Apache License 2.0

package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") { // kernel density, single sample
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    // math.abs returns the absolute value
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") { // kernel density, multiple samples
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    // math.abs returns the absolute value
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
}
Example 6
Source File: MeanEvaluator.scala From spark1.52 with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 7
Source File: CountEvaluator.scala From spark1.52 with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution

private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 8
Source File: SumEvaluator.scala From spark1.52 with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
Example 9
Source File: GroupedCountEvaluator.scala From spark1.52 with Apache License 2.0

package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 10
Source File: StudentTCacher.scala From spark1.52 with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  // For samples bigger than this, use Gaussian approximation
  val NORMAL_APPROX_SAMPLE_SIZE = 100

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
Example 11
Source File: CountEvaluator.scala From iolap with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution

private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 12
Source File: MeanEvaluator.scala From Spark-2.3.1 with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 13
Source File: KernelDensitySuite.scala From BigDatalog with Apache License 2.0

package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
}
Example 14
Source File: MeanEvaluator.scala From BigDatalog with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 15
Source File: CountEvaluator.scala From BigDatalog with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution

private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 16
Source File: SumEvaluator.scala From BigDatalog with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
Example 17
Source File: GroupedCountEvaluator.scala From BigDatalog with Apache License 2.0

package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(key, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
}
Example 18
Source File: StudentTCacher.scala From BigDatalog with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100  // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
Example 19
Source File: ZScoreArbiter.scala From warp-core with MIT License

package com.workday.warp.arbiters

import com.workday.telemetron.RequirementViolationException
import com.workday.warp.common.CoreWarpProperty._
import com.workday.warp.arbiters.traits.{ArbiterLike, CanReadHistory}
import com.workday.warp.persistence.TablesLike.TestExecutionRowLikeType
import com.workday.warp.persistence.Tables._
import com.workday.warp.utils.{AnnotationReader, Ballot}
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.commons.math3.stat.descriptive.moment.StandardDeviation
import org.pmw.tinylog.Logger

  def vote[T: TestExecutionRowLikeType](responseTimes: Iterable[Double],
                                        ballot: Ballot,
                                        testExecution: T,
                                        minimumHistoricalData: Int): Option[Throwable] = {
    // we don't have enough historical data yet
    if (responseTimes.size < minimumHistoricalData) {
      Logger.warn(s"not enough historical measurements for ${ballot.testId}. (found ${responseTimes.size}, we require " +
        s"$minimumHistoricalData.) percentile threshold processing will not continue.")
      None
    } else {
      val measuredResponseTime: Double = testExecution.responseTime
      val mean: Double = responseTimes.sum / responseTimes.size
      // make sure standard deviation is strictly positive
      val stdDev: Double = math.max((new StandardDeviation).evaluate(responseTimes.toArray, mean), Double.MinPositiveValue)
      // convert cdf value to a percentile
      val percentile: Double = 100 * new NormalDistribution(mean, stdDev).cumulativeProbability(measuredResponseTime)
      val percentileRequirement: Double = AnnotationReader.getZScoreRequirement(ballot.testId)

      // check that the percentile according to cumulative distribution function is less than the requirement
      if (percentile <= percentileRequirement) None
      else Option(new RequirementViolationException(
        s"${ballot.testId} failed requirement imposed by ${this.getClass.getName}. expected response time (measured " +
        s"$measuredResponseTime sec) percentile <= $percentileRequirement, but was $percentile")
      )
    }
  }
}
Example 20
Source File: SumEvaluator.scala From SparkCore with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
Example 21
Source File: MeanEvaluator.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 22
package com.tencent.angel.spark.automl.tuner.acquisition

import com.tencent.angel.spark.automl.tuner.surrogate.Surrogate
import org.apache.commons.logging.{Log, LogFactory}
import org.apache.commons.math3.distribution.NormalDistribution
import org.apache.spark.ml.linalg.{Vector, Vectors}

class EI(
          override val surrogate: Surrogate,
          val par: Double)
  extends Acquisition(surrogate) {

  val LOG: Log = LogFactory.getLog(classOf[Surrogate])

  override def compute(X: Vector, derivative: Boolean = false): (Double, Vector) = {
    val pred = surrogate.predict(X) // (mean, variance)

    // Use the best seen observation as incumbent
    val eta: Double = surrogate.curBest._2
    //println(s"best seen result: $eta")

    val m: Double = pred._1
    val s: Double = Math.sqrt(pred._2)
    //println(s"${X.toArray.mkString("(", ",", ")")}: mean[$m], variance[$s]")

    if (s == 0) {
      // if std is zero, we have observed x on all instances
      // using a RF, std should be never exactly 0.0
      (0.0, Vectors.dense(new Array[Double](X.size)))
    } else {
      val z = (pred._1 - eta - par) / s
      val norm: NormalDistribution = new NormalDistribution
      val cdf: Double = norm.cumulativeProbability(z)
      val pdf: Double = norm.density(z)
      val ei = s * (z * cdf + pdf)
      //println(s"EI of ${X.toArray.mkString("(", ",", ")")}: $ei, cur best: $eta, z: $z, cdf: $cdf, pdf: $pdf")
      (ei, Vectors.dense(new Array[Double](X.size)))
    }
  }
}
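For reference, the expected-improvement formula used above, EI = s * (z * Phi(z) + phi(z)) with z = (m - eta - par) / s, can be written as a standalone helper. The sketch below follows the sign convention of this example; the object, method, and parameter names are illustrative and not part of the Angel project.

import org.apache.commons.math3.distribution.NormalDistribution

object ExpectedImprovementSketch {
  private val standardNormal = new NormalDistribution()

  // mean and stdDev come from a surrogate model's prediction at a candidate point;
  // incumbent is the best observation seen so far, xi an optional exploration margin.
  def expectedImprovement(mean: Double, stdDev: Double, incumbent: Double, xi: Double = 0.0): Double =
    if (stdDev == 0.0) 0.0
    else {
      val z = (mean - incumbent - xi) / stdDev
      stdDev * (z * standardNormal.cumulativeProbability(z) + standardNormal.density(z))
    }
}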
Example 23
Source File: KullbackLeiblerTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License

package org.scalaml.spark.extensibility

import java.lang.Math._

import org.scalaml.Logging
import org.scalatest.{FlatSpec, Matchers}
import org.apache.commons.math3.distribution.NormalDistribution
import org.scalaml.spark.{DatasetGenerator, SessionLifeCycle}

final class KullbackLeiblerTest extends FlatSpec with Matchers with Logging {
  import DatasetGenerator._
  import SessionLifeCycle._

  protected[this] val name = "Spark/Kullback Leibler"

  implicit private val sessionLifeCycle = new SessionLifeCycle {}
  implicit private val sparkSession = sessionLifeCycle.sparkSession

  final val normalGenerator = new NormalDistribution
  final val NUM_DATA_POINTS = 5000
  final val normalData = toDSPairDouble(NUM_DATA_POINTS)((n: Int) => {
    val x = n.toDouble * 0.001
    (x, normalGenerator.density(x))
  })

  it should s"$name divergence using Normal distribution mu=0" in {
    show(s"$name divergence using Normal distribution mu=0")
    val mu = 0.0
    normalKL(mu) should be(0.0 +- 0.001)
  }

  it should s"$name divergence using Normal distribution mu=1.0" in {
    show(s"$name divergence using Normal distribution mu=1.0")
    val mu = 1.0
    normalKL(mu) should be(0.01 +- 0.01)
  }

  it should s"$name divergence using Normal distribution mu=2.0" in {
    show(s"$name divergence using Normal distribution mu=2.0")
    val mu = 2.0
    normalKL(mu) should be(-4.7 +- 0.2)
  }

  it should s"$name divergence using Normal distribution mu=3.0" in {
    show(s"$name divergence using Normal distribution mu=3.0")
    val mu = 3.0
    normalKL(mu) should be(-180.0 +- 2.0)
  }

  private def normalKL(mu: Double): Double = {
    import Math._
    val Inv2PI = 1.0 / sqrt(2.0 * PI)
    val pdf = (x: Double) => { val z = x - mu; Inv2PI * exp(-z * z) }

    val kullbackLeibler = KullbackLeibler(s"Normal mu=$mu", pdf)
    val klValue = kullbackLeibler.kl(normalData).head
    show(s"klValue for $mu $klValue")
    klValue
  }

  it should s"$name divergence using constant distribution" in {
    import Math._
    val kullbackLeibler = KullbackLeibler("Constant", (x: Double) => 2.0)
    val klValue = kullbackLeibler.kl(normalData).head
    klValue should be(-7028.0 +- 10.0)
  }

  it should s"$name formula" in {
    type DataSeq = Seq[(Double, Double)]
    val Eps = 1e-12
    val LogEps = log(Eps)

    def exec(xy: DataSeq, pdf: Double => Double): Double = {
      -xy./:(0.0) {
        case (s, (x, y)) => {
          val px = pdf(x)
          val z = if (abs(y) < Eps) px / Eps else px / y
          val t = if (z < Eps) LogEps else log(z)
          s + px * t
        }
      }
    }

    val h: Seq[(Double, Double)] = Seq.tabulate(1000)(
      (n: Int) => (n.toDouble * 0.001, normalGenerator.density(n.toDouble * 0.001))
    )
    val Inv2PI = 1.0 / sqrt(2.0 * PI)
    exec(h.iterator.toSeq, (x: Double) => Inv2PI * exp(-x * x)) should be(37.7 +- 0.1)
  }
}
// --------------------------- EOF -----------------------------------------------
Example 24
Source File: BootstrapTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License

package org.scalaml.sampling

import org.apache.commons.math3.distribution.{NormalDistribution, RealDistribution}
import org.scalaml.Logging
import org.scalatest.{FlatSpec, Matchers}

import scala.collection.mutable.ArrayBuffer
import scala.util.Random

final class BootstrapTest extends FlatSpec with Matchers with Logging {
  protected val name = "Bootstrap sampling replicates"

  final val NumReplicates1 = 256
  final val NumReplicates2 = 1024
  final val NumDataPoints = 10000

  private def bootstrapEvaluation(
    dist: RealDistribution,
    random: Random,
    coefs: (Double, Double),
    numReplicates: Int
  ): (Double, Double) = {

    val input = (0 until NumDataPoints)./:(new ArrayBuffer[(Double, Double)])(
      (buf, _) => {
        val (a, b) = coefs
        val x = a * random.nextDouble - b
        buf += ((x, dist.density(x)))
      }
    ).toVector

    // Bootstrap for the statistic (mean of the density values)
    val bootstrap = new Bootstrap(
      numReplicates,
      (x: Vector[Double]) => x.sum / x.length,
      input.map(_._2),
      (rLen: Int) => new Random(System.currentTimeMillis).nextInt(rLen)
    )
    (bootstrap.mean, bootstrap.error)
  }

  it should s"$name over an input with the distribution a*r + b $NumReplicates1 replicates" in {
    import Math._
    show(s"$name over an input with the distribution a*r + b $NumReplicates1 replicates")

    val (meanNormal, errorNormal) = bootstrapEvaluation(
      new NormalDistribution,
      new scala.util.Random,
      (5.0, 2.5),
      NumReplicates1
    )
    val expectedMean = 0.185
    show(s"$name meanNormal $meanNormal error $errorNormal")
    abs(expectedMean - meanNormal) < 0.05 should be(true)
    abs(errorNormal) < 0.05 should be(true)
  }

  it should s"$name over an input with the distribution a*r + b $NumReplicates2 replicates" in {
    import Math._
    show(s"$name over an input with the distribution a*r + b $NumReplicates2 replicates")

    val (meanNormal, errorNormal) = bootstrapEvaluation(
      new NormalDistribution,
      new scala.util.Random,
      (5.0, 2.5),
      NumReplicates2
    )
    val expectedMean = 0.185
    show(s"$name meanNormal $meanNormal error $errorNormal")
    abs(expectedMean - meanNormal) < 0.05 should be(true)
    abs(errorNormal) < 0.05 should be(true)
  }
}
// ----------------------------------- EOF -------------------------------------------
Example 25
Source File: KernelDensitySuite.scala From sparkoscope with Apache License 2.0

package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
}
Example 26
Source File: MeanEvaluator.scala From sparkoscope with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 27
Source File: MeanEvaluator.scala From SparkCore with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 28
Source File: CountEvaluator.scala From SparkCore with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.NormalDistribution

private[spark] class CountEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[Long, BoundedDouble] {

  var outputsMerged = 0
  var sum: Long = 0

  override def merge(outputId: Int, taskResult: Long) {
    outputsMerged += 1
    sum += taskResult
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(sum, 1.0, sum, sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val mean = (sum + 1 - p) / p
      val variance = (sum + 1) * (1 - p) / (p * p)
      val stdev = math.sqrt(variance)
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 29
Source File: KernelDensitySuite.scala From drizzle-spark with Apache License 2.0

package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
}
Example 30
Source File: GroupedCountEvaluator.scala From SparkCore with Apache License 2.0

package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 31
Source File: StudentTCacher.scala From SparkCore with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100  // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
Example 32
Source File: EasyPlot.scala From spark-timeseries with Apache License 2.0

package com.cloudera.sparkts

import org.apache.spark.mllib.linalg._

import breeze.plot._

import com.cloudera.sparkts.models.Autoregression

import org.apache.commons.math3.distribution.NormalDistribution

object EasyPlot {
  def ezplot(vec: Vector, style: Char): Figure = {
    val f = Figure()
    val p = f.subplot(0)
    p += plot((0 until vec.size).map(_.toDouble).toArray, vec.toArray, style = style)
    f
  }

  def ezplot(vec: Vector): Figure = ezplot(vec, '-')

  def ezplot(arr: Array[Double], style: Char): Figure = {
    val f = Figure()
    val p = f.subplot(0)
    p += plot(arr.indices.map(_.toDouble).toArray, arr, style = style)
    f
  }

  def ezplot(arr: Array[Double]): Figure = ezplot(arr, '-')

  def ezplot(vecs: Seq[Vector], style: Char): Figure = {
    val f = Figure()
    val p = f.subplot(0)
    val first = vecs.head
    vecs.foreach { vec =>
      p += plot((0 until first.size).map(_.toDouble).toArray, vec.toArray, style)
    }
    f
  }

  def ezplot(vecs: Seq[Vector]): Figure = ezplot(vecs, '-')

  def pacfPlot(data: Array[Double], maxLag: Int, conf: Double = 0.95): Figure = {
    // create AR(maxLag) model, retrieve coefficients and calculate confidence bound
    val model = Autoregression.fitModel(new DenseVector(data), maxLag)
    val pCorrs = model.coefficients // partial autocorrelations are the coefficients in AR(n) model
    val confVal = calcConfVal(conf, data.length)

    // Basic plot information
    val f = Figure()
    val p = f.subplot(0)
    p.title = "Partial autocorrelation function"
    p.xlabel = "Lag"
    p.ylabel = "Partial Autocorrelation"
    drawCorrPlot(pCorrs, confVal, p)
    f
  }

  private[sparkts] def calcConfVal(conf: Double, n: Int): Double = {
    val stdNormDist = new NormalDistribution(0, 1)
    val pVal = (1 - conf) / 2.0
    stdNormDist.inverseCumulativeProbability(1 - pVal) / Math.sqrt(n)
  }

  private[sparkts] def drawCorrPlot(corrs: Array[Double], confVal: Double, p: Plot): Unit = {
    // make decimal ticks visible
    p.setYAxisDecimalTickUnits()
    // plot correlations as vertical lines
    val verticalLines = corrs.zipWithIndex.map { case (corr, ix) =>
      (Array(ix.toDouble + 1, ix.toDouble + 1), Array(0, corr))
    }
    verticalLines.foreach { case (xs, ys) => p += plot(xs, ys) }
    // plot confidence intervals as horizontal lines
    val n = corrs.length
    Array(confVal, -1 * confVal).foreach { conf =>
      val xs = (0 to n).toArray.map(_.toDouble)
      val ys = Array.fill(n + 1)(conf)
      p += plot(xs, ys, '-', colorcode = "red")
    }
  }
}
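The calcConfVal helper above computes the usual +/- z_(1 - alpha/2) / sqrt(n) confidence band drawn on correlograms; as a rough check, conf = 0.95 and n = 100 gives about 1.96 / 10, roughly 0.196. A minimal standalone sketch of the same calculation (object and method names are illustrative only):

import org.apache.commons.math3.distribution.NormalDistribution

object CorrelogramBandSketch {
  // Confidence band half-width for an (partial) autocorrelation plot: z_(1 - alpha/2) / sqrt(n)
  def band(conf: Double, n: Int): Double =
    new NormalDistribution(0, 1).inverseCumulativeProbability(1 - (1 - conf) / 2.0) / math.sqrt(n)

  def main(args: Array[String]): Unit =
    println(band(0.95, 100))  // ~0.196
}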
Example 33
Source File: StatsSuite.scala From hail with MIT License

package is.hail.stats

import breeze.linalg.DenseMatrix
import is.hail.TestUtils._
import is.hail.testUtils._
import is.hail.utils._
import is.hail.variant._
import is.hail.{HailSuite, TestUtils}
import org.apache.commons.math3.distribution.{ChiSquaredDistribution, NormalDistribution}
import org.testng.annotations.Test

class StatsSuite extends HailSuite {

  @Test def chiSquaredTailTest() {
    val chiSq1 = new ChiSquaredDistribution(1)
    assert(D_==(chiSquaredTail(1d, 1), 1 - chiSq1.cumulativeProbability(1d)))
    assert(D_==(chiSquaredTail(5.52341d, 1), 1 - chiSq1.cumulativeProbability(5.52341d)))

    val chiSq2 = new ChiSquaredDistribution(2)
    assert(D_==(chiSquaredTail(1, 2), 1 - chiSq2.cumulativeProbability(1)))
    assert(D_==(chiSquaredTail(5.52341, 2), 1 - chiSq2.cumulativeProbability(5.52341)))

    val chiSq5 = new ChiSquaredDistribution(5.2)
    assert(D_==(chiSquaredTail(1, 5.2), 1 - chiSq5.cumulativeProbability(1)))
    assert(D_==(chiSquaredTail(5.52341, 5.2), 1 - chiSq5.cumulativeProbability(5.52341)))

    assert(D_==(inverseChiSquaredTail(.1, 1.0), chiSq1.inverseCumulativeProbability(1 - .1)))
    assert(D_==(inverseChiSquaredTail(.0001, 1.0), chiSq1.inverseCumulativeProbability(1 - .0001)))

    val a = List(.0000000001, .5, .9999999999, 1.0)
    a.foreach(p => assert(D_==(chiSquaredTail(inverseChiSquaredTail(p, 1.0), 1.0), p)))

    // compare with R
    assert(math.abs(chiSquaredTail(400, 1) - 5.507248e-89) < 1e-93)
    assert(D_==(inverseChiSquaredTail(5.507248e-89, 1), 400))
  }

  @Test def normalTest() {
    val normalDist = new NormalDistribution()
    assert(D_==(pnorm(1), normalDist.cumulativeProbability(1)))
    assert(math.abs(pnorm(-10) - normalDist.cumulativeProbability(-10)) < 1e-10)
    assert(D_==(qnorm(.6), normalDist.inverseCumulativeProbability(.6)))
    assert(D_==(qnorm(.0001), normalDist.inverseCumulativeProbability(.0001)))

    val a = List(0.0, .0000000001, .5, .9999999999, 1.0)
    assert(a.forall(p => D_==(qnorm(pnorm(qnorm(p))), qnorm(p))))

    // compare with R
    assert(math.abs(pnorm(-20) - 2.753624e-89) < 1e-93)
    assert(D_==(qnorm(2.753624e-89), -20))
  }

  @Test def poissonTest() {
    // compare with R
    assert(D_==(dpois(5, 10), 0.03783327))
    assert(qpois(0.3, 10) == 8)
    assert(qpois(0.3, 10, lowerTail = false, logP = false) == 12)
    assert(D_==(ppois(5, 10), 0.06708596))
    assert(D_==(ppois(5, 10, lowerTail = false, logP = false), 0.932914))

    assert(qpois(ppois(5, 10), 10) == 5)
    assert(qpois(ppois(5, 10, lowerTail = false, logP = false), 10, lowerTail = false, logP = false) == 5)

    assert(ppois(30, 1, lowerTail = false, logP = false) > 0)
  }

  @Test def betaTest() {
    val tol = 1e-5
    assert(D_==(dbeta(.2, 1, 3), 1.92, tol))
    assert(D_==(dbeta(0.70, 2, 10), 0.001515591, tol))
    assert(D_==(dbeta(.4, 5, 3), 0.96768, tol))
    assert(D_==(dbeta(.3, 7, 2), 0.0285768, tol))
    assert(D_==(dbeta(.8, 2, 2), .96, tol))
    assert(D_==(dbeta(.1, 3, 6), 0.9920232, tol))
    assert(D_==(dbeta(.6, 3, 4), 1.3824, tol))
    assert(D_==(dbeta(.1, 1, 1), 1, tol))
    assert(D_==(dbeta(.2, 4, 7), 1.761608, tol))
    assert(D_==(dbeta(.2, 1, 2), 1.6, tol))
  }

  @Test def entropyTest() {
    assert(D_==(entropy("accctg"), 1.79248, tolerance = 1e-5))
    assert(D_==(entropy(Array(2, 3, 4, 5, 6, 6, 4)), 2.23593, tolerance = 1e-5))
  }
}
Example 34
Source File: KernelDensitySuite.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
}
Example 35
Source File: MeanEvaluator.scala From multi-tenancy-spark with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 36
Source File: KernelDensitySuite.scala From iolap with Apache License 2.0

package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.SparkFunSuite
import org.apache.spark.mllib.util.MLlibTestSparkContext

class KernelDensitySuite extends SparkFunSuite with MLlibTestSparkContext {
  test("kernel density single sample") {
    val rdd = sc.parallelize(Array(5.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal = new NormalDistribution(5.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(densities(0) - normal.density(5.0)) < acceptableErr)
    assert(math.abs(densities(1) - normal.density(6.0)) < acceptableErr)
  }

  test("kernel density multiple samples") {
    val rdd = sc.parallelize(Array(5.0, 10.0))
    val evaluationPoints = Array(5.0, 6.0)
    val densities = new KernelDensity().setSample(rdd).setBandwidth(3.0).estimate(evaluationPoints)
    val normal1 = new NormalDistribution(5.0, 3.0)
    val normal2 = new NormalDistribution(10.0, 3.0)
    val acceptableErr = 1e-6
    assert(math.abs(
      densities(0) - (normal1.density(5.0) + normal2.density(5.0)) / 2) < acceptableErr)
    assert(math.abs(
      densities(1) - (normal1.density(6.0) + normal2.density(6.0)) / 2) < acceptableErr)
  }
}
Example 37
Source File: MeanEvaluator.scala From iolap with Apache License 2.0

package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}