org.apache.commons.math3.distribution.TDistribution Scala Examples
The following examples show how to use org.apache.commons.math3.distribution.TDistribution.
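Most of the examples below rely on the same small TDistribution surface: the constructor takes the degrees of freedom, inverseCumulativeProbability maps a confidence level to a two-sided critical value, and cumulativeProbability maps a test statistic to a tail probability. A minimal, self-contained sketch of that pattern (the object name and values are illustrative only, not taken from any of the projects below):

import org.apache.commons.math3.distribution.TDistribution

object TDistributionSketch {
  def main(args: Array[String]): Unit = {
    val degreesOfFreedom = 9.0   // e.g. a sample of 10 observations
    val confidence = 0.95
    val tDist = new TDistribution(degreesOfFreedom)

    // Two-sided critical value: P(|T| <= critical) = confidence
    val critical = tDist.inverseCumulativeProbability((1 + confidence) / 2)

    // Two-sided p-value for an observed t statistic
    val t = 2.1
    val pValue = 2.0 * tDist.cumulativeProbability(-math.abs(t))

    println(s"critical value = $critical, p-value = $pValue")
  }
}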
Example 1
Source File: StudentTCacher.scala From iolap with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100 // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
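StudentTCacher caches the two-sided critical value per sample size and falls back to a single normal-approximation value once the sample size reaches 100. A minimal usage sketch (the values are made up; because the class is private[spark], the call would have to live inside the org.apache.spark.partial package):

// Hypothetical usage from within org.apache.spark.partial
val cacher = new StudentTCacher(0.95)
val smallSampleFactor = cacher.get(20)   // t-distribution with 19 degrees of freedom
val largeSampleFactor = cacher.get(5000) // returns the precomputed normal approximation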
Example 2
Source File: LinearRegressionGwas.scala From glow with Apache License 2.0
package io.projectglow.sql.expressions

import breeze.linalg.DenseVector
import org.apache.commons.math3.distribution.TDistribution
import org.apache.commons.math3.util.FastMath
import org.apache.spark.sql.catalyst.InternalRow

import io.projectglow.common.GlowLogging

case class RegressionStats(beta: Double, standardError: Double, pValue: Double)

object LinearRegressionGwas extends GlowLogging {

  def runRegression(
      genotypes: DenseVector[Double],
      phenotypes: DenseVector[Double],
      covariateQRContext: CovariateQRContext): RegressionStats = {
    require(
      genotypes.length == phenotypes.length,
      "Number of samples differs between genotype and phenotype arrays")
    require(
      covariateQRContext.covQt.cols == genotypes.length,
      "Number of samples differs between genotype array and covariate matrix")

    val qtx = covariateQRContext.covQt * genotypes
    val qty = covariateQRContext.covQt * phenotypes
    val xdoty = (phenotypes dot genotypes) - (qty dot qtx)
    val xdotx = (genotypes dot genotypes) - (qtx dot qtx)
    val ydoty = (phenotypes dot phenotypes) - (qty dot qty)
    val beta = xdoty / xdotx
    val standardError =
      FastMath.sqrt((ydoty / xdotx - beta * beta) / covariateQRContext.degreesOfFreedom)

    // t-statistic
    val t = beta / standardError
    val tDist = new TDistribution(covariateQRContext.degreesOfFreedom)
    val pvalue = 2 * tDist.cumulativeProbability(-Math.abs(t))
    RegressionStats(beta, standardError, pvalue)
  }

  def linearRegressionGwas(
      genotypes: DenseVector[Double],
      phenotypes: DenseVector[Double],
      covariateQR: CovariateQRContext): InternalRow = {
    val regressionStats = runRegression(genotypes, phenotypes, covariateQR)
    InternalRow(regressionStats.beta, regressionStats.standardError, regressionStats.pValue)
  }
}
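The p-value in runRegression is the usual two-sided test of beta = 0: t is the coefficient divided by its standard error, and because the t-distribution is symmetric, doubling the lower-tail probability at -|t| covers both tails. A standalone sketch of just that step, with made-up numbers:

import org.apache.commons.math3.distribution.TDistribution

val beta = 0.42                 // illustrative values only
val standardError = 0.15
val degreesOfFreedom = 97.0

val t = beta / standardError
val tDist = new TDistribution(degreesOfFreedom)
val pValue = 2.0 * tDist.cumulativeProbability(-math.abs(t))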
Example 3
Source File: StudentTCacher.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100 // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
Example 4
Source File: SumEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
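SumEvaluator extrapolates from the fraction p of partitions merged so far: the count estimate and its variance scale the observed count by 1/p, and sumVar applies the variance of a product of independent estimates, Var(X*Y) = E[X]^2*Var(Y) + E[Y]^2*Var(X) + Var(X)*Var(Y), to the mean and count estimates. The interval half-width is then confFactor * sqrt(sumVar), with confFactor chosen exactly as in the mean evaluators. A small sketch of that arithmetic with made-up numbers:

// Illustrative numbers only; mirrors the sumVar expression above
val meanEstimate = 10.0;  val meanVar = 0.25
val countEstimate = 400.0; val countVar = 900.0

val sumVar = (meanEstimate * meanEstimate * countVar) +
  (countEstimate * countEstimate * meanVar) +
  (meanVar * countVar)
val sumStdev = math.sqrt(sumVar) // interval half-width is confFactor * sumStdev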
Example 5
Source File: MeanEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 6
Source File: MeanEvaluator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
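Note that this version writes the quantile as (1 + confidence) / 2 while the older evaluators write 1 - (1 - confidence) / 2; the two expressions are algebraically identical, so both request the same two-sided critical value. A one-line check:

val confidence = 0.95
val pOld = 1 - (1 - confidence) / 2   // form used in the older evaluators
val pNew = (1 + confidence) / 2       // form used here
assert(math.abs(pOld - pNew) < 1e-12) // both equal 0.975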
Example 7
Source File: StudentTCacher.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100 // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
Example 8
Source File: SumEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
Example 9
Source File: MeanEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 10
Source File: MeanEvaluator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 11
Source File: SumEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
Example 12
Source File: MeanEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 13
Source File: MeanEvaluator.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 14
Source File: TwoSampleIndependentTTest.scala From StatisticsOnSpark with Apache License 2.0
package org.apache.spark.mllib.stat

import org.apache.commons.math3.distribution.TDistribution
import org.apache.commons.math3.util.FastMath
import org.apache.spark.rdd.RDD

// The enclosing class declaration was elided in the original listing; it is restored here
// (named after the source file) so that the method compiles.
class TwoSampleIndependentTTest {

  def tTest(sample1: RDD[Double], sample2: RDD[Double]): Double = {
    val n1 = sample1.count()
    val n2 = sample2.count()
    val m1 = sample1.sum() / n1
    val m2 = sample2.sum() / n2
    val v1 = sample1.map(d => (d - m1) * (d - m1)).sum() / (n1 - 1)
    val v2 = sample2.map(d => (d - m2) * (d - m2)).sum() / (n2 - 1)
    val t: Double = math.abs((m1 - m2) / FastMath.sqrt((v1 / n1) + (v2 / n2)))
    val degreesOfFreedom: Double = (((v1 / n1) + (v2 / n2)) * ((v1 / n1) + (v2 / n2))) /
      ((v1 * v1) / (n1 * n1 * (n1 - 1d)) + (v2 * v2) / (n2 * n2 * (n2 - 1d)))

    // pass a null rng to avoid unneeded overhead as we will not sample from this distribution
    val distribution: TDistribution = new TDistribution(null, degreesOfFreedom)
    2.0 * distribution.cumulativeProbability(-t)
  }
}
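This is Welch's unequal-variance t-test: the degrees of freedom come from the Welch-Satterthwaite formula, and the TDistribution is constructed with a null random generator because only cumulativeProbability is needed, never sampling. A hypothetical usage sketch, assuming an existing SparkContext named sc and the enclosing class restored in the listing above:

// Illustrative data; returns the two-sided p-value
val groupA = sc.parallelize(Seq(5.1, 4.9, 5.4, 5.0, 5.2))
val groupB = sc.parallelize(Seq(4.6, 4.8, 4.5, 4.7, 4.9))
val pValue = new TwoSampleIndependentTTest().tTest(groupA, groupB)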
Example 15
Source File: StudentTCacher.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

private[spark] class StudentTCacher(confidence: Double) {

  val NORMAL_APPROX_SAMPLE_SIZE = 100 // For samples bigger than this, use Gaussian approximation

  val normalApprox = new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
  val cache = Array.fill[Double](NORMAL_APPROX_SAMPLE_SIZE)(-1.0)

  def get(sampleSize: Long): Double = {
    if (sampleSize >= NORMAL_APPROX_SAMPLE_SIZE) {
      normalApprox
    } else {
      val size = sampleSize.toInt
      if (cache(size) < 0) {
        val tDist = new TDistribution(size - 1)
        cache(size) = tDist.inverseCumulativeProbability(1 - (1 - confidence) / 2)
      }
      cache(size)
    }
  }
}
Example 16
Source File: SumEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution}

import org.apache.spark.util.StatCounter

private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val meanEstimate = counter.mean
      val meanVar = counter.sampleVariance / counter.count
      val countEstimate = (counter.count + 1 - p) / p
      val countVar = (counter.count + 1) * (1 - p) / (p * p)
      val sumEstimate = meanEstimate * countEstimate
      val sumVar = (meanEstimate * meanEstimate * countVar) +
        (countEstimate * countEstimate * meanVar) +
        (meanVar * countVar)
      val sumStdev = math.sqrt(sumVar)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = sumEstimate - confFactor * sumStdev
      val high = sumEstimate + confFactor * sumStdev
      new BoundedDouble(sumEstimate, confidence, low, high)
    }
  }
}
Example 17
Source File: MeanEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  var outputsMerged = 0
  var counter = new StatCounter

  override def merge(outputId: Int, taskResult: StatCounter) {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = {
        if (counter.count > 100) {
          new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2)
        } else {
          val degreesOfFreedom = (counter.count - 1).toInt
          new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2)
        }
      }
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}
Example 18
Source File: MeanEvaluator.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}

import org.apache.spark.util.StatCounter

private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[StatCounter, BoundedDouble] {

  private var outputsMerged = 0
  private val counter = new StatCounter()

  override def merge(outputId: Int, taskResult: StatCounter): Unit = {
    outputsMerged += 1
    counter.merge(taskResult)
  }

  override def currentResult(): BoundedDouble = {
    if (outputsMerged == totalOutputs) {
      new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean)
    } else if (outputsMerged == 0 || counter.count == 0) {
      new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity)
    } else if (counter.count == 1) {
      new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity)
    } else {
      val mean = counter.mean
      val stdev = math.sqrt(counter.sampleVariance / counter.count)
      val confFactor = if (counter.count > 100) {
        // For large n, the normal distribution is a good approximation to t-distribution
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        // t-distribution describes distribution of actual population mean
        // note that if this goes to 0, TDistribution will throw an exception.
        // Hence special casing 1 above.
        val degreesOfFreedom = (counter.count - 1).toInt
        new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2)
      }
      // Symmetric, so confidence interval is symmetric about mean of distribution
      val low = mean - confFactor * stdev
      val high = mean + confFactor * stdev
      new BoundedDouble(mean, confidence, low, high)
    }
  }
}