org.apache.spark.util.StatCounter Scala Examples
The following examples show how to use org.apache.spark.util.StatCounter.
Each example notes the original project and source file it was taken from.
Example 1
Source File: MeanEvaluator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
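For reference, the same interval arithmetic can be exercised outside Spark's partial-results machinery. The following is a minimal sketch (the object name and main method are ours, not Spark's) that merges two per-task StatCounters the way MeanEvaluator.merge does and applies the same t/normal confidence factor:

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}
import org.apache.spark.util.StatCounter

object MeanIntervalSketch {
  // Confidence interval for the mean, mirroring the branch logic in the evaluator above.
  def confidenceInterval(counter: StatCounter, confidence: Double): (Double, Double) = {
    val mean = counter.mean
    val stdErr = math.sqrt(counter.sampleVariance / counter.count)
    val confFactor =
      if (counter.count > 100) {
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        new TDistribution((counter.count - 1).toInt).inverseCumulativeProbability((1 + confidence) / 2)
      }
    (mean - confFactor * stdErr, mean + confFactor * stdErr)
  }

  def main(args: Array[String]): Unit = {
    val counter = new StatCounter()
    counter.merge(StatCounter(Seq(1.0, 3.0)))   // "task result" 1
    counter.merge(StatCounter(Seq(8.0)))        // "task result" 2
    val (low, high) = confidenceInterval(counter, 0.95)
    println(s"mean=${counter.mean}, 95% interval=[$low, $high]")
  }
}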
Example 2
Source File: MeanEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 3
Source File: SumEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
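The distinctive part of SumEvaluator is the extrapolation from the observed fraction p of output partitions to an estimated total count, with the variance of the product of the two estimates propagated into the interval. A standalone sketch of just that arithmetic, using the same formulas as above (the object and method names are ours):

import org.apache.spark.util.StatCounter

object SumExtrapolationSketch {
  // Returns (estimated total sum, its standard deviation) given the fraction p of
  // output partitions merged so far.
  def estimateSum(counter: StatCounter, p: Double): (Double, Double) = {
    val meanEstimate = counter.mean
    val meanVar = counter.sampleVariance / counter.count
    val countEstimate = (counter.count + 1 - p) / p
    val countVar = (counter.count + 1) * (1 - p) / (p * p)
    val sumEstimate = meanEstimate * countEstimate
    // Variance of a product of two (assumed independent) estimates.
    val sumVar = meanEstimate * meanEstimate * countVar +
      countEstimate * countEstimate * meanVar +
      meanVar * countVar
    (sumEstimate, math.sqrt(sumVar))
  }
}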
Example 4
Source File: GroupedMeanEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
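The merge step above folds each task's per-key StatCounter into a running JHashMap. The same idea expressed with plain Scala collections, as a small illustrative sketch (names are ours):

import scala.collection.mutable
import org.apache.spark.util.StatCounter

object GroupedMergeSketch {
  // Fold one task's per-key statistics into the running totals, in place.
  def mergeTaskResult(
      sums: mutable.HashMap[String, StatCounter],
      taskResult: Map[String, StatCounter]): Unit = {
    taskResult.foreach { case (key, counter) =>
      sums.get(key) match {
        case Some(existing) => existing.merge(counter) // known key: merge statistics
        case None => sums.put(key, counter)            // new key: adopt the counter
      }
    }
  }
}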
Example 5
Source File: EnsembleTestHelper.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) => label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => // MAE (mean absolute error) is the mean of the absolute deviations of the individual observations from the arithmetic mean // math.abs returns the absolute value of a number errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
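This helper imports StatCounter but computes MSE and MAE directly from the error sequence. The same two metrics expressed through StatCounter, as a short illustrative sketch (our names):

import org.apache.spark.util.StatCounter

object RegressionMetricsSketch {
  // Mean squared error: mean of the squared errors.
  def mse(errors: Seq[Double]): Double = StatCounter(errors.map(e => e * e)).mean
  // Mean absolute error: mean of the absolute errors.
  def mae(errors: Seq[Double]): Double = StatCounter(errors.map(math.abs)).mean
}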
Example 6
Source File: GroupedSumEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
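This version returns the java.util.HashMap as a Scala Map via the implicit scala.collection.JavaConversions.mapAsScalaMap conversion; the BigDatalog variant in Example 15 uses the explicit JavaConverters idiom instead. A minimal sketch of the explicit form (names are ours):

import java.util.{HashMap => JHashMap}
import scala.collection.JavaConverters._
import scala.collection.Map

object JMapConversionSketch {
  // Wrap a Java HashMap as a Scala Map explicitly rather than through implicit conversion.
  def toScalaMap[K, V](jmap: JHashMap[K, V]): Map[K, V] = jmap.asScala
}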
Example 7
Source File: MeanEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 8
Source File: SumEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
Example 9
Source File: GroupedMeanEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 10
Source File: EnsembleTestHelper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 11
Source File: GroupedSumEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
Example 12
Source File: MeanEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
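The interval asserted in the "test count > 1" case can be checked by hand: after merging 1.0 and 3.0 the mean is 2.0 and the sample variance is 2.0, so the standard error is sqrt(2.0 / 2) = 1.0, and the 97.5% quantile of a t-distribution with one degree of freedom is about 12.706. A tiny standalone check (ours, not part of the suite):

import org.apache.commons.math3.distribution.TDistribution

object MeanIntervalCheck extends App {
  val t = new TDistribution(1).inverseCumulativeProbability(0.975) // ~12.7062
  println((2.0 - t, 2.0 + t)) // approximately (-10.706, 14.706), as asserted above
}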
Example 13
Source File: SumEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
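The NaN test works because a single NaN poisons StatCounter's running mean. A one-line standalone check (ours, not part of the suite):

import org.apache.spark.util.StatCounter

object NaNCheck extends App {
  // NaN propagates through the running-mean update, so the merged mean is NaN.
  println(new StatCounter(Seq(1.0, Double.NaN, 2.0)).mean.isNaN) // true
}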
Example 14
Source File: EnsembleTestHelper.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 15
Source File: GroupedSumEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum)) } result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high)) } result.asScala } } }
Example 16
Source File: MeanEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 17
Source File: SumEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
Example 18
Source File: GroupedMeanEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean)) } result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high)) } result.asScala } } }
Example 19
Source File: DoubleDCFunctions.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.dc import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.util.StatCounter class DoubleDCFunctions(self: DC[Double]) { def sum: DR[Double] = { self.mapToResult(_.sum) } def stats: DR[StatCounter] = { self.mapToResult(_.stats) } def mean: DR[Double] = { self.mapToResult(_.mean) } def variance: DR[Double] = { self.mapToResult(_.variance) } def stdev: DR[Double] = { self.mapToResult(_.stdev) } def sampleStdev: DR[Double] = { self.mapToResult(_.sampleStdev) } def sampleVariance: DR[Double] = { self.mapToResult(_.sampleVariance) } // Experimental def meanApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = { self.mapToResult(_.meanApprox(timeout, confidence)) } // Experimental def sumApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = { self.mapToResult(_.sumApprox(timeout, confidence)) } def histogram(bucketCount: Int): DR[(Array[Double], Array[Long])] = { self.mapToResult(_.histogram(bucketCount)) } def histogram(buckets: Array[Double], evenBuckets: Boolean = false): DR[Array[Long]] = { self.mapToResult(_.histogram(buckets, evenBuckets)) } }
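These DC wrappers mirror the methods Spark exposes on RDD[Double] through DoubleRDDFunctions, all of which are backed by StatCounter. A hedged usage sketch of that underlying RDD API (the local-mode setup and object name are ours):

import org.apache.spark.{SparkConf, SparkContext}

object DoubleRDDStatsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("stats-sketch"))
    val rdd = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
    val stats = rdd.stats() // a StatCounter with count, mean, variance, min, max
    println(s"count=${stats.count}, mean=${stats.mean}, stdev=${stats.stdev}")
    val approx = rdd.meanApprox(timeout = 1000L, confidence = 0.95)
    println(approx.initialValue) // a BoundedDouble with mean, confidence, low, high
    sc.stop()
  }
}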
Example 20
Source File: GroupedSumEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
Example 21
Source File: EnsembleTestHelper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 22
Source File: MeanEvaluator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 23
Source File: MeanEvaluatorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
Example 24
Source File: SumEvaluatorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
Example 25
Source File: EnsembleTestHelper.scala From sparkoscope with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 26
Source File: MeanEvaluator.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 27
Source File: MeanEvaluatorSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
Example 28
Source File: SumEvaluatorSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
Example 29
Source File: StreamingTestMethod.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.stat.test import java.io.Serializable import scala.language.implicitConversions import scala.math.pow import com.twitter.chill.MeatLocker import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues import org.apache.commons.math3.stat.inference.TTest import org.apache.spark.internal.Logging import org.apache.spark.streaming.dstream.DStream import org.apache.spark.util.StatCounter private[stat] object StreamingTestMethod { // Note: after new `StreamingTestMethod`s are implemented, please update this map. private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map( "welch" -> WelchTTest, "student" -> StudentTTest) def getTestMethodFromName(method: String): StreamingTestMethod = TEST_NAME_TO_OBJECT.get(method) match { case Some(test) => test case None => throw new IllegalArgumentException( "Unrecognized method name. Supported streaming test methods: " + TEST_NAME_TO_OBJECT.keys.mkString(", ")) } }
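The test methods behind this registry (WelchTTest and StudentTTest) ultimately feed per-group statistics into commons-math3's TTest. A hedged sketch of that idea; the StatCounter-to-StatisticalSummaryValues conversion below is our illustration, not necessarily the exact one Spark uses:

import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
import org.apache.commons.math3.stat.inference.TTest
import org.apache.spark.util.StatCounter

object WelchSketch {
  // Summarize a StatCounter in the form commons-math3 expects.
  def toSummary(s: StatCounter): StatisticalSummaryValues =
    new StatisticalSummaryValues(s.mean, s.sampleVariance, s.count, s.max, s.min, s.sum)

  def main(args: Array[String]): Unit = {
    val a = StatCounter(Seq(1.0, 2.0, 3.0, 4.0))
    val b = StatCounter(Seq(2.0, 3.0, 4.0, 5.0))
    // Two-sided p-value of Welch's t-test (unequal variances assumed).
    println(new TTest().tTest(toSummary(a), toSummary(b)))
  }
}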
Example 30
Source File: MeanEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 31
Source File: SumEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
Example 32
Source File: GroupedMeanEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 33
Source File: EnsembleTestHelper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 34
Source File: MeanEvaluator.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 35
Source File: MeanEvaluatorSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
Example 36
Source File: SumEvaluatorSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
Example 37
Source File: EnsembleTestHelper.scala From iolap with Apache License 2.0
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) => prediction - label } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }