org.apache.spark.util.StatCounter Scala Examples
The following examples show how to use org.apache.spark.util.StatCounter.
Each example notes the original project and source file it was taken from.
Example 1
Source File: MeanEvaluator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
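For reference, the same interval arithmetic can be exercised outside Spark's partial-results machinery. The following is a minimal sketch (the object name and main method are ours, not Spark's) that merges two per-task StatCounters the way MeanEvaluator.merge does and applies the same t/normal confidence factor:

import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution}
import org.apache.spark.util.StatCounter

object MeanIntervalSketch {
  // Confidence interval for the mean, mirroring the branch logic in the evaluator above.
  def confidenceInterval(counter: StatCounter, confidence: Double): (Double, Double) = {
    val mean = counter.mean
    val stdErr = math.sqrt(counter.sampleVariance / counter.count)
    val confFactor =
      if (counter.count > 100) {
        new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2)
      } else {
        new TDistribution((counter.count - 1).toInt).inverseCumulativeProbability((1 + confidence) / 2)
      }
    (mean - confFactor * stdErr, mean + confFactor * stdErr)
  }

  def main(args: Array[String]): Unit = {
    val counter = new StatCounter()
    counter.merge(StatCounter(Seq(1.0, 3.0)))   // "task result" 1
    counter.merge(StatCounter(Seq(8.0)))        // "task result" 2
    val (low, high) = confidenceInterval(counter, 0.95)
    println(s"mean=${counter.mean}, 95% interval=[$low, $high]")
  }
}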
Example 2
Source File: MeanEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 3
Source File: SumEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
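The distinctive part of SumEvaluator is the extrapolation from the observed fraction p of output partitions to an estimated total count, with the variance of the product of the two estimates propagated into the interval. A standalone sketch of just that arithmetic, using the same formulas as above (the object and method names are ours):

import org.apache.spark.util.StatCounter

object SumExtrapolationSketch {
  // Returns (estimated total sum, its standard deviation) given the fraction p of
  // output partitions merged so far.
  def estimateSum(counter: StatCounter, p: Double): (Double, Double) = {
    val meanEstimate = counter.mean
    val meanVar = counter.sampleVariance / counter.count
    val countEstimate = (counter.count + 1 - p) / p
    val countVar = (counter.count + 1) * (1 - p) / (p * p)
    val sumEstimate = meanEstimate * countEstimate
    // Variance of a product of two (assumed independent) estimates.
    val sumVar = meanEstimate * meanEstimate * countVar +
      countEstimate * countEstimate * meanVar +
      meanVar * countVar
    (sumEstimate, math.sqrt(sumVar))
  }
}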
Example 4
Source File: GroupedMeanEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
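The merge step above folds each task's per-key StatCounter into a running JHashMap. The same idea expressed with plain Scala collections, as a small illustrative sketch (names are ours):

import scala.collection.mutable
import org.apache.spark.util.StatCounter

object GroupedMergeSketch {
  // Fold one task's per-key statistics into the running totals, in place.
  def mergeTaskResult(
      sums: mutable.HashMap[String, StatCounter],
      taskResult: Map[String, StatCounter]): Unit = {
    taskResult.foreach { case (key, counter) =>
      sums.get(key) match {
        case Some(existing) => existing.merge(counter) // known key: merge statistics
        case None => sums.put(key, counter)            // new key: adopt the counter
      }
    }
  }
}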
Example 5
Source File: EnsembleTestHelper.scala From spark1.52 with Apache License 2.0
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) => label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => // MAE (mean absolute error) is the mean of the absolute deviations of the individual observations from the arithmetic mean // math.abs returns the absolute value of a number errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
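This helper imports StatCounter but computes MSE and MAE directly from the error sequence. The same two metrics expressed through StatCounter, as a short illustrative sketch (our names):

import org.apache.spark.util.StatCounter

object RegressionMetricsSketch {
  // Mean squared error: mean of the squared errors.
  def mse(errors: Seq[Double]): Double = StatCounter(errors.map(e => e * e)).mean
  // Mean absolute error: mean of the absolute errors.
  def mae(errors: Seq[Double]): Double = StatCounter(errors.map(math.abs)).mean
}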
Example 6
Source File: GroupedSumEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
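This version returns the java.util.HashMap as a Scala Map via the implicit scala.collection.JavaConversions.mapAsScalaMap conversion; the BigDatalog variant in Example 15 uses the explicit JavaConverters idiom instead. A minimal sketch of the explicit form (names are ours):

import java.util.{HashMap => JHashMap}
import scala.collection.JavaConverters._
import scala.collection.Map

object JMapConversionSketch {
  // Wrap a Java HashMap as a Scala Map explicitly rather than through implicit conversion.
  def toScalaMap[K, V](jmap: JHashMap[K, V]): Map[K, V] = jmap.asScala
}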
Example 7
Source File: MeanEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 8
Source File: SumEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
Example 9
Source File: GroupedMeanEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 10
Source File: EnsembleTestHelper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 11
Source File: GroupedSumEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
Example 12
Source File: MeanEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
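The interval asserted in the "test count > 1" case can be checked by hand: after merging 1.0 and 3.0 the mean is 2.0 and the sample variance is 2.0, so the standard error is sqrt(2.0 / 2) = 1.0, and the 97.5% quantile of a t-distribution with one degree of freedom is about 12.706. A tiny standalone check (ours, not part of the suite):

import org.apache.commons.math3.distribution.TDistribution

object MeanIntervalCheck extends App {
  val t = new TDistribution(1).inverseCumulativeProbability(0.975) // ~12.7062
  println((2.0 - t, 2.0 + t)) // approximately (-10.706, 14.706), as asserted above
}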
Example 13
Source File: SumEvaluatorSuite.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
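The NaN test works because a single NaN poisons StatCounter's running mean. A one-line standalone check (ours, not part of the suite):

import org.apache.spark.util.StatCounter

object NaNCheck extends App {
  // NaN propagates through the running-mean update, so the merged mean is NaN.
  println(new StatCounter(Seq(1.0, Double.NaN, 2.0)).mean.isNaN) // true
}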
Example 14
Source File: EnsembleTestHelper.scala From BigDatalog with Apache License 2.0
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 15
Source File: GroupedSumEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum)) } result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high)) } result.asScala } } }
Example 16
Source File: MeanEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 17
Source File: SumEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
Example 18
Source File: GroupedMeanEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean)) } result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high)) } result.asScala } } }
Example 19
Source File: DoubleDCFunctions.scala From spark-flow with Apache License 2.0
package com.bloomberg.sparkflow.dc import org.apache.spark.partial.{BoundedDouble, PartialResult} import org.apache.spark.util.StatCounter class DoubleDCFunctions(self: DC[Double]) { def sum: DR[Double] = { self.mapToResult(_.sum) } def stats: DR[StatCounter] = { self.mapToResult(_.stats) } def mean: DR[Double] = { self.mapToResult(_.mean) } def variance: DR[Double] = { self.mapToResult(_.variance) } def stdev: DR[Double] = { self.mapToResult(_.stdev) } def sampleStdev: DR[Double] = { self.mapToResult(_.sampleStdev) } def sampleVariance: DR[Double] = { self.mapToResult(_.sampleVariance) } // Experimental def meanApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = { self.mapToResult(_.meanApprox(timeout, confidence)) } // Experimental def sumApprox(timeout: Long, confidence: Double = 0.95): DR[PartialResult[BoundedDouble]] = { self.mapToResult(_.sumApprox(timeout, confidence)) } def histogram(bucketCount: Int): DR[(Array[Double], Array[Long])] = { self.mapToResult(_.histogram(bucketCount)) } def histogram(buckets: Array[Double], evenBuckets: Boolean = false): DR[Array[Long]] = { self.mapToResult(_.histogram(buckets, evenBuckets)) } }
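These DC wrappers mirror the methods Spark exposes on RDD[Double] through DoubleRDDFunctions, all of which are backed by StatCounter. A hedged usage sketch of that underlying RDD API (the local-mode setup and object name are ours):

import org.apache.spark.{SparkConf, SparkContext}

object DoubleRDDStatsSketch {
  def main(args: Array[String]): Unit = {
    val sc = new SparkContext(new SparkConf().setMaster("local[*]").setAppName("stats-sketch"))
    val rdd = sc.parallelize(Seq(1.0, 2.0, 3.0, 4.0))
    val stats = rdd.stats() // a StatCounter with count, mean, variance, min, max
    println(s"count=${stats.count}, mean=${stats.mean}, stdev=${stats.stdev}")
    val approx = rdd.meanApprox(timeout = 1000L, confidence = 0.95)
    println(approx.initialValue) // a BoundedDouble with mean, confidence, low, high
    sc.stop()
  }
}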
Example 20
Source File: GroupedSumEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
Example 21
Source File: EnsembleTestHelper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 22
Source File: MeanEvaluator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 23
Source File: MeanEvaluatorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
Example 24
Source File: SumEvaluatorSuite.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
Example 25
Source File: EnsembleTestHelper.scala From sparkoscope with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 26
Source File: MeanEvaluator.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 27
Source File: MeanEvaluatorSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
Example 28
Source File: SumEvaluatorSuite.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
Example 29
Source File: StreamingTestMethod.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.mllib.stat.test import java.io.Serializable import scala.language.implicitConversions import scala.math.pow import com.twitter.chill.MeatLocker import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues import org.apache.commons.math3.stat.inference.TTest import org.apache.spark.internal.Logging import org.apache.spark.streaming.dstream.DStream import org.apache.spark.util.StatCounter private[stat] object StreamingTestMethod { // Note: after new `StreamingTestMethod`s are implemented, please update this map. private final val TEST_NAME_TO_OBJECT: Map[String, StreamingTestMethod] = Map( "welch" -> WelchTTest, "student" -> StudentTTest) def getTestMethodFromName(method: String): StreamingTestMethod = TEST_NAME_TO_OBJECT.get(method) match { case Some(test) => test case None => throw new IllegalArgumentException( "Unrecognized method name. Supported streaming test methods: " + TEST_NAME_TO_OBJECT.keys.mkString(", ")) } }
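The test methods behind this registry (WelchTTest and StudentTTest) ultimately feed per-group statistics into commons-math3's TTest. A hedged sketch of that idea; the StatCounter-to-StatisticalSummaryValues conversion below is our illustration, not necessarily the exact one Spark uses:

import org.apache.commons.math3.stat.descriptive.StatisticalSummaryValues
import org.apache.commons.math3.stat.inference.TTest
import org.apache.spark.util.StatCounter

object WelchSketch {
  // Summarize a StatCounter in the form commons-math3 expects.
  def toSummary(s: StatCounter): StatisticalSummaryValues =
    new StatisticalSummaryValues(s.mean, s.sampleVariance, s.count, s.max, s.min, s.sum)

  def main(args: Array[String]): Unit = {
    val a = StatCounter(Seq(1.0, 2.0, 3.0, 4.0))
    val b = StatCounter(Seq(2.0, 3.0, 4.0, 5.0))
    // Two-sided p-value of Welch's t-test (unequal variances assumed).
    println(new TTest().tTest(toSummary(a), toSummary(b)))
  }
}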
Example 30
Source File: MeanEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 31
Source File: SumEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{TDistribution, NormalDistribution} import org.apache.spark.util.StatCounter private[spark] class SumEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { var outputsMerged = 0 var counter = new StatCounter override def merge(outputId: Int, taskResult: StatCounter) { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.sum, 1.0, counter.sum, counter.sum) } else if (outputsMerged == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else { val p = outputsMerged.toDouble / totalOutputs val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = { if (counter.count > 100) { new NormalDistribution().inverseCumulativeProbability(1 - (1 - confidence) / 2) } else { val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability(1 - (1 - confidence) / 2) } } val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev new BoundedDouble(sumEstimate, confidence, low, high) } } }
Example 32
Source File: GroupedMeanEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 33
Source File: EnsembleTestHelper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.mllib.tree import scala.collection.mutable import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input).map { case (prediction, point) => point.label - prediction } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }
Example 34
Source File: MeanEvaluator.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.commons.math3.distribution.{NormalDistribution, TDistribution} import org.apache.spark.util.StatCounter private[spark] class MeanEvaluator(totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[StatCounter, BoundedDouble] { private var outputsMerged = 0 private val counter = new StatCounter() override def merge(outputId: Int, taskResult: StatCounter): Unit = { outputsMerged += 1 counter.merge(taskResult) } override def currentResult(): BoundedDouble = { if (outputsMerged == totalOutputs) { new BoundedDouble(counter.mean, 1.0, counter.mean, counter.mean) } else if (outputsMerged == 0 || counter.count == 0) { new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) } else if (counter.count == 1) { new BoundedDouble(counter.mean, confidence, Double.NegativeInfinity, Double.PositiveInfinity) } else { val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = if (counter.count > 100) { // For large n, the normal distribution is a good approximation to t-distribution new NormalDistribution().inverseCumulativeProbability((1 + confidence) / 2) } else { // t-distribution describes distribution of actual population mean // note that if this goes to 0, TDistribution will throw an exception. // Hence special casing 1 above. val degreesOfFreedom = (counter.count - 1).toInt new TDistribution(degreesOfFreedom).inverseCumulativeProbability((1 + confidence) / 2) } // Symmetric, so confidence interval is symmetric about mean of distribution val low = mean - confFactor * stdev val high = mean + confFactor * stdev new BoundedDouble(mean, confidence, low, high) } } }
Example 35
Source File: MeanEvaluatorSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class MeanEvaluatorSuite extends SparkFunSuite { test("test count 0") { val evaluator = new MeanEvaluator(10, 0.95) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0.0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(0.0))) assert(new BoundedDouble(0.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) assert(new BoundedDouble(1.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new MeanEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0))) evaluator.merge(1, new StatCounter(Seq(3.0))) assert(new BoundedDouble(2.0, 0.95, -10.706204736174746, 14.706204736174746) == evaluator.currentResult()) evaluator.merge(1, new StatCounter(Seq(8.0))) assert(new BoundedDouble(4.0, 0.95, -4.9566858949231225, 12.956685894923123) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter(Seq(9.0)))) assert(new BoundedDouble(7.5, 1.0, 7.5, 7.5) == evaluator.currentResult()) } }
Example 36
Source File: SumEvaluatorSuite.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial import org.apache.spark.SparkFunSuite import org.apache.spark.util.StatCounter class SumEvaluatorSuite extends SparkFunSuite { test("correct handling of count 1") { // sanity check: assert(new BoundedDouble(2.0, 0.95, 1.1, 1.2) == new BoundedDouble(2.0, 0.95, 1.1, 1.2)) // count of 10 because it's larger than 1, // and 0.95 because that's the default val evaluator = new SumEvaluator(10, 0.95) // arbitrarily assign id 1 evaluator.merge(1, new StatCounter(Seq(2.0))) assert(new BoundedDouble(20.0, 0.95, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of count 0") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter()) assert(new BoundedDouble(0, 0.0, Double.NegativeInfinity, Double.PositiveInfinity) == evaluator.currentResult()) } test("correct handling of NaN") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1, Double.NaN, 2))) val res = evaluator.currentResult() // assert - note semantics of == in face of NaN assert(res.mean.isNaN) assert(res.confidence == 0.95) assert(res.low == Double.NegativeInfinity) assert(res.high == Double.PositiveInfinity) } test("correct handling of > 1 values") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter(Seq(1.0, 3.0, 2.0))) val res = evaluator.currentResult() assert(new BoundedDouble(60.0, 0.95, -101.7362525347778, 221.7362525347778) == evaluator.currentResult()) } test("test count > 1") { val evaluator = new SumEvaluator(10, 0.95) evaluator.merge(1, new StatCounter().merge(1.0)) evaluator.merge(1, new StatCounter().merge(3.0)) assert(new BoundedDouble(20.0, 0.95, -186.4513905077019, 226.4513905077019) == evaluator.currentResult()) evaluator.merge(1, new StatCounter().merge(8.0)) assert(new BoundedDouble(40.0, 0.95, -72.75723361226733, 152.75723361226733) == evaluator.currentResult()) (4 to 10).foreach(_ => evaluator.merge(1, new StatCounter().merge(9.0))) assert(new BoundedDouble(75.0, 1.0, 75.0, 75.0) == evaluator.currentResult()) } }
Example 37
Source File: EnsembleTestHelper.scala From iolap with Apache License 2.0
package org.apache.spark.mllib.tree import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.mllib.regression.LabeledPoint import org.apache.spark.mllib.tree.model.TreeEnsembleModel import org.apache.spark.util.StatCounter import scala.collection.mutable object EnsembleTestHelper { def validateRegressor( model: TreeEnsembleModel, input: Seq[LabeledPoint], required: Double, metricName: String = "mse") { val predictions = input.map(x => model.predict(x.features)) val errors = predictions.zip(input.map(_.label)).map { case (prediction, label) => prediction - label } val metric = metricName match { case "mse" => errors.map(err => err * err).sum / errors.size case "mae" => errors.map(math.abs).sum / errors.size } assert(metric <= required, s"validateRegressor calculated $metricName $metric but required $required.") } def generateOrderedLabeledPoints(numFeatures: Int, numInstances: Int): Array[LabeledPoint] = { val arr = new Array[LabeledPoint](numInstances) for (i <- 0 until numInstances) { val label = if (i < numInstances / 10) { 0.0 } else if (i < numInstances / 2) { 1.0 } else if (i < numInstances * 0.9) { 0.0 } else { 1.0 } val features = Array.fill[Double](numFeatures)(i.toDouble) arr(i) = new LabeledPoint(label, Vectors.dense(features)) } arr } }