org.apache.spark.util.collection.OpenHashMap Scala Examples
The following examples show how to use org.apache.spark.util.collection.OpenHashMap.
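Before the examples, a short orientation. OpenHashMap is Spark's internal open-addressing hash map (backed by OpenHashSet), designed for fast insertions and updates but no deletions. The sketch below shows the handful of operations the examples rely on — update, apply, changeValue, contains, and iteration. It is a minimal illustration under one assumption worth flagging: the class is package-private to org.apache.spark, so the file is placed in a hypothetical org.apache.spark.demo package, the same trick the third-party examples below use to get access.

package org.apache.spark.demo

import org.apache.spark.util.collection.OpenHashMap

object OpenHashMapDemo {
  def main(args: Array[String]): Unit = {
    val counts = new OpenHashMap[String, Long]()

    // update inserts or overwrites a key.
    counts.update("spark", 1L)

    // changeValue inserts the default when the key is absent,
    // otherwise applies the merge function to the existing value.
    counts.changeValue("spark", 1L, _ + 1L)
    counts.changeValue("scala", 1L, _ + 1L)

    println(counts("spark"))          // 2
    println(counts.contains("flink")) // false

    // OpenHashMap extends Iterable[(K, V)], so foreach, map, size,
    // toMap and friends are available.
    counts.foreach { case (k, v) => println(s"$k -> $v") }
  }
}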
Example 1
Source File: RawTextHelper.scala From iolap with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming] object RawTextHelper {

  // splitAndCountPartitions (the OpenHashMap-based word counter) is defined
  // later in the original file; this excerpt omits it. See the sketch below.

  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
}
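The excerpt above references splitAndCountPartitions but omits its body; in the original RawTextHelper that method is where OpenHashMap actually appears, counting words per partition. The sketch below is a simplified stand-in under that assumption (the real implementation scans character positions instead of using split, to avoid intermediate arrays):

def splitAndCountPartitions(iter: Iterator[String]): Iterator[(String, Long)] = {
  // Accumulate per-word counts for this partition in an OpenHashMap,
  // then hand the (word, count) pairs downstream to reduceByKey.
  val counts = new OpenHashMap[String, Long]()
  iter.foreach { line =>
    line.split(" ").foreach { word =>
      if (word.nonEmpty) {
        counts.changeValue(word, 1L, _ + 1L)
      }
    }
  }
  counts.iterator
}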
Example 2
Source File: GroupedCountEvaluator.scala From BigDatalog with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConverters._
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result.put(key, new BoundedDouble(sum, 1.0, sum, sum))
      }
      result.asScala
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result.put(key, new BoundedDouble(mean, confidence, low, high))
      }
      result.asScala
    }
  }
}
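For orientation, here is a hypothetical driver for the evaluator above. It is only a sketch: GroupedCountEvaluator and BoundedDouble are private[spark], so this would have to compile inside the org.apache.spark.partial package, and in a real job Spark's approximate-job runner performs this wiring, not user code.

// Hypothetical driver: two task outputs, 95% confidence.
val evaluator = new GroupedCountEvaluator[String](totalOutputs = 2, confidence = 0.95)

val task0 = new OpenHashMap[String, Long]()
task0.update("a", 3L)
task0.update("b", 1L)
evaluator.merge(0, task0)

// One of two outputs merged (p = 0.5): counts are extrapolated and each
// key gets a normal-approximation interval.
println(evaluator.currentResult())

val task1 = new OpenHashMap[String, Long]()
task1.update("a", 2L)
evaluator.merge(1, task1)

// All outputs merged: the bounds collapse to the exact totals (a -> 5, b -> 1).
println(evaluator.currentResult())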
Example 3
Source File: RawTextHelper.scala From BigDatalog with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming] object RawTextHelper {

  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
}
Example 4
Source File: RandomForestSuite.scala From BigDatalog with Apache License 2.0
package org.apache.spark.ml.tree.impl

import org.apache.spark.SparkFunSuite
import org.apache.spark.ml.classification.DecisionTreeClassificationModel
import org.apache.spark.ml.impl.TreeTests
import org.apache.spark.ml.tree.{ContinuousSplit, DecisionTreeModel, LeafNode, Node}
import org.apache.spark.mllib.linalg.{Vector, Vectors}
import org.apache.spark.mllib.tree.impurity.GiniCalculator
import org.apache.spark.mllib.util.MLlibTestSparkContext
import org.apache.spark.mllib.util.TestingUtils._
import org.apache.spark.util.collection.OpenHashMap

// Note: the class declaration and the opening of the feature-importance test
// are truncated in this excerpt; the code below is the body of that test.

    val leftImp = new GiniCalculator(Array(3.0, 2.0, 1.0))
    val left = new LeafNode(0.0, leftImp.calculate(), leftImp)

    val rightImp = new GiniCalculator(Array(1.0, 2.0, 5.0))
    val right = new LeafNode(2.0, rightImp.calculate(), rightImp)

    val parent = TreeTests.buildParentNode(left, right, new ContinuousSplit(0, 0.5))
    val parentImp = parent.impurityStats

    val left2Imp = new GiniCalculator(Array(1.0, 6.0, 1.0))
    val left2 = new LeafNode(0.0, left2Imp.calculate(), left2Imp)

    val grandParent = TreeTests.buildParentNode(left2, parent, new ContinuousSplit(1, 1.0))
    val grandImp = grandParent.impurityStats

    // Test feature importance computed at different subtrees.
    def testNode(node: Node, expected: Map[Int, Double]): Unit = {
      val map = new OpenHashMap[Int, Double]()
      RandomForest.computeFeatureImportance(node, map)
      assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
    }

    // Leaf node
    testNode(left, Map.empty[Int, Double])

    // Internal node with 2 leaf children
    val feature0importance = parentImp.calculate() * parentImp.count -
      (leftImp.calculate() * leftImp.count + rightImp.calculate() * rightImp.count)
    testNode(parent, Map(0 -> feature0importance))

    // Full tree
    val feature1importance = grandImp.calculate() * grandImp.count -
      (left2Imp.calculate() * left2Imp.count + parentImp.calculate() * parentImp.count)
    testNode(grandParent, Map(0 -> feature0importance, 1 -> feature1importance))

    // Forest consisting of (full tree) + (internal node with 2 leafs)
    val trees = Array(parent, grandParent).map { root =>
      new DecisionTreeClassificationModel(root, numFeatures = 2, numClasses = 3)
        .asInstanceOf[DecisionTreeModel]
    }
    val importances: Vector = RandomForest.featureImportances(trees, 2)
    val tree2norm = feature0importance + feature1importance
    val expected = Vectors.dense((1.0 + feature0importance / tree2norm) / 2.0,
      (feature1importance / tree2norm) / 2.0)
    assert(importances ~== expected relTol 0.01)
  }

  test("normalizeMapValues") {
    val map = new OpenHashMap[Int, Double]()
    map(0) = 1.0
    map(2) = 2.0
    RandomForest.normalizeMapValues(map)
    val expected = Map(0 -> 1.0 / 3.0, 2 -> 2.0 / 3.0)
    assert(mapToVec(map.toMap) ~== mapToVec(expected) relTol 0.01)
  }
}

private object RandomForestSuite {
  def mapToVec(map: Map[Int, Double]): Vector = {
    val size = (map.keys.toSeq :+ 0).max + 1
    val (indices, values) = map.toSeq.sortBy(_._1).unzip
    Vectors.sparse(size, indices.toArray, values.toArray)
  }
}
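The expected values in this test mirror the definition of feature importance used here: each internal node contributes a gain equal to its impurity times its count, minus the count-weighted impurities of its children, accumulated per feature index. The fragment below only illustrates that accumulation pattern with OpenHashMap; it is not Spark's actual computeFeatureImportance.

// Illustrative accumulation: add each split's impurity gain to the
// running total for its feature, creating the entry on first use.
val importances = new OpenHashMap[Int, Double]()

def recordGain(featureIndex: Int, gain: Double): Unit = {
  importances.changeValue(featureIndex, gain, _ + gain)
}

recordGain(0, 1.5)
recordGain(1, 0.75)
recordGain(0, 0.25)   // feature 0 now maps to 1.75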
Example 5
Source File: GroupedCountEvaluator.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
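Relative to the older evaluators above, the noteworthy change is in the partial-result branch: instead of building a normal-approximation interval inline, this version passes each key's running sum and the merged fraction p to CountEvaluator.bound and converts the OpenHashMap directly into an immutable Map. The merge path, one changeValue per key, is unchanged.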
Example 6
Source File: RawTextHelper.scala From Spark-2.3.1 with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming] object RawTextHelper {

  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
}
Example 7
Source File: GroupedCountEvaluator.scala From spark1.52 with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 8
Source File: RawTextHelper.scala From spark1.52 with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming] object RawTextHelper {

  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
}
Example 9
Source File: GroupedCountEvaluator.scala From iolap with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 10
Source File: RawTextHelper.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming] object RawTextHelper {

  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
}
Example 11
Source File: GroupedCountEvaluator.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 12
Source File: RawTextHelper.scala From multi-tenancy-spark with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming] object RawTextHelper {

  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
}
Example 13
Source File: StringToShortIndexer.scala From spark-ext with Apache License 2.0
package org.apache.spark.ml.feature

import org.apache.spark.SparkException
import org.apache.spark.ml.{Estimator, Model}
import org.apache.spark.ml.attribute.NominalAttribute
import org.apache.spark.ml.param._
import org.apache.spark.ml.util.Identifiable
import org.apache.spark.sql.DataFrame
import org.apache.spark.sql.functions._
import org.apache.spark.sql.types._
import org.apache.spark.util.collection.OpenHashMap

class StringToShortIndexer(override val uid: String) extends Estimator[StringToShortIndexerModel]
  with StringIndexerBase {

  def this() = this(Identifiable.randomUID("strShortIdx"))

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def fit(dataset: DataFrame): StringToShortIndexerModel = {
    val counts = dataset.select(col($(inputCol)).cast(StringType))
      .map(_.getString(0))
      .countByValue()
    val labels = counts.toSeq.sortBy(-_._2).map(_._1).toArray
    require(labels.length <= Short.MaxValue,
      s"Unique labels count (${labels.length}) should be less than Short.MaxValue (${Short.MaxValue})")
    copyValues(new StringToShortIndexerModel(uid, labels).setParent(this))
  }

  override def transformSchema(schema: StructType): StructType = {
    validateAndTransformSchema(schema)
  }

  override def copy(extra: ParamMap): StringToShortIndexer = defaultCopy(extra)
}

class StringToShortIndexerModel (
    override val uid: String,
    val labels: Array[String]) extends Model[StringToShortIndexerModel] with StringIndexerBase {

  def this(labels: Array[String]) = this(Identifiable.randomUID("strIdx"), labels)

  require(labels.length <= Short.MaxValue,
    s"Unique labels count (${labels.length}) should be less than Short.MaxValue (${Short.MaxValue})")

  private val labelToIndex: OpenHashMap[String, Short] = {
    val n = labels.length.toShort
    val map = new OpenHashMap[String, Short](n)
    var i: Short = 0
    while (i < n) {
      map.update(labels(i), i)
      i = (i + 1).toShort
    }
    map
  }

  def setInputCol(value: String): this.type = set(inputCol, value)

  def setOutputCol(value: String): this.type = set(outputCol, value)

  override def transform(dataset: DataFrame): DataFrame = {
    if (!dataset.schema.fieldNames.contains($(inputCol))) {
      logInfo(s"Input column ${$(inputCol)} does not exist during transformation. " +
        "Skip StringToShortIndexerModel.")
      return dataset
    }

    val indexer = udf { label: String =>
      if (labelToIndex.contains(label)) {
        labelToIndex(label)
      } else {
        // TODO: handle unseen labels
        throw new SparkException(s"Unseen label: $label.")
      }
    }
    val outputColName = $(outputCol)
    val metadata = NominalAttribute.defaultAttr
      .withName(outputColName).withValues(labels).toMetadata()
    dataset.select(col("*"),
      indexer(dataset($(inputCol)).cast(StringType)).as(outputColName, metadata))
  }

  override def transformSchema(schema: StructType): StructType = {
    if (schema.fieldNames.contains($(inputCol))) {
      validateAndTransformSchema(schema)
    } else {
      // If the input column does not exist during transformation, we skip StringToShortIndexerModel.
      schema
    }
  }

  override def copy(extra: ParamMap): StringToShortIndexerModel = {
    val copied = new StringToShortIndexerModel(uid, labels)
    copyValues(copied, extra).setParent(parent)
  }
}
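A hypothetical usage sketch for the indexer above; the sqlContext, the toy DataFrame, and the column names are assumptions for illustration, not part of the source.

// Index a string column into Short codes, most frequent label first.
val df = sqlContext.createDataFrame(Seq(
  (0, "a"), (1, "b"), (2, "a"), (3, "c")
)).toDF("id", "category")

val indexer = new StringToShortIndexer()
  .setInputCol("category")
  .setOutputCol("categoryIndex")

val model = indexer.fit(df)        // labels ordered by descending frequency
val indexed = model.transform(df)  // adds a Short-valued "categoryIndex" column with nominal metadata
indexed.show()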
Example 14
Source File: GroupedCountEvaluator.scala From SparkCore with Apache License 2.0
package org.apache.spark.partial

import java.util.{HashMap => JHashMap}

import scala.collection.JavaConversions.mapAsScalaMap
import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.commons.math3.distribution.NormalDistribution

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  var outputsMerged = 0
  var sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        result(key) = new BoundedDouble(sum, 1.0, sum, sum)
      }
      result
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      val confFactor = new NormalDistribution().
        inverseCumulativeProbability(1 - (1 - confidence) / 2)
      val result = new JHashMap[T, BoundedDouble](sums.size)
      sums.foreach { case (key, sum) =>
        val mean = (sum + 1 - p) / p
        val variance = (sum + 1) * (1 - p) / (p * p)
        val stdev = math.sqrt(variance)
        val low = mean - confFactor * stdev
        val high = mean + confFactor * stdev
        result(key) = new BoundedDouble(mean, confidence, low, high)
      }
      result
    }
  }
}
Example 15
Source File: GroupedCountEvaluator.scala From sparkoscope with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 16
Source File: RawTextHelper.scala From sparkoscope with Apache License 2.0
package org.apache.spark.streaming.util

import org.apache.spark.SparkContext
import org.apache.spark.util.collection.OpenHashMap

private[streaming] object RawTextHelper {

  def warmUp(sc: SparkContext) {
    for (i <- 0 to 1) {
      sc.parallelize(1 to 200000, 1000)
        .map(_ % 1331).map(_.toString)
        .mapPartitions(splitAndCountPartitions).reduceByKey(_ + _, 10)
        .count()
    }
  }

  def add(v1: Long, v2: Long): Long = {
    v1 + v2
  }

  def subtract(v1: Long, v2: Long): Long = {
    v1 - v2
  }

  def max(v1: Long, v2: Long): Long = math.max(v1, v2)
}
Example 17
Source File: GroupedCountEvaluator.scala From drizzle-spark with Apache License 2.0
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]()   // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}