scala.collection.mutable.HashSet Scala Examples
The following examples show how to use scala.collection.mutable.HashSet.
Each example is excerpted from an open-source project; the source file, originating project, and license are noted above the code.
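As a warm-up, here is a minimal, self-contained sketch (not drawn from any of the projects below) of the basic mutable HashSet operations the examples rely on: creation, adding and removing elements, membership tests, and conversion to an immutable Set.

import scala.collection.mutable.HashSet

object HashSetBasics {
  def main(args: Array[String]): Unit = {
    val seen = new HashSet[String]()      // empty mutable set
    seen += "a"                           // add a single element
    seen ++= Seq("b", "c", "b")           // bulk add; duplicates are ignored
    seen -= "c"                           // remove an element

    println(seen.contains("a"))           // true: membership test
    println(seen.size)                    // 2: "a" and "b"
    println(seen.toSet)                   // snapshot as an immutable Set
  }
}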
Example 1
Source File: package.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import scala.collection.mutable.HashSet

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.{Accumulator, AccumulatorParam, Logging}

  case class ColumnMetrics(
    elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty))

  val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0)

  val numColumns: Int = child.output.size
  val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

  def dumpStats(): Unit = {
    logDebug(s"== ${child.simpleString} ==")
    logDebug(s"Tuples output: ${tupleCount.value}")
    child.output.zip(columnStats).foreach { case (attr, metric) =>
      val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}")
      logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
    }
  }

  protected override def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      new Iterator[InternalRow] {
        def hasNext: Boolean = iter.hasNext
        def next(): InternalRow = {
          val currentRow = iter.next()
          tupleCount += 1
          var i = 0
          while (i < numColumns) {
            val value = currentRow.get(i, output(i).dataType)
            if (value != null) {
              columnStats(i).elementTypes += HashSet(value.getClass.getName)
            }
            i += 1
          }
          currentRow
        }
      }
    }
  }
}
}
Example 2
Source File: Stage.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.CallSite

  def newAttemptId(): Int = {
    val id = nextAttemptId
    nextAttemptId += 1
    id
  }

  def attemptId: Int = nextAttemptId

  override final def hashCode(): Int = id

  override final def equals(other: Any): Boolean = other match {
    case stage: Stage => stage != null && stage.id == id
    case _ => false
  }
}
Example 3
Source File: FeatureSelection.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training import java.io.BufferedWriter import java.io.OutputStreamWriter import java.util import com.airbnb.aerosolve.core.{ModelRecord, ModelHeader, FeatureVector, Example} import com.airbnb.aerosolve.core.models.LinearModel import com.airbnb.aerosolve.core.util.Util import com.typesafe.config.Config import org.slf4j.{LoggerFactory, Logger} import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Buffer import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.util.Random import scala.math.abs import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path object FeatureSelection { private final val log: Logger = LoggerFactory.getLogger("FeatureSelection") val allKey : (String, String) = ("$ALL", "$POS") // Given a RDD compute the pointwise mutual information between // the positive label and the discrete features. def pointwiseMutualInformation(examples : RDD[Example], config : Config, key : String, rankKey : String, posThreshold : Double, minPosCount : Double, newCrosses : Boolean) : RDD[((String, String), Double)] = { val pointwise = LinearRankerUtils.makePointwise(examples, config, key, rankKey) val features = pointwise .mapPartitions(part => { // The tuple2 is var, var | positive val output = scala.collection.mutable.HashMap[(String, String), (Double, Double)]() part.foreach(example =>{ val featureVector = example.example.get(0) val isPos = if (featureVector.floatFeatures.get(rankKey).asScala.head._2 > posThreshold) 1.0 else 0.0 val all : (Double, Double) = output.getOrElse(allKey, (0.0, 0.0)) output.put(allKey, (all._1 + 1.0, all._2 + 1.0 * isPos)) val features : Array[(String, String)] = LinearRankerUtils.getFeatures(featureVector) if (newCrosses) { for (i <- features) { for (j <- features) { if (i._1 < j._1) { val key = ("%s<NEW>%s".format(i._1, j._1), "%s<NEW>%s".format(i._2, j._2)) val x = output.getOrElse(key, (0.0, 0.0)) output.put(key, (x._1 + 1.0, x._2 + 1.0 * isPos)) } } } } for (feature <- features) { val x = output.getOrElse(feature, (0.0, 0.0)) output.put(feature, (x._1 + 1.0, x._2 + 1.0 * isPos)) } }) output.iterator }) .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2)) .filter(x => x._2._2 >= minPosCount) val allCount = features.filter(x => x._1.equals(allKey)).take(1).head features.map(x => { val prob = x._2._1 / allCount._2._1 val probPos = x._2._2 / allCount._2._2 (x._1, math.log(probPos / prob) / math.log(2.0)) }) } // Returns the maximum entropy per family def maxEntropy(input : RDD[((String, String), Double)]) : RDD[((String, String), Double)] = { input .map(x => (x._1._1, (x._1._2, x._2))) .reduceByKey((a, b) => if (math.abs(a._2) > math.abs(b._2)) a else b) .map(x => ((x._1, x._2._1), x._2._2)) } }
Example 4
Source File: package.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 5
Source File: JobSet.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import scala.util.Failure import org.apache.spark.streaming.Time import org.apache.spark.util.Utils private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() //当这jobset提交 private val submissionTime = System.currentTimeMillis() // when this jobset was submitted //当这jobset第一份工作开始处理 private var processingStartTime = -1L // when the first job of this jobset started processing //当这jobset最后的工作处理完 private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing //从他们开始处理的时间来处理所有的工作 // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted //从提交的时间来处理所有的工作时间 // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = { processingEndTime - time.milliseconds } def toBatchInfo: BatchInfo = { val failureReasons: Map[Int, String] = { if (hasCompleted) { jobs.filter(_.result.isFailure).map { job => (job.outputOpId, Utils.exceptionString(job.result.asInstanceOf[Failure[_]].exception)) }.toMap } else { Map.empty } } val binfo = new BatchInfo( time, streamIdToInputInfo, submissionTime, if (processingStartTime >= 0) Some(processingStartTime) else None, if (processingEndTime >= 0) Some(processingEndTime) else None ) binfo.setFailureReason(failureReasons) binfo.setNumOutputOp(jobs.size) binfo } }
Example 6
Source File: LocalKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers(i) val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData val points = new HashSet[Vector[Double]] val kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println(s"Initial centers: $kPoints") while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val mappings = closest.groupBy[Int] (x => x._1) val pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints(mapping._1), mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println(s"Final centers: $kPoints") } } // scalastyle:on println
Example 7
Source File: LocalityPlacementStrategySuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, HashSet, Set} import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.mockito.Mockito._ import org.apache.spark.{SparkConf, SparkFunSuite} class LocalityPlacementStrategySuite extends SparkFunSuite { test("handle large number of containers and tasks (SPARK-18750)") { // Run the test in a thread with a small stack size, since the original issue // surfaced as a StackOverflowError. var error: Throwable = null val runnable = new Runnable() { override def run(): Unit = try { runTest() } catch { case e: Throwable => error = e } } val thread = new Thread(new ThreadGroup("test"), runnable, "test-thread", 32 * 1024) thread.start() thread.join() assert(error === null) } private def runTest(): Unit = { val yarnConf = new YarnConfiguration() // The numbers below have been chosen to balance being large enough to replicate the // original issue while not taking too long to run when the issue is fixed. The main // goal is to create enough requests for localized containers (so there should be many // tasks on several hosts that have no allocated containers). val resource = Resource.newInstance(8 * 1024, 4) val strategy = new LocalityPreferredContainerPlacementStrategy(new SparkConf(), yarnConf, resource, new MockResolver()) val totalTasks = 32 * 1024 val totalContainers = totalTasks / 16 val totalHosts = totalContainers / 16 val mockId = mock(classOf[ContainerId]) val hosts = (1 to totalHosts).map { i => (s"host_$i", totalTasks % i) }.toMap val containers = (1 to totalContainers).map { i => mockId } val count = containers.size / hosts.size / 2 val hostToContainerMap = new HashMap[String, Set[ContainerId]]() hosts.keys.take(hosts.size / 2).zipWithIndex.foreach { case (host, i) => val hostContainers = new HashSet[ContainerId]() containers.drop(count * i).take(i).foreach { c => hostContainers += c } hostToContainerMap(host) = hostContainers } strategy.localityOfRequestedContainers(containers.size * 2, totalTasks, hosts, hostToContainerMap, Nil) } }
Example 8
Source File: JobSet.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0
  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
Example 9
Source File: LocalKMeans.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 10
Source File: CachedRDDManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.recursion import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable.{HashMap, HashSet, Set} class CachedRDDManager(defaultStorageLevel: StorageLevel) extends Logging with Serializable { val iterationToRDDMap = new HashMap[Int, HashSet[RDD[_]]] var currentIteration : Int = 0 def persist(rdd: RDD[_]): Unit = { persist(rdd, false) } def persist(rdd: RDD[_], doMemoryCheckpoint: Boolean): Unit = { iterationToRDDMap.getOrElseUpdate(currentIteration, new HashSet[RDD[_]]).add(rdd) rdd.persist(defaultStorageLevel) if (doMemoryCheckpoint) rdd.memoryCheckpoint() } def cleanUpIteration(iterationsBackToRemove: Int = 2) = { val start = System.currentTimeMillis() if (currentIteration >= iterationsBackToRemove) { val iterationId = currentIteration - iterationsBackToRemove if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get if (rdds.nonEmpty) logInfo("Unpersisting "+rdds.size+" rdds for iteration " + iterationId) rdds.foreach(rdd => rdd.unpersist(false)) } } logInfo("CleanUpIteration took " + (System.currentTimeMillis() - start) + " ms") currentIteration += 1 } def cleanUpIterationById(iterationId: Int) = { if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get rdds.foreach(rdd => rdd.unpersist(false)) } } def incrementIteration() { currentIteration += 1} def clear() = { iterationToRDDMap.clear() } def clear(remainCached: Seq[RDD[_]]) = { iterationToRDDMap.keySet.foreach(key => logInfo("key: " + key + " value: " + iterationToRDDMap.get(key))) iterationToRDDMap.keySet .foreach(key => iterationToRDDMap.get(key) .foreach(value => value.foreach(item => {if (!remainCached.contains(item)) item.unpersist(false)}))) iterationToRDDMap.clear() } def unpersist(rdds: Set[RDD[_]]) = { for (rdd <- rdds) { iterationToRDDMap.synchronized { // rdd should only be in 1 iteration val iterations = iterationToRDDMap.filter(x => x._2.contains(rdd)) if (iterations.nonEmpty) { val iteration = iterations.head iteration._2.remove(rdd) rdd.unpersist(false) if (iteration._2.isEmpty) iterationToRDDMap.remove(iteration._1) } } } } override def toString = { val output = new StringBuilder iterationToRDDMap.keySet.toSeq.sorted .foreach(iteration => { val rdds = iterationToRDDMap.get(iteration) rdds.foreach(rdd => output.append(iteration + ":" + rdd + "\n")) }) output.toString() } }
Example 11
Source File: JobSet.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import org.apache.spark.streaming.Time private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToNumRecords: Map[Int, Long] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = { processingEndTime - time.milliseconds } def toBatchInfo: BatchInfo = { new BatchInfo( time, streamIdToNumRecords, submissionTime, if (processingStartTime >= 0 ) Some(processingStartTime) else None, if (processingEndTime >= 0 ) Some(processingEndTime) else None ) } }
Example 12
Source File: JobSet.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import scala.util.Failure import org.apache.spark.streaming.Time import org.apache.spark.util.Utils private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = { processingEndTime - time.milliseconds } def toBatchInfo: BatchInfo = { BatchInfo( time, streamIdToInputInfo, submissionTime, if (processingStartTime >= 0) Some(processingStartTime) else None, if (processingEndTime >= 0) Some(processingEndTime) else None, jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap ) } }
Example 13
Source File: FixedPointJobDefinition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.fixedpoint

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable.{HashSet, HashMap, Set}

class FixedPointJobDefinition(val setupIteration: (FixedPointJobDefinition, RDD[_]) => RDD[_],
                              val cleanupIteration: (Int) => Unit) {

  var _fixedPointEvaluator: (TaskContext, Iterator[_]) => Boolean = null
  var finalRDD: RDD[_] = null
  var rddIds = Array.empty[Int] // for all and delta rdd id for FixedPointResultTask execution on worker

  def fixedPointEvaluator(fixedPointEvaluator: (TaskContext, Iterator[_]) => Boolean) = {
    _fixedPointEvaluator = fixedPointEvaluator
  }

  def getfixedPointEvaluator = _fixedPointEvaluator.asInstanceOf[(TaskContext, Iterator[_]) => _]

  def getFinalRDD: RDD[_] = finalRDD

  def setRDDIds(newAllRDDId: Int,
                oldAllRDDId: Int,
                newDeltaPrimeRDDId: Int,
                oldDeltaPrimeRDDId: Int): Unit = {
    rddIds = Array(newAllRDDId, oldAllRDDId, newDeltaPrimeRDDId, oldDeltaPrimeRDDId)
  }
}
Example 14
Source File: LocalKMeans.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData = { def generatePoint(i: Int) = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 15
Source File: HogHBaseReputation.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
package org.hogzilla.hbase import scala.math.random import java.lang.Math import org.apache.spark._ import org.apache.hadoop.hbase.client.HBaseAdmin import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName} import org.apache.hadoop.hbase.mapreduce.TableInputFormat import org.apache.spark.mllib.regression.{LabeledPoint,LinearRegressionModel,LinearRegressionWithSGD} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.hadoop.hbase.client.HTable import org.apache.hadoop.hbase.filter.SingleColumnValueFilter import org.apache.hadoop.hbase.filter.BinaryComparator import org.apache.hadoop.hbase.filter.FilterList import org.apache.hadoop.hbase.filter.CompareFilter import java.util.ArrayList import org.apache.hadoop.hbase.client.Scan import org.apache.hadoop.hbase.filter.Filter import scala.collection.mutable.HashSet import org.apache.hadoop.hbase.client.Put object HogHBaseReputation { // Ex: MX, whitelist def getReputationList(listName:String, listType:String):Set[String] = { val list = new HashSet[String] val filters: ArrayList[Filter] = new ArrayList(); val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType))) colValFilter1.setFilterIfMissing(false); val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName))) colValFilter2.setFilterIfMissing(false); filters.add(colValFilter1); filters.add(colValFilter2); val filterList = new FilterList( FilterList.Operator.MUST_PASS_ALL, filters); val scan = new Scan() scan.setFilter(filterList) val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator() while(it.hasNext()) { list.add( Bytes.toString(it.next().getValue(Bytes.toBytes("rep"),Bytes.toBytes("ip"))) ) } list.toSet } def saveReputationList(listName:String, listType:String, ip:String) = { val put = new Put(Bytes.toBytes(ip+"-"+listName+"-"+listType)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip)) HogHBaseRDD.hogzilla_reputation.put(put) } }
Example 16
Source File: HogConfig.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
package org.hogzilla.util

import java.security.MessageDigest
import org.apache.hadoop.hbase.util.Bytes
import javax.xml.bind.DatatypeConverter
import math._
import com.typesafe.config.Config
import scala.collection.mutable.HashSet

object HogConfig {

  def get(config: Config, key: String, valueType: String, default: Any): Any = {
    if (config == null)
      return default

    try {
      val value = config.getString(key)

      if (value.isEmpty())
        return default // Return default value

      println(f"Configuration: $key => $value")

      if (valueType.equals("Int"))
        value.toInt
      else if (valueType.equals("Double"))
        value.toDouble
      else if (valueType.equals("Long"))
        value.toLong
      else if (valueType.equals("Set(Int)")) {
        val patternSet = "Set\\(".r
        val patternSetEnd = "\\)".r

        if (value.equals("Set()"))
          return Set()

        return (patternSetEnd replaceAllIn ((patternSet replaceAllIn (value, "")), ""))
          .split(",").map({ x => x.toInt }).toSet
      } else if (valueType.equals("Set(String)")) {
        val patternSet = "Set\\(".r
        val patternSetEnd = "\\)".r

        if (value.equals("Set()"))
          return Set()

        return (patternSetEnd replaceAllIn ((patternSet replaceAllIn (value, "")), ""))
          .split(",").map({ x => println(x.toString.trim()); x.toString.trim() }).toSet
      } else
        default // Create type first
    } catch {
      case t: Throwable =>
        t.printStackTrace()
        println(f"Problem parsing $key . Check if it is ok. Using default value")
        return default
    }
  }

  def getInt(config: Config, key: String, default: Any): Int = {
    get(config, key, "Int", default).asInstanceOf[Int]
  }

  def getLong(config: Config, key: String, default: Any): Long = {
    get(config, key, "Long", default).asInstanceOf[Long]
  }

  def getDouble(config: Config, key: String, default: Any): Double = {
    // Fixed: the original cast to Long, which contradicts the declared Double return type
    get(config, key, "Double", default).asInstanceOf[Double]
  }

  def getSetInt(config: Config, key: String, default: Any): Set[Int] = {
    get(config, key, "Set(Int)", default).asInstanceOf[Set[Int]]
  }

  def getSetString(config: Config, key: String, default: Any): Set[String] = {
    get(config, key, "Set(String)", default).asInstanceOf[Set[String]]
  }
}
Example 17
Source File: StopwordDict.scala From berkeley-doc-summarizer with GNU General Public License v3.0 | 5 votes |
package edu.berkeley.nlp.summ.data import scala.collection.mutable.HashSet object StopwordDict { // N.B. This set was extracted from the RST treebank (train and test) mostly to reproduce // Hirao's results; it shouldn't really be used for other things val stopwords = Set("!", "", "#", "$", "%", "&", "'", "''", "'S", "'s", "()", ",", "-", "--", "-owned", ".", "", ":", ";", "<", "?", "", "A", "A.", "", "AND", "After", "All", "Am", "An", "And", "Any", "As", "At", "BE", "Between", "Both", "But", "By", "Each", "Few", "For", "From", "Had", "He", "Here", "How", "I", "If", "In", "Is", "It", "Its", "MORE", "More", "Most", "NO", "No", "No.", "Not", "OF", "Of", "On", "One", "Only", "Or", "Other", "Our", "Over", "She", "So", "Some", "Such", "THE", "Than", "That", "The", "Their", "Then", "There", "These", "They", "Those", "To", "UPS", "Under", "Until", "WHY", "We", "What", "When", "While", "Why", "Would", "You", "`It", "``", "a", "about", "above", "after", "again", "again.", "", "against", "all", "am", "an", "and", "any", "as", "at", "be", "been", "being", "below", "between", "both", "but", "by", "ca", "can", "could", "did", "do", "doing", "down", "each", "few", "for", "from", "further", "had", "have", "having", "he", "her", "here", "herself", "him", "him.", "", "himself", "how", "if", "in", "into", "is", "it", "its", "itself", "let", "lets", "me", "more", "most", "must", "my", "n't", "no", "nor", "not", "of", "off", "on", "one", "ones", "only", "or", "other", "others", "ought", "our", "out", "over", "own", "owned", "owns", "same", "she", "should", "so", "some", "such", "than", "that", "the", "their", "them", "then", "there", "these", "they", "those", "through", "to", "too", "under", "until", "up", "very", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "with", "wo", "would", "you", "your", "yourself", "{", "}") // Leave $ in there val stopwordTags = new HashSet[String] ++ Array("CC", "DT", "EX", "IN", "LS", "MD", "PDT", "POS", "PRN", "PRP", "PRP$", "RP", "SYM", "TO", "WDT", "WP", "WP$", "WRB", ".", ",", "``", "''", ";", ":", "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-") }
Example 18
Source File: DepParseDoc.scala From berkeley-doc-summarizer with GNU General Public License v3.0 | 5 votes |
package edu.berkeley.nlp.summ.data

import scala.collection.mutable.HashSet
import scala.collection.mutable.ArrayBuffer

trait DepParseDoc extends Serializable {

  def name: String
  def doc: Seq[DepParse]
  def summary: Seq[DepParse]

  override def toString() = {
    toString(Int.MaxValue)
  }

  def toString(maxNumSentences: Int) = {
    "DOCUMENT:\n" + doc.map(_.getWords.reduce(_ + " " + _)).slice(0, Math.min(maxNumSentences, doc.size)).reduce(_ + "\n" + _) +
      "\nSUMMARY:\n" + summary.map(_.getWords.reduce(_ + " " + _)).slice(0, Math.min(maxNumSentences, doc.size)).reduce(_ + "\n" + _)
  }
}
Example 19
package com.astrolabsoftware.spark3d.spatialOperator

import com.astrolabsoftware.spark3d.geometryObjects.Point3D
import com.astrolabsoftware.spark3d.geometryObjects.Shape3D.Shape3D
import com.astrolabsoftware.spark3d.utils.GeometryObjectComparator
import com.astrolabsoftware.spark3d.utils.Utils.takeOrdered
import com.astrolabsoftware.spark3d.spatialPartitioning._

import org.apache.spark.rdd.RDD

import scala.collection.mutable.{HashSet, ListBuffer}
import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import scala.util.control.Breaks._

object KNN {

  def KNNStandard[T <: Shape3D: ClassTag](
      rdd: RDD[T],
      queryObject: T,
      k: Int,
      unique: Boolean = false): List[T] = {
    val knn = takeOrdered[T](rdd, k, queryObject, unique)(
      new GeometryObjectComparator[T](queryObject.center)
    )
    knn.toList
  }
}
Example 20
Source File: Rule.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.parser import jigg.nlp.ccg.lexicon.{Category, Derivation, Point, UnaryChildPoint, BinaryChildrenPoints, AppliedRule} import scala.collection.mutable.{HashMap, HashSet} import java.io.{ObjectOutputStream, ObjectInputStream} trait Rule { def unify(left:Category, right:Category): Option[Array[(Category, String)]] def raise(child:Category): Option[Array[(Category, String)]] def headFinder:HeadFinder } // rules are restricted to CFG rules extracted from the training CCGBank case class CFGRule(val binaryRules:Map[(Int,Int), Array[(Category, String)]], // category ids -> (category, ruleType) val unaryRules:Map[Int, Array[(Category, String)]], override val headFinder:HeadFinder) extends Rule { def unify(left:Category, right:Category):Option[Array[(Category, String)]] = binaryRules.get((left.id, right.id)) def raise(child:Category):Option[Array[(Category, String)]] = unaryRules.get(child.id) } object CFGRule { def extractRulesFromDerivations(derivations: Array[Derivation], headFinder:HeadFinder): CFGRule = { val binaryRules = new HashMap[(Int, Int), HashSet[(Category, String)]] val unaryRules = new HashMap[Int, HashSet[(Category, String)]] derivations.foreach { deriv => deriv.foreachPoint({ point:Point => deriv.get(point) match { case Some(AppliedRule(UnaryChildPoint(child), ruleType)) => val parents = unaryRules.getOrElseUpdate(child.category.id, new HashSet[(Category, String)]) parents += ((point.category, ruleType)) case Some(AppliedRule(BinaryChildrenPoints(left, right), ruleType)) => val parents = binaryRules.getOrElseUpdate((left.category.id, right.category.id), new HashSet[(Category, String)]) parents += ((point.category, ruleType)) case _ => }}) } new CFGRule(binaryRules.map { case (k, v) => k -> v.toArray }.toMap, unaryRules.map { case (k, v) => k -> v.toArray }.toMap, headFinder) } }
Example 21
Source File: BytecodeUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.util

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import scala.collection.mutable.HashSet
import scala.language.existentials

import org.apache.xbean.asm5.{ClassReader, ClassVisitor, MethodVisitor}
import org.apache.xbean.asm5.Opcodes._

import org.apache.spark.util.Utils

  private class MethodInvocationFinder(className: String, methodName: String)
    extends ClassVisitor(ASM5) {

    val methodsInvoked = new HashSet[(Class[_], String)]

    override def visitMethod(access: Int, name: String, desc: String,
                             sig: String, exceptions: Array[String]): MethodVisitor = {
      if (name == methodName) {
        new MethodVisitor(ASM5) {
          override def visitMethodInsn(
              op: Int, owner: String, name: String, desc: String, itf: Boolean) {
            if (op == INVOKEVIRTUAL || op == INVOKESPECIAL || op == INVOKESTATIC) {
              if (!skipClass(owner)) {
                methodsInvoked.add((Utils.classForName(owner.replace("/", ".")), name))
              }
            }
          }
        }
      } else {
        null
      }
    }
  }
}
Example 22
Source File: JobSet.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import org.apache.spark.streaming.Time private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = processingEndTime - time.milliseconds def toBatchInfo: BatchInfo = { BatchInfo( time, streamIdToInputInfo, submissionTime, if (hasStarted) Some(processingStartTime) else None, if (hasCompleted) Some(processingEndTime) else None, jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap ) } }
Example 23
Source File: TaskDescription.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import scala.collection.mutable import scala.collection.mutable.HashSet import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.serializer.SerializerInstance import org.apache.spark.util.SerializableBuffer private[spark] class TaskDescription( val taskId: Long, val attemptNumber: Int, val executorId: String, val name: String, val index: Int, // Index within this task's TaskSet val isFutureTask: Boolean, @transient private val _task: Task[_], @transient private val _addedFiles: mutable.Map[String, Long], @transient private val _addedJars: mutable.Map[String, Long], @transient private val _ser: SerializerInstance) extends Serializable with Logging { // Because ByteBuffers are not serializable, wrap the task in a SerializableBuffer private var buffer: SerializableBuffer = _ def prepareSerializedTask(): Unit = { if (_task != null) { val serializedTask: ByteBuffer = try { Task.serializeWithDependencies(_task, _addedFiles, _addedJars, _ser) } catch { // If the task cannot be serialized, then there is not point in re-attempting // the task as it will always fail. So just abort the task set. case NonFatal(e) => val msg = s"Failed to serialize the task $taskId, not attempting to retry it." logError(msg, e) // FIXME(shivaram): We dont have a handle to the taskSet here to abort it. throw new TaskNotSerializableException(e) } if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024) { logWarning(s"Stage ${_task.stageId} contains a task of very large size " + s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " + s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.") } buffer = new SerializableBuffer(serializedTask) } else { buffer = new SerializableBuffer(ByteBuffer.allocate(0)) } } def serializedTask: ByteBuffer = buffer.value override def toString: String = "TaskDescription(TID=%d, index=%d)".format(taskId, index) }
Example 24
Source File: FutureTaskWaiter.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashSet import org.apache.spark.internal.Logging import org.apache.spark.MapOutputTracker import org.apache.spark.SparkConf import org.apache.spark.storage.BlockManager import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.TimeStampedHashMap private[spark] case class FutureTaskInfo(shuffleId: Int, numMaps: Int, reduceId: Int, taskId: Long, nonZeroPartitions: Option[Array[Int]], taskCb: () => Unit) private[spark] class FutureTaskWaiter( conf: SparkConf, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends Logging { // Key is (shuffleId, reduceId) private val futureTaskInfo = new TimeStampedHashMap[(Int, Int), FutureTaskInfo] // Key is (shuffleId, reduceId), value is the set of blockIds we are waiting for private val futureTasksBlockWait = new TimeStampedHashMap[(Int, Int), HashSet[Int]] def submitFutureTask(info: FutureTaskInfo) { futureTasksBlockWait.synchronized { val blocksToWaitFor = if (info.nonZeroPartitions.isDefined) { info.nonZeroPartitions.get.toSet } else { (0 until info.numMaps).toArray.toSet } // Check if all the blocks already exist. If so just trigger taskCb // Count how many outputs have been registered with the MapOutputTracker for this shuffle // and intersect with blocksToWaitFor to only get how many for this reduce are available val availableBlocks = mapOutputTracker.getAvailableMapOutputs(info.shuffleId).intersect(blocksToWaitFor) val mapsToWait = blocksToWaitFor.size val numMapsPending = blocksToWaitFor.size - availableBlocks.size if (availableBlocks.size >= mapsToWait) { info.taskCb() } else { futureTaskInfo.put((info.shuffleId, info.reduceId), info) // NOTE: Its fine not to synchronize here as two future tasks shouldn't be submitted at the // same time Calculate the number of blocks to wait for before starting future task val waitForBlocks = blocksToWaitFor.diff(availableBlocks) futureTasksBlockWait.put( (info.shuffleId, info.reduceId), new HashSet[Int]() ++ waitForBlocks) } } } def shuffleBlockReady(shuffleBlockId: ShuffleBlockId): Unit = { val key = (shuffleBlockId.shuffleId, shuffleBlockId.reduceId) futureTasksBlockWait.synchronized { if (futureTaskInfo.contains(key)) { if (futureTasksBlockWait.contains(key)) { futureTasksBlockWait(key) -= shuffleBlockId.mapId // If we have all the blocks, run the CB if (futureTasksBlockWait(key).size <= 0) { val cb = futureTaskInfo(key).taskCb futureTasksBlockWait.remove(key) futureTaskInfo.remove(key) cb() } } } } } def addMapStatusAvailable(shuffleId: Int, mapId: Int, numReduces: Int, mapStatus: MapStatus) { // NOTE: This should be done before we trigger future tasks. mapOutputTracker.addStatus(shuffleId, mapId, mapStatus) futureTasksBlockWait.synchronized { // Register the output for each reduce task. (0 until numReduces).foreach { reduceId => shuffleBlockReady(new ShuffleBlockId(shuffleId, mapId, reduceId)) } } } }
Example 25
Source File: SurfaceFormDictionary.scala From dbpedia-spotlight-model with Apache License 2.0 | 5 votes |
package org.dbpedia.spotlight.spot.opennlp import java.io.File import org.dbpedia.spotlight.util.bloomfilter.LongFastBloomFilter import scala.collection.mutable.HashSet object SurfaceFormDictionary { def fromIterator(entries: scala.collection.Iterator[String], surfaceformDictionary: SurfaceFormDictionary = new ExactSurfaceFormDictionary()) : SurfaceFormDictionary = { entries.foreach(line => surfaceformDictionary.add(line)) surfaceformDictionary } } object ProbabilisticSurfaceFormDictionary { def fromFile(dictionaryFile: File, caseSensitive: Boolean = true) : SurfaceFormDictionary = { SurfaceFormDictionary.fromIterator(io.Source.fromFile(dictionaryFile).getLines(), new ProbabilisticSurfaceFormDictionary(io.Source.fromFile(dictionaryFile).size, caseSensitive)) } } object ExactSurfaceFormDictionary { def fromFile(dictionaryFile: File, caseSensitive: Boolean = true) : SurfaceFormDictionary = { SurfaceFormDictionary.fromIterator(io.Source.fromFile(dictionaryFile).getLines(), new ExactSurfaceFormDictionary(caseSensitive)) } }
Example 26
Source File: Flows.scala From spatial with MIT License | 5 votes |
package argon

import scala.collection.mutable.{ArrayBuffer, HashSet}
import utils.Instrument

trait FlowRules {
  val IR: State
}

class Flows {
  private var rules = ArrayBuffer[(String, PartialFunction[(Sym[_], Op[_], SrcCtx, State), Unit])]()
  private[argon] var names = HashSet[String]()

  lazy val instrument = new Instrument("flows")

  def prepend(name: String, func: PartialFunction[(Sym[_], Op[_], SrcCtx, State), Unit]): Unit = {
    rules.prepend((name, func))
    names += name
  }

  def add(name: String, func: PartialFunction[(Sym[_], Op[_], SrcCtx, State), Unit]): Unit = {
    rules += ((name, func))
    names += name
  }

  def remove(name: String): Unit = {
    val idx = rules.indexWhere(_._1 == name)
    rules.remove(idx)
    names.remove(name)
  }

  def apply[A](lhs: Sym[A], rhs: Op[A])(implicit ctx: SrcCtx, state: State): Unit = {
    val tuple = (lhs, rhs, ctx, state)
    rules.foreach { case (name, rule) =>
      if (rule.isDefinedAt(tuple)) {
        instrument(name) { rule.apply(tuple) }
      }
    }
  }

  def save(): Flows = {
    val flows = new Flows
    flows.rules ++= rules
    flows.names ++= names
    flows
  }

  def restore(flow: Flows): Unit = {
    rules = flow.rules
    names = flow.names
  }
}
Example 27
Source File: IdentifyFeyActors.scala From incubator-retired-iota with Apache License 2.0 | 5 votes |
package org.apache.iota.fey

import akka.actor.{Actor, ActorIdentity, ActorLogging, ActorPath, Identify}
import akka.routing.{ActorRefRoutee, GetRoutees, Routees}
import play.api.libs.json._

import scala.collection.mutable.HashSet

protected class IdentifyFeyActors extends Actor with ActorLogging {

  import IdentifyFeyActors._

  override def receive: Receive = {
    case IDENTIFY_TREE(startPath) =>
      log.info("Current Actors in system:")
      actorsPath = HashSet.empty
      rootPath = startPath
      log.info(startPath)
      self ! ActorPath.fromString(startPath)

    case path: ActorPath =>
      context.actorSelection(path / "*") ! Identify(())
      context.actorSelection(path / "*") ! GetRoutees

    case ActorIdentity(_, Some(ref)) =>
      actorsPath.add(ref.path.toString)
      log.info(ref.path.toString)
      self ! ref.path

    case routees: Routees =>
      routees.routees
        .map(_.asInstanceOf[ActorRefRoutee])
        .foreach(routee => {
          log.info(routee.ref.path.toString)
          actorsPath.add(routee.ref.path.toString)
        })

    case _ =>
  }
}

protected object IdentifyFeyActors {

  def generateTreeJson(): String = {
    val trie = new Trie("FEY-MANAGEMENT-SYSTEM")
    actorsPath.map(_.replace("user/", "")).foreach(trie.append(_))
    Json.stringify(trie.print)
  }

  // Static HTML content from d3
  val html = scala.io.Source.fromInputStream(getClass.getResourceAsStream("/d3Tree.html"), "UTF-8")
    .getLines()
    .mkString("\n")

  def getHTMLTree(json: String): String = {
    html.replace("$MYJSONHIERARCHY", json)
  }
}
Example 28
Source File: BytecodeUtils.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.HashSet import scala.language.existentials import org.apache.spark.util.Utils import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.{ClassReader, ClassVisitor, MethodVisitor} import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.Opcodes._ private class MethodInvocationFinder(className: String, methodName: String) extends ClassVisitor(ASM4) { val methodsInvoked = new HashSet[(Class[_], String)] override def visitMethod(access: Int, name: String, desc: String, sig: String, exceptions: Array[String]): MethodVisitor = { if (name == methodName) { new MethodVisitor(ASM4) { override def visitMethodInsn(op: Int, owner: String, name: String, desc: String) { if (op == INVOKEVIRTUAL || op == INVOKESPECIAL || op == INVOKESTATIC) { if (!skipClass(owner)) { methodsInvoked.add((Class.forName(owner.replace("/", ".")), name)) } } } } } else { null } } } }
Example 29
Source File: AllowRule.scala From Hive-JDBC-Proxy with Apache License 2.0 | 5 votes |
package com.enjoyyin.hive.proxy.jdbc.rule import com.enjoyyin.hive.proxy.jdbc.thrift.ProxySession import com.enjoyyin.hive.proxy.jdbc.domain.User import com.enjoyyin.hive.proxy.jdbc.thrift.EventInfo import com.enjoyyin.hive.proxy.jdbc.domain.UserHQL import com.enjoyyin.hive.proxy.jdbc.domain.ThriftServerName import com.enjoyyin.hive.proxy.jdbc.util.ProxyConf import java.util.HashMap import scala.collection.JavaConversions._ import scala.collection.mutable.HashSet import com.enjoyyin.hive.proxy.jdbc.rule.basic.DefaultThriftServerNameRule import com.enjoyyin.hive.proxy.jdbc.util.Logging import com.enjoyyin.hive.proxy.jdbc.domain.HQLPriority import com.enjoyyin.hive.proxy.jdbc.rule.basic.BalancerInfo override def dealOrNot(params: Map[String, String]): ThriftServerName def canDeal(params: Map[String, String]): Boolean } object ThriftServerNameRule extends Logging{ val THRIFT_CONNECTION_NAME = ProxyConf.THRIFT_CONNECTION_NAME val USERNAME_NAME = "username" val IPADDRESS_NAME = "ipAddress" type JMap[K, V] = java.util.Map[K, V] private val registeredRules: HashSet[ThriftServerNameRule] = HashSet[ThriftServerNameRule]() private def toParamsMap(conf: JMap[String, String], username: String, ipAddress: String): Map[String, String] = { var params = conf if(conf == null) { params = new HashMap[String, String] } params += USERNAME_NAME -> username params += IPADDRESS_NAME -> ipAddress params.toMap } private def register(ruleName: String): Unit = { val ruleClass = Class.forName(ruleName).newInstance.asInstanceOf[ThriftServerNameRule] registeredRules.synchronized(registeredRules += ruleClass) logInfo("Registered a thrift-server-name-rule " + ruleName) } def register(ruleNames: Array[String]): Unit = { if(ruleNames.isEmpty) return registeredRules.synchronized { registeredRules.clear ruleNames.foreach(register) } } def getThriftServerName(conf: JMap[String, String], username: String, ipAddress: String): ThriftServerName = { val params = toParamsMap(conf, username, ipAddress) var rule = registeredRules.synchronized(registeredRules.find(_.canDeal(params))) if(rule.isEmpty) { rule = Some(DefaultThriftServerNameRule) } rule.get.dealOrNot(params) } }
Example 30
Source File: LocalKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 31
Source File: CategoryFeatureTest.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.lexicon

import org.scalatest.FunSuite
import org.scalatest.Matchers._

import scala.collection.mutable.HashSet

class JPCategoryFeatureTest extends FunSuite {
  test("equal test") {
    val feat1 = JPCategoryFeature.createFromValues(List("adn", "attr", "ga"))
    val feat2 = JPCategoryFeature.createFromValues(List("nm", "attr", "ga"))
    val feat3 = JPCategoryFeature.createFromValues(List("adn", "attr"))
    val feat4 = JPCategoryFeature.createFromValues(List("adn", "attr", "ga"))

    feat1.kvs should equal (feat4.kvs)
    feat1.kvs should not equal (feat2.kvs)
    feat1.kvs should not equal (feat3.kvs)
  }
}
Example 32
Source File: LocalKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 33
Source File: JobSet.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
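Here the HashSet acts as the set of still-incomplete jobs: the whole batch is added up front, each completion removes one element, and emptiness signals that the batch is done. A stripped-down sketch of the same bookkeeping pattern, using a placeholder task type instead of Spark's internal Job class (names chosen for illustration only):

import scala.collection.mutable.HashSet

object PendingTracker {
  // Placeholder for the real streaming Job; only identity matters here.
  final case class Task(id: Int)

  def main(args: Array[String]): Unit = {
    val tasks = Seq(Task(0), Task(1), Task(2))

    val incomplete = new HashSet[Task]()
    incomplete ++= tasks                 // everything starts out pending

    def complete(t: Task): Unit = {
      incomplete -= t                    // remove the finished task
      if (incomplete.isEmpty) println("batch completed")
    }

    tasks.foreach(complete)
  }
}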
Example 34
Source File: BytecodeUtil.scala From sbt-jni with BSD 3-Clause "New" or "Revised" License | 5 votes |
package ch.jodersky.sbt.jni
package util

import java.io.{ File, FileInputStream, Closeable }
import scala.collection.mutable.{ HashSet }

import org.objectweb.asm.{ ClassReader, ClassVisitor, MethodVisitor, Opcodes }

object BytecodeUtil {

  private class NativeFinder extends ClassVisitor(Opcodes.ASM5) {

    // classes found to contain at least one @native def
    val _nativeClasses = new HashSet[String]
    def nativeClasses = _nativeClasses.toSet

    private var fullyQualifiedName: String = ""

    override def visit(version: Int, access: Int, name: String, signature: String,
      superName: String, interfaces: Array[String]): Unit = {
      fullyQualifiedName = name.replaceAll("/", ".")
    }

    override def visitMethod(access: Int, name: String, desc: String,
      signature: String, exceptions: Array[String]): MethodVisitor = {
      val isNative = (access & Opcodes.ACC_NATIVE) != 0
      if (isNative) {
        _nativeClasses += fullyQualifiedName
      }
      null // return null, do not visit method further
    }
  }

  private def using[A >: Null <: Closeable, R](mkStream: => A)(action: A => R): R = {
    var stream: A = null
    try {
      stream = mkStream
      action(stream)
    } finally {
      if (stream != null) {
        stream.close()
      }
    }
  }

  def nativeClasses(classFile: File): Set[String] = using(new FileInputStream(classFile)) { in =>
    val reader = new ClassReader(in)
    val finder = new NativeFinder
    reader.accept(finder, 0)
    finder.nativeClasses
  }
}
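The public entry point nativeClasses(classFile: File) ties it together: it opens the class file, lets ASM feed it through the NativeFinder visitor, and returns an immutable snapshot of the mutable HashSet. A plausible call site is sketched below; the class-file path is an assumption for illustration, point it at any compiled .class file.

import java.io.File
import ch.jodersky.sbt.jni.util.BytecodeUtil

object FindNativeClasses {
  def main(args: Array[String]): Unit = {
    // Hypothetical path: replace with a real compiled class on disk.
    val classFile = new File("target/classes/com/example/NativeLib.class")
    val withNativeMethods: Set[String] = BytecodeUtil.nativeClasses(classFile)
    withNativeMethods.foreach(println)   // fully qualified names of classes declaring @native defs
  }
}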
Example 35
Source File: Labels.scala From jgo with GNU General Public License v3.0 | 5 votes |
package jgo.tools.compiler
package parser.stmts

import parser.exprs._
import parser.scoped._
import parser.funcs._

import interm._
import types._
import symbol._
import codeseq._
import instr._

import scala.collection.mutable.{HashMap, HashSet, ListBuffer}
import scala.{collection => coll}
import coll.{immutable => imm}

trait Labels {
  private val seenDefs   = HashSet[String]()
  private val unseenDefs = HashMap[String, ListBuffer[Pos]]()
  private val lbls       = HashMap[String, UserLabel]()

  def defLabel(name: String, pos: Pos): (String, Err[UserLabel]) =
    if (seenDefs contains name)
      (name, problem("label %s already defined", name)(pos))
    else {
      seenDefs += name
      unseenDefs -= name
      val label = lbls getOrElseUpdate (name, new UserLabel(name))
      (name, result(label))
    }

  def useLabel(pos: Pos, name: String): UserLabel = {
    if (!(seenDefs contains name))
      unseenDefs.getOrElseUpdate(name, new ListBuffer) += pos
    lbls getOrElseUpdate (name, new UserLabel(name))
  }

  def procGoto(pos: Pos, name: String): Err[CodeBuilder] = {
    result(Goto(useLabel(pos, name)))
  }

  def checkForUndefedLabels: Err[Unit] = {
    var issues: Err[Unit] = result(())
    for ((lblName, positions) <- unseenDefs; pos <- positions) {
      issues = issues then problem("target label not found: %s", lblName)(pos)
    }
    issues
  }
}
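In Labels, the seenDefs HashSet detects duplicate definitions and forward references: defining a label moves its name into seenDefs, while a use of a not-yet-seen name is parked in unseenDefs until a definition arrives or checkForUndefedLabels reports it. The same two-structure idiom, reduced to plain strings and integer positions (a sketch under those simplifying assumptions, not the jgo API):

import scala.collection.mutable.{HashMap, HashSet, ListBuffer}

object LabelCheck {
  val seen    = HashSet[String]()
  val pending = HashMap[String, ListBuffer[Int]]()   // label -> positions of unresolved uses

  def define(name: String): Unit = {
    seen += name          // mark the label as defined
    pending -= name       // any earlier forward references are now resolved
  }

  def use(name: String, pos: Int): Unit =
    if (!(seen contains name)) {
      pending.getOrElseUpdate(name, new ListBuffer[Int]) += pos   // remember the unresolved use
    }

  def main(args: Array[String]): Unit = {
    use("loop", 3)        // forward reference, parked as pending
    define("loop")        // resolves it
    use("exit", 9)        // never defined
    pending.foreach { case (n, ps) =>
      println(s"target label not found: $n (used at ${ps.mkString(",")})")
    }
  }
}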
Example 36
Source File: LocalKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use org.apache.spark.ml.clustering.KMeans
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while (tempDist > convergeDist) {
      var closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int](x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft[(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map { mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))
      }

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
// scalastyle:on println
Example 37
Source File: JobSet.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
Example 38
Source File: LocalKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D){rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while (tempDist > convergeDist) {
      var closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int](x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft[(Int, (Vector[Double], Int))] {
          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
        }
      }

      var newPoints = pointStats.map { mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))
      }

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}