scala.collection.mutable.HashSet Scala Examples
The following examples show how to use scala.collection.mutable.HashSet.
Each example is excerpted from an open-source project; the source file, originating project, and license are noted above the code.
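As a warm-up, here is a minimal, self-contained sketch (not drawn from any of the projects below) of the basic mutable HashSet operations the examples rely on: creation, adding and removing elements, membership tests, and conversion to an immutable Set.

import scala.collection.mutable.HashSet

object HashSetBasics {
  def main(args: Array[String]): Unit = {
    val seen = new HashSet[String]()      // empty mutable set
    seen += "a"                           // add a single element
    seen ++= Seq("b", "c", "b")           // bulk add; duplicates are ignored
    seen -= "c"                           // remove an element

    println(seen.contains("a"))           // true: membership test
    println(seen.size)                    // 2: "a" and "b"
    println(seen.toSet)                   // snapshot as an immutable Set
  }
}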
Example 1
Source File: package.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution

import scala.collection.mutable.HashSet

import org.apache.spark.rdd.RDD
import org.apache.spark.sql._
import org.apache.spark.sql.catalyst.InternalRow
import org.apache.spark.sql.catalyst.expressions.Attribute
import org.apache.spark.sql.catalyst.trees.TreeNodeRef
import org.apache.spark.{Accumulator, AccumulatorParam, Logging}

  case class ColumnMetrics(
    elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty))

  val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0)

  val numColumns: Int = child.output.size
  val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics())

  def dumpStats(): Unit = {
    logDebug(s"== ${child.simpleString} ==")
    logDebug(s"Tuples output: ${tupleCount.value}")
    child.output.zip(columnStats).foreach { case (attr, metric) =>
      val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}")
      logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes")
    }
  }

  protected override def doExecute(): RDD[InternalRow] = {
    child.execute().mapPartitions { iter =>
      new Iterator[InternalRow] {
        def hasNext: Boolean = iter.hasNext
        def next(): InternalRow = {
          val currentRow = iter.next()
          tupleCount += 1
          var i = 0
          while (i < numColumns) {
            val value = currentRow.get(i, output(i).dataType)
            if (value != null) {
              columnStats(i).elementTypes += HashSet(value.getClass.getName)
            }
            i += 1
          }
          currentRow
        }
      }
    }
  }
}
}
Example 2
Source File: Stage.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark._
import org.apache.spark.rdd.RDD
import org.apache.spark.util.CallSite

  def newAttemptId(): Int = {
    val id = nextAttemptId
    nextAttemptId += 1
    id
  }

  def attemptId: Int = nextAttemptId

  override final def hashCode(): Int = id

  override final def equals(other: Any): Boolean = other match {
    case stage: Stage => stage != null && stage.id == id
    case _ => false
  }
}
Example 3
Source File: FeatureSelection.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training import java.io.BufferedWriter import java.io.OutputStreamWriter import java.util import com.airbnb.aerosolve.core.{ModelRecord, ModelHeader, FeatureVector, Example} import com.airbnb.aerosolve.core.models.LinearModel import com.airbnb.aerosolve.core.util.Util import com.typesafe.config.Config import org.slf4j.{LoggerFactory, Logger} import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Buffer import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.util.Random import scala.math.abs import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path object FeatureSelection { private final val log: Logger = LoggerFactory.getLogger("FeatureSelection") val allKey : (String, String) = ("$ALL", "$POS") // Given a RDD compute the pointwise mutual information between // the positive label and the discrete features. def pointwiseMutualInformation(examples : RDD[Example], config : Config, key : String, rankKey : String, posThreshold : Double, minPosCount : Double, newCrosses : Boolean) : RDD[((String, String), Double)] = { val pointwise = LinearRankerUtils.makePointwise(examples, config, key, rankKey) val features = pointwise .mapPartitions(part => { // The tuple2 is var, var | positive val output = scala.collection.mutable.HashMap[(String, String), (Double, Double)]() part.foreach(example =>{ val featureVector = example.example.get(0) val isPos = if (featureVector.floatFeatures.get(rankKey).asScala.head._2 > posThreshold) 1.0 else 0.0 val all : (Double, Double) = output.getOrElse(allKey, (0.0, 0.0)) output.put(allKey, (all._1 + 1.0, all._2 + 1.0 * isPos)) val features : Array[(String, String)] = LinearRankerUtils.getFeatures(featureVector) if (newCrosses) { for (i <- features) { for (j <- features) { if (i._1 < j._1) { val key = ("%s<NEW>%s".format(i._1, j._1), "%s<NEW>%s".format(i._2, j._2)) val x = output.getOrElse(key, (0.0, 0.0)) output.put(key, (x._1 + 1.0, x._2 + 1.0 * isPos)) } } } } for (feature <- features) { val x = output.getOrElse(feature, (0.0, 0.0)) output.put(feature, (x._1 + 1.0, x._2 + 1.0 * isPos)) } }) output.iterator }) .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2)) .filter(x => x._2._2 >= minPosCount) val allCount = features.filter(x => x._1.equals(allKey)).take(1).head features.map(x => { val prob = x._2._1 / allCount._2._1 val probPos = x._2._2 / allCount._2._2 (x._1, math.log(probPos / prob) / math.log(2.0)) }) } // Returns the maximum entropy per family def maxEntropy(input : RDD[((String, String), Double)]) : RDD[((String, String), Double)] = { input .map(x => (x._1._1, (x._1._2, x._2))) .reduceByKey((a, b) => if (math.abs(a._2) > math.abs(b._2)) a else b) .map(x => ((x._1, x._2._1), x._2._2)) } }
Example 4
Source File: package.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.execution import scala.collection.mutable.HashSet import org.apache.spark.rdd.RDD import org.apache.spark.sql._ import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.catalyst.expressions.Attribute import org.apache.spark.sql.catalyst.trees.TreeNodeRef import org.apache.spark.{Accumulator, AccumulatorParam, Logging} case class ColumnMetrics( elementTypes: Accumulator[HashSet[String]] = sparkContext.accumulator(HashSet.empty)) val tupleCount: Accumulator[Int] = sparkContext.accumulator[Int](0) val numColumns: Int = child.output.size val columnStats: Array[ColumnMetrics] = Array.fill(child.output.size)(new ColumnMetrics()) def dumpStats(): Unit = { logDebug(s"== ${child.simpleString} ==") logDebug(s"Tuples output: ${tupleCount.value}") child.output.zip(columnStats).foreach { case(attr, metric) => val actualDataTypes = metric.elementTypes.value.mkString("{", ",", "}") logDebug(s" ${attr.name} ${attr.dataType}: $actualDataTypes") } } protected override def doExecute(): RDD[InternalRow] = { child.execute().mapPartitions { iter => new Iterator[InternalRow] { def hasNext: Boolean = iter.hasNext def next(): InternalRow = { val currentRow = iter.next() tupleCount += 1 var i = 0 while (i < numColumns) { val value = currentRow.get(i, output(i).dataType) if (value != null) { columnStats(i).elementTypes += HashSet(value.getClass.getName) } i += 1 } currentRow } } } } } }
Example 5
Source File: JobSet.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import scala.util.Failure import org.apache.spark.streaming.Time import org.apache.spark.util.Utils private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() //当这jobset提交 private val submissionTime = System.currentTimeMillis() // when this jobset was submitted //当这jobset第一份工作开始处理 private var processingStartTime = -1L // when the first job of this jobset started processing //当这jobset最后的工作处理完 private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing //从他们开始处理的时间来处理所有的工作 // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted //从提交的时间来处理所有的工作时间 // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = { processingEndTime - time.milliseconds } def toBatchInfo: BatchInfo = { val failureReasons: Map[Int, String] = { if (hasCompleted) { jobs.filter(_.result.isFailure).map { job => (job.outputOpId, Utils.exceptionString(job.result.asInstanceOf[Failure[_]].exception)) }.toMap } else { Map.empty } } val binfo = new BatchInfo( time, streamIdToInputInfo, submissionTime, if (processingStartTime >= 0) Some(processingStartTime) else None, if (processingEndTime >= 0) Some(processingEndTime) else None ) binfo.setFailureReason(failureReasons) binfo.setNumOutputOp(jobs.size) binfo } }
Example 6
Source File: LocalKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers(i) val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData val points = new HashSet[Vector[Double]] val kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println(s"Initial centers: $kPoints") while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val mappings = closest.groupBy[Int] (x => x._1) val pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints(mapping._1), mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println(s"Final centers: $kPoints") } } // scalastyle:on println
Example 7
Source File: LocalityPlacementStrategySuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, HashSet, Set} import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.mockito.Mockito._ import org.apache.spark.{SparkConf, SparkFunSuite} class LocalityPlacementStrategySuite extends SparkFunSuite { test("handle large number of containers and tasks (SPARK-18750)") { // Run the test in a thread with a small stack size, since the original issue // surfaced as a StackOverflowError. var error: Throwable = null val runnable = new Runnable() { override def run(): Unit = try { runTest() } catch { case e: Throwable => error = e } } val thread = new Thread(new ThreadGroup("test"), runnable, "test-thread", 32 * 1024) thread.start() thread.join() assert(error === null) } private def runTest(): Unit = { val yarnConf = new YarnConfiguration() // The numbers below have been chosen to balance being large enough to replicate the // original issue while not taking too long to run when the issue is fixed. The main // goal is to create enough requests for localized containers (so there should be many // tasks on several hosts that have no allocated containers). val resource = Resource.newInstance(8 * 1024, 4) val strategy = new LocalityPreferredContainerPlacementStrategy(new SparkConf(), yarnConf, resource, new MockResolver()) val totalTasks = 32 * 1024 val totalContainers = totalTasks / 16 val totalHosts = totalContainers / 16 val mockId = mock(classOf[ContainerId]) val hosts = (1 to totalHosts).map { i => (s"host_$i", totalTasks % i) }.toMap val containers = (1 to totalContainers).map { i => mockId } val count = containers.size / hosts.size / 2 val hostToContainerMap = new HashMap[String, Set[ContainerId]]() hosts.keys.take(hosts.size / 2).zipWithIndex.foreach { case (host, i) => val hostContainers = new HashSet[ContainerId]() containers.drop(count * i).take(i).foreach { c => hostContainers += c } hostToContainerMap(host) = hostContainers } strategy.localityOfRequestedContainers(containers.size * 2, totalTasks, hosts, hostToContainerMap, Nil) } }
Example 8
Source File: JobSet.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0
  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
Example 9
Source File: LocalKMeans.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 10
Source File: CachedRDDManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.recursion import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable.{HashMap, HashSet, Set} class CachedRDDManager(defaultStorageLevel: StorageLevel) extends Logging with Serializable { val iterationToRDDMap = new HashMap[Int, HashSet[RDD[_]]] var currentIteration : Int = 0 def persist(rdd: RDD[_]): Unit = { persist(rdd, false) } def persist(rdd: RDD[_], doMemoryCheckpoint: Boolean): Unit = { iterationToRDDMap.getOrElseUpdate(currentIteration, new HashSet[RDD[_]]).add(rdd) rdd.persist(defaultStorageLevel) if (doMemoryCheckpoint) rdd.memoryCheckpoint() } def cleanUpIteration(iterationsBackToRemove: Int = 2) = { val start = System.currentTimeMillis() if (currentIteration >= iterationsBackToRemove) { val iterationId = currentIteration - iterationsBackToRemove if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get if (rdds.nonEmpty) logInfo("Unpersisting "+rdds.size+" rdds for iteration " + iterationId) rdds.foreach(rdd => rdd.unpersist(false)) } } logInfo("CleanUpIteration took " + (System.currentTimeMillis() - start) + " ms") currentIteration += 1 } def cleanUpIterationById(iterationId: Int) = { if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get rdds.foreach(rdd => rdd.unpersist(false)) } } def incrementIteration() { currentIteration += 1} def clear() = { iterationToRDDMap.clear() } def clear(remainCached: Seq[RDD[_]]) = { iterationToRDDMap.keySet.foreach(key => logInfo("key: " + key + " value: " + iterationToRDDMap.get(key))) iterationToRDDMap.keySet .foreach(key => iterationToRDDMap.get(key) .foreach(value => value.foreach(item => {if (!remainCached.contains(item)) item.unpersist(false)}))) iterationToRDDMap.clear() } def unpersist(rdds: Set[RDD[_]]) = { for (rdd <- rdds) { iterationToRDDMap.synchronized { // rdd should only be in 1 iteration val iterations = iterationToRDDMap.filter(x => x._2.contains(rdd)) if (iterations.nonEmpty) { val iteration = iterations.head iteration._2.remove(rdd) rdd.unpersist(false) if (iteration._2.isEmpty) iterationToRDDMap.remove(iteration._1) } } } } override def toString = { val output = new StringBuilder iterationToRDDMap.keySet.toSeq.sorted .foreach(iteration => { val rdds = iterationToRDDMap.get(iteration) rdds.foreach(rdd => output.append(iteration + ":" + rdd + "\n")) }) output.toString() } }
Example 11
Source File: JobSet.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import org.apache.spark.streaming.Time private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToNumRecords: Map[Int, Long] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = { processingEndTime - time.milliseconds } def toBatchInfo: BatchInfo = { new BatchInfo( time, streamIdToNumRecords, submissionTime, if (processingStartTime >= 0 ) Some(processingStartTime) else None, if (processingEndTime >= 0 ) Some(processingEndTime) else None ) } }
Example 12
Source File: JobSet.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import scala.util.Failure import org.apache.spark.streaming.Time import org.apache.spark.util.Utils private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = { processingEndTime - time.milliseconds } def toBatchInfo: BatchInfo = { BatchInfo( time, streamIdToInputInfo, submissionTime, if (processingStartTime >= 0) Some(processingStartTime) else None, if (processingEndTime >= 0) Some(processingEndTime) else None, jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap ) } }
Example 13
Source File: FixedPointJobDefinition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.fixedpoint

import org.apache.spark.TaskContext
import org.apache.spark.rdd.RDD

import scala.collection.mutable.{HashSet, HashMap, Set}

class FixedPointJobDefinition(val setupIteration: (FixedPointJobDefinition, RDD[_]) => RDD[_],
                              val cleanupIteration: (Int) => Unit) {

  var _fixedPointEvaluator: (TaskContext, Iterator[_]) => Boolean = null
  var finalRDD: RDD[_] = null
  var rddIds = Array.empty[Int] // for all and delta rdd id for FixedPointResultTask execution on worker

  def fixedPointEvaluator(fixedPointEvaluator: (TaskContext, Iterator[_]) => Boolean) = {
    _fixedPointEvaluator = fixedPointEvaluator
  }

  def getfixedPointEvaluator = _fixedPointEvaluator.asInstanceOf[(TaskContext, Iterator[_]) => _]

  def getFinalRDD: RDD[_] = finalRDD

  def setRDDIds(newAllRDDId: Int,
                oldAllRDDId: Int,
                newDeltaPrimeRDDId: Int,
                oldDeltaPrimeRDDId: Int): Unit = {
    rddIds = Array(newAllRDDId, oldAllRDDId, newDeltaPrimeRDDId, oldDeltaPrimeRDDId)
  }
}
Example 14
Source File: LocalKMeans.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData = { def generatePoint(i: Int) = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 15
Source File: HogHBaseReputation.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
package org.hogzilla.hbase import scala.math.random import java.lang.Math import org.apache.spark._ import org.apache.hadoop.hbase.client.HBaseAdmin import org.apache.hadoop.hbase.util.Bytes import org.apache.hadoop.hbase.{HBaseConfiguration, HTableDescriptor, TableName} import org.apache.hadoop.hbase.mapreduce.TableInputFormat import org.apache.spark.mllib.regression.{LabeledPoint,LinearRegressionModel,LinearRegressionWithSGD} import org.apache.spark.mllib.linalg.Vectors import org.apache.spark.rdd.RDD import org.apache.hadoop.hbase.client.HTable import org.apache.hadoop.hbase.filter.SingleColumnValueFilter import org.apache.hadoop.hbase.filter.BinaryComparator import org.apache.hadoop.hbase.filter.FilterList import org.apache.hadoop.hbase.filter.CompareFilter import java.util.ArrayList import org.apache.hadoop.hbase.client.Scan import org.apache.hadoop.hbase.filter.Filter import scala.collection.mutable.HashSet import org.apache.hadoop.hbase.client.Put object HogHBaseReputation { // Ex: MX, whitelist def getReputationList(listName:String, listType:String):Set[String] = { val list = new HashSet[String] val filters: ArrayList[Filter] = new ArrayList(); val colValFilter1 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listType))) colValFilter1.setFilterIfMissing(false); val colValFilter2 = new SingleColumnValueFilter(Bytes.toBytes("rep"), Bytes.toBytes("list"), CompareFilter.CompareOp.EQUAL, new BinaryComparator(Bytes.toBytes(listName))) colValFilter2.setFilterIfMissing(false); filters.add(colValFilter1); filters.add(colValFilter2); val filterList = new FilterList( FilterList.Operator.MUST_PASS_ALL, filters); val scan = new Scan() scan.setFilter(filterList) val it = HogHBaseRDD.hogzilla_reputation.getScanner(scan).iterator() while(it.hasNext()) { list.add( Bytes.toString(it.next().getValue(Bytes.toBytes("rep"),Bytes.toBytes("ip"))) ) } list.toSet } def saveReputationList(listName:String, listType:String, ip:String) = { val put = new Put(Bytes.toBytes(ip+"-"+listName+"-"+listType)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("list_type"), Bytes.toBytes(listType)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("list"), Bytes.toBytes(listName)) put.add(Bytes.toBytes("rep"), Bytes.toBytes("ip"), Bytes.toBytes(ip)) HogHBaseRDD.hogzilla_reputation.put(put) } }
Example 16
Source File: HogConfig.scala From hogzilla with GNU General Public License v2.0 | 5 votes |
package org.hogzilla.util

import java.security.MessageDigest
import org.apache.hadoop.hbase.util.Bytes
import javax.xml.bind.DatatypeConverter
import math._
import com.typesafe.config.Config
import scala.collection.mutable.HashSet

object HogConfig {

  def get(config: Config, key: String, valueType: String, default: Any): Any = {
    if (config == null)
      return default

    try {
      val value = config.getString(key)

      if (value.isEmpty())
        return default // Return default value

      println(f"Configuration: $key => $value")

      if (valueType.equals("Int"))
        value.toInt
      else if (valueType.equals("Double"))
        value.toDouble
      else if (valueType.equals("Long"))
        value.toLong
      else if (valueType.equals("Set(Int)")) {
        val patternSet = "Set\\(".r
        val patternSetEnd = "\\)".r

        if (value.equals("Set()"))
          return Set()

        return (patternSetEnd replaceAllIn ((patternSet replaceAllIn (value, "")), ""))
          .split(",").map({ x => x.toInt }).toSet
      } else if (valueType.equals("Set(String)")) {
        val patternSet = "Set\\(".r
        val patternSetEnd = "\\)".r

        if (value.equals("Set()"))
          return Set()

        return (patternSetEnd replaceAllIn ((patternSet replaceAllIn (value, "")), ""))
          .split(",").map({ x => println(x.toString.trim()); x.toString.trim() }).toSet
      } else
        default // Create type first
    } catch {
      case t: Throwable =>
        t.printStackTrace()
        println(f"Problem parsing $key . Check if it is ok. Using default value")
        return default
    }
  }

  def getInt(config: Config, key: String, default: Any): Int = {
    get(config, key, "Int", default).asInstanceOf[Int]
  }

  def getLong(config: Config, key: String, default: Any): Long = {
    get(config, key, "Long", default).asInstanceOf[Long]
  }

  def getDouble(config: Config, key: String, default: Any): Double = {
    // Fixed: the original cast to Long, which contradicts the declared Double return type
    get(config, key, "Double", default).asInstanceOf[Double]
  }

  def getSetInt(config: Config, key: String, default: Any): Set[Int] = {
    get(config, key, "Set(Int)", default).asInstanceOf[Set[Int]]
  }

  def getSetString(config: Config, key: String, default: Any): Set[String] = {
    get(config, key, "Set(String)", default).asInstanceOf[Set[String]]
  }
}
Example 17
Source File: StopwordDict.scala From berkeley-doc-summarizer with GNU General Public License v3.0 | 5 votes |
package edu.berkeley.nlp.summ.data import scala.collection.mutable.HashSet object StopwordDict { // N.B. This set was extracted from the RST treebank (train and test) mostly to reproduce // Hirao's results; it shouldn't really be used for other things val stopwords = Set("!", "", "#", "$", "%", "&", "'", "''", "'S", "'s", "()", ",", "-", "--", "-owned", ".", "", ":", ";", "<", "?", "", "A", "A.", "", "AND", "After", "All", "Am", "An", "And", "Any", "As", "At", "BE", "Between", "Both", "But", "By", "Each", "Few", "For", "From", "Had", "He", "Here", "How", "I", "If", "In", "Is", "It", "Its", "MORE", "More", "Most", "NO", "No", "No.", "Not", "OF", "Of", "On", "One", "Only", "Or", "Other", "Our", "Over", "She", "So", "Some", "Such", "THE", "Than", "That", "The", "Their", "Then", "There", "These", "They", "Those", "To", "UPS", "Under", "Until", "WHY", "We", "What", "When", "While", "Why", "Would", "You", "`It", "``", "a", "about", "above", "after", "again", "again.", "", "against", "all", "am", "an", "and", "any", "as", "at", "be", "been", "being", "below", "between", "both", "but", "by", "ca", "can", "could", "did", "do", "doing", "down", "each", "few", "for", "from", "further", "had", "have", "having", "he", "her", "here", "herself", "him", "him.", "", "himself", "how", "if", "in", "into", "is", "it", "its", "itself", "let", "lets", "me", "more", "most", "must", "my", "n't", "no", "nor", "not", "of", "off", "on", "one", "ones", "only", "or", "other", "others", "ought", "our", "out", "over", "own", "owned", "owns", "same", "she", "should", "so", "some", "such", "than", "that", "the", "their", "them", "then", "there", "these", "they", "those", "through", "to", "too", "under", "until", "up", "very", "we", "were", "what", "when", "where", "which", "while", "who", "whom", "why", "with", "wo", "would", "you", "your", "yourself", "{", "}") // Leave $ in there val stopwordTags = new HashSet[String] ++ Array("CC", "DT", "EX", "IN", "LS", "MD", "PDT", "POS", "PRN", "PRP", "PRP$", "RP", "SYM", "TO", "WDT", "WP", "WP$", "WRB", ".", ",", "``", "''", ";", ":", "-LRB-", "-RRB-", "-LSB-", "-RSB-", "-LCB-", "-RCB-") }
Example 18
Source File: DepParseDoc.scala From berkeley-doc-summarizer with GNU General Public License v3.0 | 5 votes |
package edu.berkeley.nlp.summ.data

import scala.collection.mutable.HashSet
import scala.collection.mutable.ArrayBuffer

trait DepParseDoc extends Serializable {

  def name: String
  def doc: Seq[DepParse]
  def summary: Seq[DepParse]

  override def toString() = {
    toString(Int.MaxValue)
  }

  def toString(maxNumSentences: Int) = {
    "DOCUMENT:\n" + doc.map(_.getWords.reduce(_ + " " + _)).slice(0, Math.min(maxNumSentences, doc.size)).reduce(_ + "\n" + _) +
      "\nSUMMARY:\n" + summary.map(_.getWords.reduce(_ + " " + _)).slice(0, Math.min(maxNumSentences, doc.size)).reduce(_ + "\n" + _)
  }
}
Example 19
package com.astrolabsoftware.spark3d.spatialOperator

import com.astrolabsoftware.spark3d.geometryObjects.Point3D
import com.astrolabsoftware.spark3d.geometryObjects.Shape3D.Shape3D
import com.astrolabsoftware.spark3d.utils.GeometryObjectComparator
import com.astrolabsoftware.spark3d.utils.Utils.takeOrdered
import com.astrolabsoftware.spark3d.spatialPartitioning._

import org.apache.spark.rdd.RDD

import scala.collection.mutable.{HashSet, ListBuffer}
import scala.collection.JavaConverters._
import scala.reflect.ClassTag

import scala.util.control.Breaks._

object KNN {

  def KNNStandard[T <: Shape3D: ClassTag](
      rdd: RDD[T],
      queryObject: T,
      k: Int,
      unique: Boolean = false): List[T] = {
    val knn = takeOrdered[T](rdd, k, queryObject, unique)(
      new GeometryObjectComparator[T](queryObject.center)
    )
    knn.toList
  }
}
Example 20
Source File: Rule.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.parser import jigg.nlp.ccg.lexicon.{Category, Derivation, Point, UnaryChildPoint, BinaryChildrenPoints, AppliedRule} import scala.collection.mutable.{HashMap, HashSet} import java.io.{ObjectOutputStream, ObjectInputStream} trait Rule { def unify(left:Category, right:Category): Option[Array[(Category, String)]] def raise(child:Category): Option[Array[(Category, String)]] def headFinder:HeadFinder } // rules are restricted to CFG rules extracted from the training CCGBank case class CFGRule(val binaryRules:Map[(Int,Int), Array[(Category, String)]], // category ids -> (category, ruleType) val unaryRules:Map[Int, Array[(Category, String)]], override val headFinder:HeadFinder) extends Rule { def unify(left:Category, right:Category):Option[Array[(Category, String)]] = binaryRules.get((left.id, right.id)) def raise(child:Category):Option[Array[(Category, String)]] = unaryRules.get(child.id) } object CFGRule { def extractRulesFromDerivations(derivations: Array[Derivation], headFinder:HeadFinder): CFGRule = { val binaryRules = new HashMap[(Int, Int), HashSet[(Category, String)]] val unaryRules = new HashMap[Int, HashSet[(Category, String)]] derivations.foreach { deriv => deriv.foreachPoint({ point:Point => deriv.get(point) match { case Some(AppliedRule(UnaryChildPoint(child), ruleType)) => val parents = unaryRules.getOrElseUpdate(child.category.id, new HashSet[(Category, String)]) parents += ((point.category, ruleType)) case Some(AppliedRule(BinaryChildrenPoints(left, right), ruleType)) => val parents = binaryRules.getOrElseUpdate((left.category.id, right.category.id), new HashSet[(Category, String)]) parents += ((point.category, ruleType)) case _ => }}) } new CFGRule(binaryRules.map { case (k, v) => k -> v.toArray }.toMap, unaryRules.map { case (k, v) => k -> v.toArray }.toMap, headFinder) } }
Example 21
Source File: BytecodeUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.graphx.util

import java.io.{ByteArrayInputStream, ByteArrayOutputStream}

import scala.collection.mutable.HashSet
import scala.language.existentials

import org.apache.xbean.asm5.{ClassReader, ClassVisitor, MethodVisitor}
import org.apache.xbean.asm5.Opcodes._

import org.apache.spark.util.Utils

  private class MethodInvocationFinder(className: String, methodName: String)
    extends ClassVisitor(ASM5) {

    val methodsInvoked = new HashSet[(Class[_], String)]

    override def visitMethod(access: Int, name: String, desc: String,
                             sig: String, exceptions: Array[String]): MethodVisitor = {
      if (name == methodName) {
        new MethodVisitor(ASM5) {
          override def visitMethodInsn(
              op: Int, owner: String, name: String, desc: String, itf: Boolean) {
            if (op == INVOKEVIRTUAL || op == INVOKESPECIAL || op == INVOKESTATIC) {
              if (!skipClass(owner)) {
                methodsInvoked.add((Utils.classForName(owner.replace("/", ".")), name))
              }
            }
          }
        }
      } else {
        null
      }
    }
  }
}
Example 22
Source File: JobSet.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler import scala.collection.mutable.HashSet import org.apache.spark.streaming.Time private[streaming] case class JobSet( time: Time, jobs: Seq[Job], streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) { private val incompleteJobs = new HashSet[Job]() private val submissionTime = System.currentTimeMillis() // when this jobset was submitted private var processingStartTime = -1L // when the first job of this jobset started processing private var processingEndTime = -1L // when the last job of this jobset finished processing jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) } incompleteJobs ++= jobs def handleJobStart(job: Job) { if (processingStartTime < 0) processingStartTime = System.currentTimeMillis() } def handleJobCompletion(job: Job) { incompleteJobs -= job if (hasCompleted) processingEndTime = System.currentTimeMillis() } def hasStarted: Boolean = processingStartTime > 0 def hasCompleted: Boolean = incompleteJobs.isEmpty // Time taken to process all the jobs from the time they started processing // (i.e. not including the time they wait in the streaming scheduler queue) def processingDelay: Long = processingEndTime - processingStartTime // Time taken to process all the jobs from the time they were submitted // (i.e. including the time they wait in the streaming scheduler queue) def totalDelay: Long = processingEndTime - time.milliseconds def toBatchInfo: BatchInfo = { BatchInfo( time, streamIdToInputInfo, submissionTime, if (hasStarted) Some(processingStartTime) else None, if (hasCompleted) Some(processingEndTime) else None, jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap ) } }
Example 23
Source File: TaskDescription.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.nio.ByteBuffer import scala.collection.mutable import scala.collection.mutable.HashSet import scala.util.control.NonFatal import org.apache.spark._ import org.apache.spark.internal.Logging import org.apache.spark.serializer.SerializerInstance import org.apache.spark.util.SerializableBuffer private[spark] class TaskDescription( val taskId: Long, val attemptNumber: Int, val executorId: String, val name: String, val index: Int, // Index within this task's TaskSet val isFutureTask: Boolean, @transient private val _task: Task[_], @transient private val _addedFiles: mutable.Map[String, Long], @transient private val _addedJars: mutable.Map[String, Long], @transient private val _ser: SerializerInstance) extends Serializable with Logging { // Because ByteBuffers are not serializable, wrap the task in a SerializableBuffer private var buffer: SerializableBuffer = _ def prepareSerializedTask(): Unit = { if (_task != null) { val serializedTask: ByteBuffer = try { Task.serializeWithDependencies(_task, _addedFiles, _addedJars, _ser) } catch { // If the task cannot be serialized, then there is not point in re-attempting // the task as it will always fail. So just abort the task set. case NonFatal(e) => val msg = s"Failed to serialize the task $taskId, not attempting to retry it." logError(msg, e) // FIXME(shivaram): We dont have a handle to the taskSet here to abort it. throw new TaskNotSerializableException(e) } if (serializedTask.limit > TaskSetManager.TASK_SIZE_TO_WARN_KB * 1024) { logWarning(s"Stage ${_task.stageId} contains a task of very large size " + s"(${serializedTask.limit / 1024} KB). The maximum recommended task size is " + s"${TaskSetManager.TASK_SIZE_TO_WARN_KB} KB.") } buffer = new SerializableBuffer(serializedTask) } else { buffer = new SerializableBuffer(ByteBuffer.allocate(0)) } } def serializedTask: ByteBuffer = buffer.value override def toString: String = "TaskDescription(TID=%d, index=%d)".format(taskId, index) }
Example 24
Source File: FutureTaskWaiter.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashSet import org.apache.spark.internal.Logging import org.apache.spark.MapOutputTracker import org.apache.spark.SparkConf import org.apache.spark.storage.BlockManager import org.apache.spark.storage.ShuffleBlockId import org.apache.spark.util.TimeStampedHashMap private[spark] case class FutureTaskInfo(shuffleId: Int, numMaps: Int, reduceId: Int, taskId: Long, nonZeroPartitions: Option[Array[Int]], taskCb: () => Unit) private[spark] class FutureTaskWaiter( conf: SparkConf, blockManager: BlockManager, mapOutputTracker: MapOutputTracker) extends Logging { // Key is (shuffleId, reduceId) private val futureTaskInfo = new TimeStampedHashMap[(Int, Int), FutureTaskInfo] // Key is (shuffleId, reduceId), value is the set of blockIds we are waiting for private val futureTasksBlockWait = new TimeStampedHashMap[(Int, Int), HashSet[Int]] def submitFutureTask(info: FutureTaskInfo) { futureTasksBlockWait.synchronized { val blocksToWaitFor = if (info.nonZeroPartitions.isDefined) { info.nonZeroPartitions.get.toSet } else { (0 until info.numMaps).toArray.toSet } // Check if all the blocks already exist. If so just trigger taskCb // Count how many outputs have been registered with the MapOutputTracker for this shuffle // and intersect with blocksToWaitFor to only get how many for this reduce are available val availableBlocks = mapOutputTracker.getAvailableMapOutputs(info.shuffleId).intersect(blocksToWaitFor) val mapsToWait = blocksToWaitFor.size val numMapsPending = blocksToWaitFor.size - availableBlocks.size if (availableBlocks.size >= mapsToWait) { info.taskCb() } else { futureTaskInfo.put((info.shuffleId, info.reduceId), info) // NOTE: Its fine not to synchronize here as two future tasks shouldn't be submitted at the // same time Calculate the number of blocks to wait for before starting future task val waitForBlocks = blocksToWaitFor.diff(availableBlocks) futureTasksBlockWait.put( (info.shuffleId, info.reduceId), new HashSet[Int]() ++ waitForBlocks) } } } def shuffleBlockReady(shuffleBlockId: ShuffleBlockId): Unit = { val key = (shuffleBlockId.shuffleId, shuffleBlockId.reduceId) futureTasksBlockWait.synchronized { if (futureTaskInfo.contains(key)) { if (futureTasksBlockWait.contains(key)) { futureTasksBlockWait(key) -= shuffleBlockId.mapId // If we have all the blocks, run the CB if (futureTasksBlockWait(key).size <= 0) { val cb = futureTaskInfo(key).taskCb futureTasksBlockWait.remove(key) futureTaskInfo.remove(key) cb() } } } } } def addMapStatusAvailable(shuffleId: Int, mapId: Int, numReduces: Int, mapStatus: MapStatus) { // NOTE: This should be done before we trigger future tasks. mapOutputTracker.addStatus(shuffleId, mapId, mapStatus) futureTasksBlockWait.synchronized { // Register the output for each reduce task. (0 until numReduces).foreach { reduceId => shuffleBlockReady(new ShuffleBlockId(shuffleId, mapId, reduceId)) } } } }
Example 25
Source File: SurfaceFormDictionary.scala From dbpedia-spotlight-model with Apache License 2.0 | 5 votes |
package org.dbpedia.spotlight.spot.opennlp import java.io.File import org.dbpedia.spotlight.util.bloomfilter.LongFastBloomFilter import scala.collection.mutable.HashSet object SurfaceFormDictionary { def fromIterator(entries: scala.collection.Iterator[String], surfaceformDictionary: SurfaceFormDictionary = new ExactSurfaceFormDictionary()) : SurfaceFormDictionary = { entries.foreach(line => surfaceformDictionary.add(line)) surfaceformDictionary } } object ProbabilisticSurfaceFormDictionary { def fromFile(dictionaryFile: File, caseSensitive: Boolean = true) : SurfaceFormDictionary = { SurfaceFormDictionary.fromIterator(io.Source.fromFile(dictionaryFile).getLines(), new ProbabilisticSurfaceFormDictionary(io.Source.fromFile(dictionaryFile).size, caseSensitive)) } } object ExactSurfaceFormDictionary { def fromFile(dictionaryFile: File, caseSensitive: Boolean = true) : SurfaceFormDictionary = { SurfaceFormDictionary.fromIterator(io.Source.fromFile(dictionaryFile).getLines(), new ExactSurfaceFormDictionary(caseSensitive)) } }
Example 26
Source File: Flows.scala From spatial with MIT License | 5 votes |
package argon

import scala.collection.mutable.{ArrayBuffer, HashSet}
import utils.Instrument

trait FlowRules {
  val IR: State
}

class Flows {
  private var rules = ArrayBuffer[(String, PartialFunction[(Sym[_], Op[_], SrcCtx, State), Unit])]()
  private[argon] var names = HashSet[String]()

  lazy val instrument = new Instrument("flows")

  def prepend(name: String, func: PartialFunction[(Sym[_], Op[_], SrcCtx, State), Unit]): Unit = {
    rules.prepend((name, func))
    names += name
  }

  def add(name: String, func: PartialFunction[(Sym[_], Op[_], SrcCtx, State), Unit]): Unit = {
    rules += ((name, func))
    names += name
  }

  def remove(name: String): Unit = {
    val idx = rules.indexWhere(_._1 == name)
    rules.remove(idx)
    names.remove(name)
  }

  def apply[A](lhs: Sym[A], rhs: Op[A])(implicit ctx: SrcCtx, state: State): Unit = {
    val tuple = (lhs, rhs, ctx, state)
    rules.foreach { case (name, rule) =>
      if (rule.isDefinedAt(tuple)) {
        instrument(name) { rule.apply(tuple) }
      }
    }
  }

  def save(): Flows = {
    val flows = new Flows
    flows.rules ++= rules
    flows.names ++= names
    flows
  }

  def restore(flow: Flows): Unit = {
    rules = flow.rules
    names = flow.names
  }
}
Example 27
Source File: IdentifyFeyActors.scala From incubator-retired-iota with Apache License 2.0 | 5 votes |
package org.apache.iota.fey

import akka.actor.{Actor, ActorIdentity, ActorLogging, ActorPath, Identify}
import akka.routing.{ActorRefRoutee, GetRoutees, Routees}
import play.api.libs.json._

import scala.collection.mutable.HashSet

protected class IdentifyFeyActors extends Actor with ActorLogging {

  import IdentifyFeyActors._

  override def receive: Receive = {
    case IDENTIFY_TREE(startPath) =>
      log.info("Current Actors in system:")
      actorsPath = HashSet.empty
      rootPath = startPath
      log.info(startPath)
      self ! ActorPath.fromString(startPath)

    case path: ActorPath =>
      context.actorSelection(path / "*") ! Identify(())
      context.actorSelection(path / "*") ! GetRoutees

    case ActorIdentity(_, Some(ref)) =>
      actorsPath.add(ref.path.toString)
      log.info(ref.path.toString)
      self ! ref.path

    case routees: Routees =>
      routees.routees
        .map(_.asInstanceOf[ActorRefRoutee])
        .foreach(routee => {
          log.info(routee.ref.path.toString)
          actorsPath.add(routee.ref.path.toString)
        })

    case _ =>
  }
}

protected object IdentifyFeyActors {

  def generateTreeJson(): String = {
    val trie = new Trie("FEY-MANAGEMENT-SYSTEM")
    actorsPath.map(_.replace("user/", "")).foreach(trie.append(_))
    Json.stringify(trie.print)
  }

  // Static HTML content from d3
  val html = scala.io.Source.fromInputStream(getClass.getResourceAsStream("/d3Tree.html"), "UTF-8")
    .getLines()
    .mkString("\n")

  def getHTMLTree(json: String): String = {
    html.replace("$MYJSONHIERARCHY", json)
  }
}
Example 28
Source File: BytecodeUtils.scala From graphx-algorithm with GNU General Public License v2.0 | 5 votes |
package org.apache.spark.graphx.util import java.io.{ByteArrayInputStream, ByteArrayOutputStream} import scala.collection.mutable.HashSet import scala.language.existentials import org.apache.spark.util.Utils import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.{ClassReader, ClassVisitor, MethodVisitor} import com.esotericsoftware.reflectasm.shaded.org.objectweb.asm.Opcodes._ private class MethodInvocationFinder(className: String, methodName: String) extends ClassVisitor(ASM4) { val methodsInvoked = new HashSet[(Class[_], String)] override def visitMethod(access: Int, name: String, desc: String, sig: String, exceptions: Array[String]): MethodVisitor = { if (name == methodName) { new MethodVisitor(ASM4) { override def visitMethodInsn(op: Int, owner: String, name: String, desc: String) { if (op == INVOKEVIRTUAL || op == INVOKESPECIAL || op == INVOKESTATIC) { if (!skipClass(owner)) { methodsInvoked.add((Class.forName(owner.replace("/", ".")), name)) } } } } } else { null } } } }
Example 29
Source File: AllowRule.scala From Hive-JDBC-Proxy with Apache License 2.0 | 5 votes |
package com.enjoyyin.hive.proxy.jdbc.rule import com.enjoyyin.hive.proxy.jdbc.thrift.ProxySession import com.enjoyyin.hive.proxy.jdbc.domain.User import com.enjoyyin.hive.proxy.jdbc.thrift.EventInfo import com.enjoyyin.hive.proxy.jdbc.domain.UserHQL import com.enjoyyin.hive.proxy.jdbc.domain.ThriftServerName import com.enjoyyin.hive.proxy.jdbc.util.ProxyConf import java.util.HashMap import scala.collection.JavaConversions._ import scala.collection.mutable.HashSet import com.enjoyyin.hive.proxy.jdbc.rule.basic.DefaultThriftServerNameRule import com.enjoyyin.hive.proxy.jdbc.util.Logging import com.enjoyyin.hive.proxy.jdbc.domain.HQLPriority import com.enjoyyin.hive.proxy.jdbc.rule.basic.BalancerInfo override def dealOrNot(params: Map[String, String]): ThriftServerName def canDeal(params: Map[String, String]): Boolean } object ThriftServerNameRule extends Logging{ val THRIFT_CONNECTION_NAME = ProxyConf.THRIFT_CONNECTION_NAME val USERNAME_NAME = "username" val IPADDRESS_NAME = "ipAddress" type JMap[K, V] = java.util.Map[K, V] private val registeredRules: HashSet[ThriftServerNameRule] = HashSet[ThriftServerNameRule]() private def toParamsMap(conf: JMap[String, String], username: String, ipAddress: String): Map[String, String] = { var params = conf if(conf == null) { params = new HashMap[String, String] } params += USERNAME_NAME -> username params += IPADDRESS_NAME -> ipAddress params.toMap } private def register(ruleName: String): Unit = { val ruleClass = Class.forName(ruleName).newInstance.asInstanceOf[ThriftServerNameRule] registeredRules.synchronized(registeredRules += ruleClass) logInfo("Registered a thrift-server-name-rule " + ruleName) } def register(ruleNames: Array[String]): Unit = { if(ruleNames.isEmpty) return registeredRules.synchronized { registeredRules.clear ruleNames.foreach(register) } } def getThriftServerName(conf: JMap[String, String], username: String, ipAddress: String): ThriftServerName = { val params = toParamsMap(conf, username, ipAddress) var rule = registeredRules.synchronized(registeredRules.find(_.canDeal(params))) if(rule.isEmpty) { rule = Some(DefaultThriftServerNameRule) } rule.get.dealOrNot(params) } }
Example 30
Source File: LocalKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 31
Source File: CategoryFeatureTest.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.lexicon

import org.scalatest.FunSuite
import org.scalatest.Matchers._

import scala.collection.mutable.HashSet

class JPCategoryFeatureTest extends FunSuite {
  test("equal test") {
    val feat1 = JPCategoryFeature.createFromValues(List("adn", "attr", "ga"))
    val feat2 = JPCategoryFeature.createFromValues(List("nm", "attr", "ga"))
    val feat3 = JPCategoryFeature.createFromValues(List("adn", "attr"))
    val feat4 = JPCategoryFeature.createFromValues(List("adn", "attr", "ga"))

    feat1.kvs should equal (feat4.kvs)
    feat1.kvs should not equal (feat2.kvs)
    feat1.kvs should not equal (feat3.kvs)
  }
}
Example 32
Source File: LocalKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 33
Source File: JobSet.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
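Here the HashSet acts as the set of still-incomplete jobs: the whole batch is added up front, each completion removes one element, and emptiness signals that the batch is done. A stripped-down sketch of the same bookkeeping pattern, using a placeholder task type instead of Spark's internal Job class (names chosen for illustration only):

import scala.collection.mutable.HashSet

object PendingTracker {
  // Placeholder for the real streaming Job; only identity matters here.
  final case class Task(id: Int)

  def main(args: Array[String]): Unit = {
    val tasks = Seq(Task(0), Task(1), Task(2))

    val incomplete = new HashSet[Task]()
    incomplete ++= tasks                 // everything starts out pending

    def complete(t: Task): Unit = {
      incomplete -= t                    // remove the finished task
      if (incomplete.isEmpty) println("batch completed")
    }

    tasks.foreach(complete)
  }
}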
Example 34
Source File: BytecodeUtil.scala From sbt-jni with BSD 3-Clause "New" or "Revised" License | 5 votes |
package ch.jodersky.sbt.jni
package util

import java.io.{ File, FileInputStream, Closeable }
import scala.collection.mutable.{ HashSet }

import org.objectweb.asm.{ ClassReader, ClassVisitor, MethodVisitor, Opcodes }

object BytecodeUtil {

  private class NativeFinder extends ClassVisitor(Opcodes.ASM5) {

    // classes found to contain at least one @native def
    val _nativeClasses = new HashSet[String]
    def nativeClasses = _nativeClasses.toSet

    private var fullyQualifiedName: String = ""

    override def visit(version: Int, access: Int, name: String, signature: String,
      superName: String, interfaces: Array[String]): Unit = {
      fullyQualifiedName = name.replaceAll("/", ".")
    }

    override def visitMethod(access: Int, name: String, desc: String,
      signature: String, exceptions: Array[String]): MethodVisitor = {
      val isNative = (access & Opcodes.ACC_NATIVE) != 0
      if (isNative) {
        _nativeClasses += fullyQualifiedName
      }
      null // return null, do not visit method further
    }
  }

  private def using[A >: Null <: Closeable, R](mkStream: => A)(action: A => R): R = {
    var stream: A = null
    try {
      stream = mkStream
      action(stream)
    } finally {
      if (stream != null) {
        stream.close()
      }
    }
  }

  def nativeClasses(classFile: File): Set[String] = using(new FileInputStream(classFile)) { in =>
    val reader = new ClassReader(in)
    val finder = new NativeFinder
    reader.accept(finder, 0)
    finder.nativeClasses
  }
}
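The public entry point nativeClasses(classFile: File) ties it together: it opens the class file, lets ASM feed it through the NativeFinder visitor, and returns an immutable snapshot of the mutable HashSet. A plausible call site is sketched below; the class-file path is an assumption for illustration, point it at any compiled .class file.

import java.io.File
import ch.jodersky.sbt.jni.util.BytecodeUtil

object FindNativeClasses {
  def main(args: Array[String]): Unit = {
    // Hypothetical path: replace with a real compiled class on disk.
    val classFile = new File("target/classes/com/example/NativeLib.class")
    val withNativeMethods: Set[String] = BytecodeUtil.nativeClasses(classFile)
    withNativeMethods.foreach(println)   // fully qualified names of classes declaring @native defs
  }
}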
Example 35
Source File: Labels.scala From jgo with GNU General Public License v3.0 | 5 votes |
package jgo.tools.compiler
package parser.stmts

import parser.exprs._
import parser.scoped._
import parser.funcs._

import interm._
import types._
import symbol._
import codeseq._
import instr._

import scala.collection.mutable.{HashMap, HashSet, ListBuffer}
import scala.{collection => coll}
import coll.{immutable => imm}

trait Labels {
  private val seenDefs   = HashSet[String]()
  private val unseenDefs = HashMap[String, ListBuffer[Pos]]()
  private val lbls       = HashMap[String, UserLabel]()

  def defLabel(name: String, pos: Pos): (String, Err[UserLabel]) =
    if (seenDefs contains name)
      (name, problem("label %s already defined", name)(pos))
    else {
      seenDefs += name
      unseenDefs -= name
      val label = lbls getOrElseUpdate (name, new UserLabel(name))
      (name, result(label))
    }

  def useLabel(pos: Pos, name: String): UserLabel = {
    if (!(seenDefs contains name))
      unseenDefs.getOrElseUpdate(name, new ListBuffer) += pos
    lbls getOrElseUpdate (name, new UserLabel(name))
  }

  def procGoto(pos: Pos, name: String): Err[CodeBuilder] = {
    result(Goto(useLabel(pos, name)))
  }

  def checkForUndefedLabels: Err[Unit] = {
    var issues: Err[Unit] = result(())
    for ((lblName, positions) <- unseenDefs; pos <- positions) {
      issues = issues then problem("target label not found: %s", lblName)(pos)
    }
    issues
  }
}
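In Labels, the seenDefs HashSet detects duplicate definitions and forward references: defining a label moves its name into seenDefs, while a use of a not-yet-seen name is parked in unseenDefs until a definition arrives or checkForUndefedLabels reports it. The same two-structure idiom, reduced to plain strings and integer positions (a sketch under those simplifying assumptions, not the jgo API):

import scala.collection.mutable.{HashMap, HashSet, ListBuffer}

object LabelCheck {
  val seen    = HashSet[String]()
  val pending = HashMap[String, ListBuffer[Int]]()   // label -> positions of unresolved uses

  def define(name: String): Unit = {
    seen += name          // mark the label as defined
    pending -= name       // any earlier forward references are now resolved
  }

  def use(name: String, pos: Int): Unit =
    if (!(seen contains name)) {
      pending.getOrElseUpdate(name, new ListBuffer[Int]) += pos   // remember the unresolved use
    }

  def main(args: Array[String]): Unit = {
    use("loop", 3)        // forward reference, parked as pending
    define("loop")        // resolves it
    use("exit", 9)        // never defined
    pending.foreach { case (n, ps) =>
      println(s"target label not found: $n (used at ${ps.mkString(",")})")
    }
  }
}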
Example 36
Source File: LocalKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use org.apache.spark.ml.clustering.KMeans
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while (tempDist > convergeDist) {
      var closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int](x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft[(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map { mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))
      }

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
// scalastyle:on println
Example 37
Source File: JobSet.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.scheduler

import scala.collection.mutable.HashSet

import org.apache.spark.streaming.Time

private[streaming]
case class JobSet(
    time: Time,
    jobs: Seq[Job],
    streamIdToInputInfo: Map[Int, StreamInputInfo] = Map.empty) {

  private val incompleteJobs = new HashSet[Job]()
  private val submissionTime = System.currentTimeMillis() // when this jobset was submitted
  private var processingStartTime = -1L // when the first job of this jobset started processing
  private var processingEndTime = -1L // when the last job of this jobset finished processing

  jobs.zipWithIndex.foreach { case (job, i) => job.setOutputOpId(i) }
  incompleteJobs ++= jobs

  def handleJobStart(job: Job) {
    if (processingStartTime < 0) processingStartTime = System.currentTimeMillis()
  }

  def handleJobCompletion(job: Job) {
    incompleteJobs -= job
    if (hasCompleted) processingEndTime = System.currentTimeMillis()
  }

  def hasStarted: Boolean = processingStartTime > 0

  def hasCompleted: Boolean = incompleteJobs.isEmpty

  // Time taken to process all the jobs from the time they started processing
  // (i.e. not including the time they wait in the streaming scheduler queue)
  def processingDelay: Long = processingEndTime - processingStartTime

  // Time taken to process all the jobs from the time they were submitted
  // (i.e. including the time they wait in the streaming scheduler queue)
  def totalDelay: Long = processingEndTime - time.milliseconds

  def toBatchInfo: BatchInfo = {
    BatchInfo(
      time,
      streamIdToInputInfo,
      submissionTime,
      if (hasStarted) Some(processingStartTime) else None,
      if (hasCompleted) Some(processingEndTime) else None,
      jobs.map { job => (job.outputOpId, job.toOutputOperationInfo) }.toMap
    )
  }
}
Example 38
Source File: LocalKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{Vector, DenseVector, squaredDistance}

import org.apache.spark.SparkContext._

object LocalKMeans {
  val N = 1000
  val R = 1000    // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D){rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use the KMeans method found in org.apache.spark.mllib.clustering
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while (tempDist > convergeDist) {
      var closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int](x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft[(Int, (Vector[Double], Int))] {
          case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2))
        }
      }

      var newPoints = pointStats.map { mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))
      }

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}