scala.collection.mutable.HashMap Scala Examples
The following examples show how to use scala.collection.mutable.HashMap.
Each example links back to its original project and source file.
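Before working through the project examples, the short, self-contained sketch below shows the mutable HashMap operations that recur throughout them: counting with getOrElse/put, bulk insertion with ++=, nested maps via getOrElseUpdate, and merging two maps. All names in it (HashMapBasics, counts, nested, and the sample data) are illustrative and do not come from any of the projects listed.

import scala.collection.mutable.HashMap

// Minimal sketch of the HashMap operations that recur in the examples below.
// All names here are illustrative, not taken from any project on this page.
object HashMapBasics {
  def main(args: Array[String]): Unit = {
    val counts = new HashMap[String, Int]()

    // Count occurrences: read with a default, then write back.
    Seq("a", "b", "a", "c", "a").foreach { w =>
      counts.put(w, counts.getOrElse(w, 0) + 1)
    }

    // Bulk-add pairs from another collection.
    counts ++= Map("d" -> 1, "e" -> 2)

    // getOrElseUpdate inserts the default only when the key is absent.
    val nested = new HashMap[String, HashMap[Int, Long]]()
    nested.getOrElseUpdate("topic", new HashMap[Int, Long]) += 0 -> 42L

    // Merge another map into this one, summing values on key collisions.
    val other = Map("a" -> 10, "z" -> 1)
    other.foreach { case (k, v) => counts.put(k, counts.getOrElse(k, 0) + v) }

    println(counts)  // e.g. HashMap(a -> 13, b -> 1, c -> 1, d -> 1, e -> 2, z -> 1)
    println(nested)  // HashMap(topic -> HashMap(0 -> 42))
  }
}

The same read-with-default-then-write pattern appears in Examples 2, 5, and 28 below.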
Example 1
Source File: IOCommon.scala From Swallow with Apache License 2.0 | 5 votes |
package com.intel.hibench.sparkbench.common

import java.io.{File, FileInputStream, IOException, InputStreamReader}
import java.util.Properties

import org.apache.hadoop.io.compress.CompressionCodec
import org.apache.hadoop.io.{NullWritable, Text}
import org.apache.hadoop.mapred.SequenceFileOutputFormat
import org.apache.spark.rdd.RDD
import org.apache.spark.{SparkContext, SparkException}

import scala.collection.JavaConversions._
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag
import scala.reflect.runtime.universe.TypeTag

class IOCommon(val sc: SparkContext) {

  def load[T: ClassTag: TypeTag](filename: String, force_format: Option[String] = None) = {
    val input_format = force_format.getOrElse(
      IOCommon.getProperty("sparkbench.inputformat").getOrElse("Text"))
    input_format match {
      case "Text" =>
        sc.textFile(filename)
      case "Sequence" =>
        sc.sequenceFile[NullWritable, Text](filename).map(_._2.toString)
      case _ =>
        throw new UnsupportedOperationException(s"Unknown input format: $input_format")
    }
  }

  def save(filename: String, data: RDD[_], prefix: String) = {
    val output_format = IOCommon.getProperty(prefix).getOrElse("Text")
    val output_format_codec =
      loadClassByName[CompressionCodec](IOCommon.getProperty(prefix + ".codec"))

    output_format match {
      case "Text" =>
        if (output_format_codec.isEmpty) data.saveAsTextFile(filename)
        else data.saveAsTextFile(filename, output_format_codec.get)
      case "Sequence" =>
        val sequence_data = data.map(x => (NullWritable.get(), new Text(x.toString)))
        if (output_format_codec.isEmpty) {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](filename)
        } else {
          sequence_data.saveAsHadoopFile[SequenceFileOutputFormat[NullWritable, Text]](
            filename, output_format_codec.get)
        }
      case _ =>
        throw new UnsupportedOperationException(s"Unknown output format: $output_format")
    }
  }

  def save(filename: String, data: RDD[_]): Unit = save(filename, data, "sparkbench.outputformat")

  private def loadClassByName[T](name: Option[String]) = {
    if (!name.isEmpty) Some(Class.forName(name.get)
      .newInstance.asInstanceOf[T].getClass) else None
  }

  private def callMethod[T, R](obj: T, method_name: String) =
    obj.getClass.getMethod(method_name).invoke(obj).asInstanceOf[R]
}

object IOCommon {

  private val sparkbench_conf: HashMap[String, String] =
    getPropertiesFromFile(System.getenv("SPARKBENCH_PROPERTIES_FILES"))

  def getPropertiesFromFile(filenames: String): HashMap[String, String] = {
    val result = new HashMap[String, String]
    filenames.split(',').filter(_.stripMargin.length > 0).foreach { filename =>
      val file = new File(filename)
      require(file.exists, s"Properties file $file does not exist")
      require(file.isFile, s"Properties file $file is not a normal file")

      val inReader = new InputStreamReader(new FileInputStream(file), "UTF-8")
      try {
        val properties = new Properties()
        properties.load(inReader)
        result ++= properties.stringPropertyNames()
          .map(k => (k, properties(k).trim)).toMap
      } catch {
        case e: IOException =>
          val message = s"Failed when loading Sparkbench properties file $file"
          throw new SparkException(message, e)
      } finally {
        inReader.close()
      }
    }
    result.filter { case (key, value) => value.toLowerCase != "none" }
  }

  def getProperty(key: String): Option[String] = sparkbench_conf.get(key)

  def dumpProperties(): Unit = sparkbench_conf
    .foreach { case (key, value) => println(s"$key\t\t$value") }
}
Example 2
Source File: UniqueTermAccumulator.scala From sparkpipe-core with Apache License 2.0 | 5 votes |
package software.uncharted.sparkpipe.ops.core.dataframe.text.util

import org.apache.spark.sql.Row
import org.apache.spark.util.AccumulatorV2

import scala.collection.mutable.HashMap

private[text] class UniqueTermAccumulator(
  private var result: HashMap[String, Int],
  private var touched: Boolean = false
) extends AccumulatorV2[Seq[String], HashMap[String, Int]] {

  def this() {
    this(new HashMap[String, Int]())
  }

  override def add(in: Seq[String]): Unit = {
    in.foreach(w => {
      result.put(w, result.getOrElse(w, 0) + 1)
    })
  }

  override def copy(): AccumulatorV2[Seq[String], HashMap[String, Int]] = {
    val clone = new HashMap[String, Int]()
    result.foreach(kv => clone.put(kv._1, kv._2))
    new UniqueTermAccumulator(clone, false)
  }

  override def isZero(): Boolean = {
    !touched
  }

  override def merge(other: AccumulatorV2[Seq[String], HashMap[String, Int]]): Unit = {
    other.value.foreach(t => {
      result.put(t._1, result.getOrElse(t._1, 0) + t._2)
    })
  }

  override def reset(): Unit = {
    result.clear
    touched = false
  }

  override def value: HashMap[String, Int] = {
    result
  }
}
Example 3
Source File: Mapper.scala From Scalaprof with GNU General Public License v2.0 | 5 votes |
package edu.neu.coe.scala.mapreduce

import akka.actor.{ Actor, ActorLogging, ActorRef }

import scala.collection.mutable.HashMap
import scala.util._

class Mapper_Forgiving[K1, V1, K2, V2](f: (K1, V1) => (K2, V2)) extends Mapper[K1, V1, K2, V2](f) {

  override def prepareReply(v2k2ts: Seq[Try[(K2, V2)]]) = {
    val v2sK2m = HashMap[K2, Seq[V2]]() // mutable
    val xs = Seq[Throwable]() // mutable
    for (v2k2t <- v2k2ts; v2k2e = Master.sequence(v2k2t)) v2k2e match {
      case Right((k2, v2)) => v2sK2m put(k2, v2 +: (v2sK2m get (k2) getOrElse (Nil)))
      case Left(x) => xs :+ x
    }
    (v2sK2m.toMap, xs.toSeq)
  }
}

case class Incoming[K, V](m: Seq[(K, V)]) {
  override def toString = s"Incoming: with ${m.size} elements"
}

object Incoming {
  def sequence[K, V](vs: Seq[V]): Incoming[K, V] =
    Incoming((vs zip Stream.continually(null.asInstanceOf[K])).map { _.swap })

  def map[K, V](vKm: Map[K, V]): Incoming[K, V] = Incoming(vKm.toSeq)
}

object Mapper {
}
Example 4
Source File: LocalKMeans.scala From drizzle-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println
package org.apache.spark.examples

import java.util.Random

import scala.collection.mutable.HashMap
import scala.collection.mutable.HashSet

import breeze.linalg.{squaredDistance, DenseVector, Vector}

object LocalKMeans {
  val N = 1000
  val R = 1000 // Scaling factor
  val D = 10
  val K = 10
  val convergeDist = 0.001
  val rand = new Random(42)

  def generateData: Array[DenseVector[Double]] = {
    def generatePoint(i: Int): DenseVector[Double] = {
      DenseVector.fill(D) {rand.nextDouble * R}
    }
    Array.tabulate(N)(generatePoint)
  }

  def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = {
    var index = 0
    var bestIndex = 0
    var closest = Double.PositiveInfinity

    for (i <- 1 to centers.size) {
      val vCurr = centers.get(i).get
      val tempDist = squaredDistance(p, vCurr)
      if (tempDist < closest) {
        closest = tempDist
        bestIndex = i
      }
    }

    bestIndex
  }

  def showWarning() {
    System.err.println(
      """WARN: This is a naive implementation of KMeans Clustering and is given as an example!
        |Please use org.apache.spark.ml.clustering.KMeans
        |for more conventional use.
      """.stripMargin)
  }

  def main(args: Array[String]) {

    showWarning()

    val data = generateData
    var points = new HashSet[Vector[Double]]
    var kPoints = new HashMap[Int, Vector[Double]]
    var tempDist = 1.0

    while (points.size < K) {
      points.add(data(rand.nextInt(N)))
    }

    val iter = points.iterator
    for (i <- 1 to points.size) {
      kPoints.put(i, iter.next())
    }

    println("Initial centers: " + kPoints)

    while (tempDist > convergeDist) {
      var closest = data.map(p => (closestPoint(p, kPoints), (p, 1)))

      var mappings = closest.groupBy[Int](x => x._1)

      var pointStats = mappings.map { pair =>
        pair._2.reduceLeft[(Int, (Vector[Double], Int))] {
          case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2))
        }
      }

      var newPoints = pointStats.map { mapping =>
        (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))
      }

      tempDist = 0.0
      for (mapping <- newPoints) {
        tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2)
      }

      for (newP <- newPoints) {
        kPoints.put(newP._1, newP._2)
      }
    }

    println("Final centers: " + kPoints)
  }
}
// scalastyle:on println
Example 5
Source File: JsonUtils.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010

import java.io.Writer

import scala.collection.mutable.HashMap
import scala.util.control.NonFatal

import org.apache.kafka.common.TopicPartition
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

  def partitionOffsets(partitionOffsets: Map[TopicPartition, Long]): String = {
    val result = new HashMap[String, HashMap[Int, Long]]()
    partitionOffsets.foreach { case (tp, off) =>
      val parts = result.getOrElse(tp.topic, new HashMap[Int, Long])
      parts += tp.partition -> off
      result += tp.topic -> parts
    }
    Serialization.write(result)
  }
}
Example 6
Source File: DStreamCheckpointData.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.streaming.dstream import java.io.{IOException, ObjectInputStream, ObjectOutputStream} import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.hadoop.fs.{FileSystem, Path} import org.apache.spark.internal.Logging import org.apache.spark.streaming.Time import org.apache.spark.util.Utils private[streaming] class DStreamCheckpointData[T: ClassTag](dstream: DStream[T]) extends Serializable with Logging { protected val data = new HashMap[Time, AnyRef]() // Mapping of the batch time to the checkpointed RDD file of that time @transient private var timeToCheckpointFile = new HashMap[Time, String] // Mapping of the batch time to the time of the oldest checkpointed RDD // in that batch's checkpoint data @transient private var timeToOldestCheckpointFileTime = new HashMap[Time, Time] @transient private var fileSystem: FileSystem = null protected[streaming] def currentCheckpointFiles = data.asInstanceOf[HashMap[Time, String]] def restore() { // Create RDDs from the checkpoint data currentCheckpointFiles.foreach { case(time, file) => logInfo("Restoring checkpointed RDD for time " + time + " from file '" + file + "'") dstream.generatedRDDs += ((time, dstream.context.sparkContext.checkpointFile[T](file))) } } override def toString: String = { "[\n" + currentCheckpointFiles.size + " checkpoint files \n" + currentCheckpointFiles.mkString("\n") + "\n]" } @throws(classOf[IOException]) private def writeObject(oos: ObjectOutputStream): Unit = Utils.tryOrIOException { logDebug(this.getClass().getSimpleName + ".writeObject used") if (dstream.context.graph != null) { dstream.context.graph.synchronized { if (dstream.context.graph.checkpointInProgress) { oos.defaultWriteObject() } else { val msg = "Object of " + this.getClass.getName + " is being serialized " + " possibly as a part of closure of an RDD operation. This is because " + " the DStream object is being referred to from within the closure. " + " Please rewrite the RDD operation inside this DStream to avoid this. " + " This has been enforced to avoid bloating of Spark tasks " + " with unnecessary objects." throw new java.io.NotSerializableException(msg) } } } else { throw new java.io.NotSerializableException( "Graph is unexpectedly null when DStream is being serialized.") } } @throws(classOf[IOException]) private def readObject(ois: ObjectInputStream): Unit = Utils.tryOrIOException { logDebug(this.getClass().getSimpleName + ".readObject used") ois.defaultReadObject() timeToOldestCheckpointFileTime = new HashMap[Time, Time] timeToCheckpointFile = new HashMap[Time, String] } }
Example 7
Source File: MasterWebUI.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui

import scala.collection.mutable.HashMap

import org.eclipse.jetty.servlet.ServletContextHandler

import org.apache.spark.deploy.master.Master
import org.apache.spark.internal.Logging
import org.apache.spark.ui.{SparkUI, WebUI}
import org.apache.spark.ui.JettyUtils._

  def initialize() {
    val masterPage = new MasterPage(this)
    attachPage(new ApplicationPage(this))
    attachPage(masterPage)
    attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static"))
    attachHandler(createRedirectHandler(
      "/app/kill", "/", masterPage.handleAppKillRequest, httpMethods = Set("POST")))
    attachHandler(createRedirectHandler(
      "/driver/kill", "/", masterPage.handleDriverKillRequest, httpMethods = Set("POST")))
  }

  def addProxyTargets(id: String, target: String): Unit = {
    var endTarget = target.stripSuffix("/")
    val handler = createProxyHandler("/proxy/" + id, endTarget)
    attachHandler(handler)
    proxyHandlers(id) = handler
  }

  def removeProxyTargets(id: String): Unit = {
    proxyHandlers.remove(id).foreach(detachHandler)
  }
}

private[master] object MasterWebUI {
  private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR
}
Example 8
Source File: PoolTable.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import java.net.URLEncoder import scala.collection.mutable.HashMap import scala.xml.Node import org.apache.spark.scheduler.{Schedulable, StageInfo} import org.apache.spark.ui.UIUtils private[ui] class PoolTable(pools: Seq[Schedulable], parent: StagesTab) { private val listener = parent.progressListener def toNodeSeq: Seq[Node] = { listener.synchronized { poolTable(poolRow, pools) } } private def poolTable( makeRow: (Schedulable, HashMap[String, HashMap[Int, StageInfo]]) => Seq[Node], rows: Seq[Schedulable]): Seq[Node] = { <table class="table table-bordered table-striped table-condensed sortable table-fixed"> <thead> <th>Pool Name</th> <th>Minimum Share</th> <th>Pool Weight</th> <th>Active Stages</th> <th>Running Tasks</th> <th>SchedulingMode</th> </thead> <tbody> {rows.map(r => makeRow(r, listener.poolToActiveStages))} </tbody> </table> } private def poolRow( p: Schedulable, poolToActiveStages: HashMap[String, HashMap[Int, StageInfo]]): Seq[Node] = { val activeStages = poolToActiveStages.get(p.name) match { case Some(stages) => stages.size case None => 0 } val href = "%s/stages/pool?poolname=%s" .format(UIUtils.prependBaseUri(parent.basePath), URLEncoder.encode(p.name, "UTF-8")) <tr> <td> <a href={href}>{p.name}</a> </td> <td>{p.minShare}</td> <td>{p.weight}</td> <td>{activeStages}</td> <td>{p.runningTasks}</td> <td>{p.schedulingMode}</td> </tr> } }
Example 9
Source File: ConfigReader.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.internal.config

import java.util.{Map => JMap}
import java.util.regex.Pattern

import scala.collection.mutable.HashMap
import scala.util.matching.Regex

private object ConfigReader {
  private val REF_RE = "\\$\\{(?:(\\w+?):)?(\\S+?)\\}".r
}

  def substitute(input: String): String = substitute(input, Set())

  private def substitute(input: String, usedRefs: Set[String]): String = {
    if (input != null) {
      ConfigReader.REF_RE.replaceAllIn(input, { m =>
        val prefix = m.group(1)
        val name = m.group(2)
        val ref = if (prefix == null) name else s"$prefix:$name"
        require(!usedRefs.contains(ref), s"Circular reference in $input: $ref")

        val replacement = bindings.get(prefix)
          .flatMap(_.get(name))
          .map { v => substitute(v, usedRefs + ref) }
          .getOrElse(m.matched)
        Regex.quoteReplacement(replacement)
      })
    } else {
      input
    }
  }
}
Example 10
Source File: StageInfo.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler

import scala.collection.mutable.HashMap

import org.apache.spark.annotation.DeveloperApi
import org.apache.spark.executor.TaskMetrics
import org.apache.spark.storage.RDDInfo

  def fromStage(
      stage: Stage,
      attemptId: Int,
      numTasks: Option[Int] = None,
      taskMetrics: TaskMetrics = null,
      taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty
    ): StageInfo = {
    val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd)
    val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos
    new StageInfo(
      stage.id,
      attemptId,
      stage.name,
      numTasks.getOrElse(stage.numTasks),
      rddInfos,
      stage.parents.map(_.id),
      stage.details,
      taskMetrics,
      taskLocalityPreferences)
  }
}
Example 11
Source File: GroupedCountEvaluator.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.partial

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.reflect.ClassTag

import org.apache.spark.util.collection.OpenHashMap

private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double)
  extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] {

  private var outputsMerged = 0
  private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key

  override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = {
    outputsMerged += 1
    taskResult.foreach { case (key, value) =>
      sums.changeValue(key, value, _ + value)
    }
  }

  override def currentResult(): Map[T, BoundedDouble] = {
    if (outputsMerged == totalOutputs) {
      sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap
    } else if (outputsMerged == 0) {
      new HashMap[T, BoundedDouble]
    } else {
      val p = outputsMerged.toDouble / totalOutputs
      sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap
    }
  }
}
Example 12
Source File: MasterWebUISuite.scala From drizzle-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import java.io.DataOutputStream import java.net.{HttpURLConnection, URL} import java.nio.charset.StandardCharsets import java.util.Date import scala.collection.mutable.HashMap import org.mockito.Mockito.{mock, times, verify, when} import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.deploy.DeployMessages.{KillDriverResponse, RequestKillDriver} import org.apache.spark.deploy.DeployTestUtils._ import org.apache.spark.deploy.master._ import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv} class MasterWebUISuite extends SparkFunSuite with BeforeAndAfterAll { val conf = new SparkConf val securityMgr = new SecurityManager(conf) val rpcEnv = mock(classOf[RpcEnv]) val master = mock(classOf[Master]) val masterEndpointRef = mock(classOf[RpcEndpointRef]) when(master.securityMgr).thenReturn(securityMgr) when(master.conf).thenReturn(conf) when(master.rpcEnv).thenReturn(rpcEnv) when(master.self).thenReturn(masterEndpointRef) val masterWebUI = new MasterWebUI(master, 0) override def beforeAll() { super.beforeAll() masterWebUI.bind() } override def afterAll() { masterWebUI.stop() super.afterAll() } test("kill application") { val appDesc = createAppDesc() // use new start date so it isn't filtered by UI val activeApp = new ApplicationInfo( new Date().getTime, "app-0", appDesc, new Date(), null, Int.MaxValue) when(master.idToApp).thenReturn(HashMap[String, ApplicationInfo]((activeApp.id, activeApp))) val url = s"http://localhost:${masterWebUI.boundPort}/app/kill/" val body = convPostDataToString(Map(("id", activeApp.id), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify the master was called to remove the active app verify(master, times(1)).removeApplication(activeApp, ApplicationState.KILLED) } test("kill driver") { val activeDriverId = "driver-0" val url = s"http://localhost:${masterWebUI.boundPort}/driver/kill/" val body = convPostDataToString(Map(("id", activeDriverId), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify that master was asked to kill driver with the correct id verify(masterEndpointRef, times(1)).ask[KillDriverResponse](RequestKillDriver(activeDriverId)) } private def convPostDataToString(data: Map[String, String]): String = { (for ((name, value) <- data) yield s"$name=$value").mkString("&") } private def sendHttpRequest( url: String, method: String, body: String = ""): HttpURLConnection = { val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] conn.setRequestMethod(method) if (body.nonEmpty) { conn.setDoOutput(true) conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded") conn.setRequestProperty("Content-Length", Integer.toString(body.length)) val out = new DataOutputStream(conn.getOutputStream) out.write(body.getBytes(StandardCharsets.UTF_8)) out.close() } conn } }
Example 13
Source File: CMaxTableSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.CMaxTable import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CMaxTableSpec extends TorchSpec { "A CMaxTable Module" should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val module = new CMaxTable[Double]() val input1 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](5).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](5).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CMaxTable()\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be(output) luaOutput2 should be (gradInput) println("Test case : CMaxTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 14
Source File: L1HingeEmbeddingCriterionSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.L1HingeEmbeddingCriterion import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class L1HingeEmbeddingCriterionSpec extends TorchSpec { "A L1HingeEmbeddingCriterion" should "generate correct output and grad with y == 1 " in { torchCheck() val seed = 2 RNG.setSeed(seed) val module = new L1HingeEmbeddingCriterion[Double](0.6) val input1 = Tensor[Double](2).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](2).apply1(e => Random.nextDouble()) val input = new Table() input(1.0) = input1 input(2.0) = input2 val target = Tensor[Double](1) target(Array(1)) = 1.0 val start = System.nanoTime() val output = module.forward(input, target) val gradInput = module.backward(input, target) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.L1HingeEmbeddingCriterion(0.6)\n" + "output = module:forward(input, 1)\n" + "gradInput = module:backward(input, 1)\n" val (luaTime, torchResult) = TH.run(code, Map("input" -> input), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Double] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be(output) luaOutput2 should be (gradInput) println("Test case : L1HingeEmbeddingCriterion, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } "A L1HingeEmbeddingCriterion" should "generate correct output and grad with y == -1 " in { torchCheck() val seed = 2 RNG.setSeed(seed) val module = new L1HingeEmbeddingCriterion[Double](0.6) val input1 = Tensor[Double](2).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](2).apply1(e => Random.nextDouble()) val input = new Table() input(1.0) = input1 input(2.0) = input2 val target = Tensor[Double](1) target(Array(1)) = -1.0 val start = System.nanoTime() val output = module.forward(input, target) val gradInput = module.backward(input, target) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.L1HingeEmbeddingCriterion(0.6)\n" + "output = module:forward(input, -1.0)\n" + "gradInput = module:backward(input, -1.0)\n" val (luaTime, torchResult) = TH.run(code, Map("input" -> input), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Double] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be(output) luaOutput2 should be (gradInput) println("Test case : L1HingeEmbeddingCriterion, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 15
Source File: CDivTableSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.CDivTable import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CDivTableSpec extends TorchSpec { "A CDivTable Module" should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val module = new CDivTable[Double]() val input1 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](5).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](5).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CDivTable()\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be (output) luaOutput2 should be (gradInput) println("Test case : CDivTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 16
Source File: CMulTableSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import com.intel.analytics.bigdl.nn.CMulTable import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CMulTableSpec extends TorchSpec { "A CMulTable Module" should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val module = new CMulTable[Double]() val input1 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](5).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](5).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CMulTable()\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be (output) luaOutput2 should be (gradInput) println("Test case : CMinTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 17
Source File: CosineDistanceSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.CosineDistance import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CosineDistanceSpec extends TorchSpec { "A CosineDistance " should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val input1 = Tensor[Double](3).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](3).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](1).apply1(e => Random.nextDouble()) val input = new Table() input(1.0) = input1 input(2.0) = input2 val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CosineDistance()\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)\n" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] val module = new CosineDistance[Double]() val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start output should be(luaOutput1) luaOutput2 should be (gradInput) println("Test case : CosineDistance, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 18
Source File: CosineEmbeddingCriterionSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.CosineEmbeddingCriterion import com.intel.analytics.bigdl.tensor.{Storage, Tensor} import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.{RandomGenerator, Table} import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CosineEmbeddingCriterionSpec extends TorchSpec { "A CosineEmbeddingCriterion Module" should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val module = new CosineEmbeddingCriterion[Double](0.2) val input1 = Tensor[Double](5).apply1(e => RandomGenerator.RNG.uniform(0, 2)) val input2 = Tensor[Double](5).apply1(e => RandomGenerator.RNG.uniform(0, 1)) val input = new Table() input(1.0) = input1 input(2.0) = input2 val target = new Table() val target1 = Tensor[Double](Storage(Array(-0.5))) target(1.toDouble) = target1 val start = System.nanoTime() val output = module.forward(input, target) val gradInput = module.backward(input, target) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CosineEmbeddingCriterion(0.2)\n" + "_idx = module._idx\n" + "_outputs = module._outputs\n" + "buffer = module.buffer\n" + "output = module:forward(input, -0.5)\n" + "gradInput = module:backward(input, -0.5)\n" val (luaTime, torchResult) = TH.run(code, Map("input" -> input), Array("output", "gradInput", "_idx", "buffer", "_outputs")) val luaOutput1 = torchResult("output").asInstanceOf[Double] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be(output) luaOutput2 should be (gradInput) println("Test case : CrossEntropyCriterion, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 19
Source File: CosineDistanceCriterionSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.CosineDistanceCriterion import com.intel.analytics.bigdl.tensor.{Storage, Tensor} import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.{RandomGenerator, Table} import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CosineDistanceCriterionSpec extends TorchSpec { "A CosineDistanceCriterionSpec Module" should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val module = CosineDistanceCriterion[Double](false) val input1 = Tensor[Double](5).apply1(e => RandomGenerator.RNG.uniform(0, 2)) val input2 = Tensor[Double](5).apply1(e => RandomGenerator.RNG.uniform(0, 1)) val input = new Table() input(1.0) = input1 input(2.0) = input2 val target = new Table() val target1 = Tensor[Double](Storage(Array(1.0))) target(1.toDouble) = target1 val start = System.nanoTime() val output = module.forward(input1, input2) val gradInput = module.backward(input1, input2) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CosineEmbeddingCriterion(0.0)\n" + "_idx = module._idx\n" + "_outputs = module._outputs\n" + "buffer = module.buffer\n" + "output = module:forward(input, 1.0)\n" + "gradInput = module:backward(input, 1.0)\n" val (luaTime, torchResult) = TH.run(code, Map("input" -> input), Array("output", "gradInput", "_idx", "buffer", "_outputs")) val luaOutput1 = torchResult("output").asInstanceOf[Double] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be(output) luaOutput2[Tensor[Double]](1) should be (gradInput.squeeze()) println("Test case : CrossEntropyCriterion, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 20
Source File: NarrowTableSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.NarrowTable import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.{T, Table} import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class NarrowTableSpec extends TorchSpec { "A NarrowTable Module " should "generate correct output and grad" in { torchCheck() val module = new NarrowTable[Double](1, 2) val input = T() input(1.0) = Tensor[Double](2, 3).apply1(e => Random.nextDouble()) input(2.0) = Tensor[Double](2, 1).apply1(e => Random.nextDouble()) input(3.0) = Tensor[Double](2, 2).apply1(e => Random.nextDouble()) val gradOutput = T() gradOutput(1.0) = Tensor[Double](5, 3).apply1(e => Random.nextDouble()) gradOutput(2.0) = Tensor[Double](2, 5).apply1(e => Random.nextDouble()) val code = "module = nn.NarrowTable(1, 2)\n" + "local i = 0\n" + "while i < 10 do\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input, gradOutput)\n" + "i = i + 1\n" + "end" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Table] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] val start = System.nanoTime() var i = 0 var output = T() var gradInput = T() while (i < 10) { output = module.forward(input) gradInput = module.backward(input, gradOutput) i += 1 } val end = System.nanoTime() val scalaTime = end - start luaOutput1 should be (output) luaOutput2 should be (gradInput) println("Test case : NarrowTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } "A NarrowTable Module with negative length" should "generate correct output and grad" in { torchCheck() val module = new NarrowTable[Double](2, -2) val input = T() input(1.0) = Tensor[Double](2, 3).apply1(e => Random.nextDouble()) input(2.0) = Tensor[Double](2, 1).apply1(e => Random.nextDouble()) input(3.0) = Tensor[Double](2, 2).apply1(e => Random.nextDouble()) input(4.0) = Tensor[Double](2, 2).apply1(e => Random.nextDouble()) val gradOutput = T() gradOutput(1.0) = Tensor[Double](5, 3).apply1(e => Random.nextDouble()) gradOutput(2.0) = Tensor[Double](2, 5).apply1(e => Random.nextDouble()) val start = System.nanoTime() var i = 0 var output = T() var gradInput = T() output = module.forward(input) gradInput = module.backward(input, gradOutput) i += 1 val end = System.nanoTime() val scalaTime = end - start val gradInput1 = gradInput[Tensor[Double]](2.0) val gradInput2 = gradInput[Tensor[Double]](3.0) val expectedGradInput1 = gradOutput[Tensor[Double]](1.0) val expectedGradInput2 = gradOutput[Tensor[Double]](2.0) val output1 = output[Tensor[Double]](1.0) val output2 = output[Tensor[Double]](2.0) val expectedOutput1 = input[Tensor[Double]](2.0) val expectedOutput2 = input[Tensor[Double]](3.0) output1 should be (expectedOutput1) output2 should be (expectedOutput2) gradInput1 should be (expectedGradInput1) gradInput2 should be (expectedGradInput2) } }
Example 21
Source File: MarginRankingCriterionSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.MarginRankingCriterion import com.intel.analytics.bigdl.tensor.{Storage, Tensor} import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class MarginRankingCriterionSpec extends TorchSpec { "A MarginRankingCriterion " should "generate correct output and grad with only value" in { torchCheck() val mse = new MarginRankingCriterion[Double]() val input1 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val target = new Table() val target1 = Tensor[Double](Storage(Array(-1.0))) target(1.toDouble) = target1 val start = System.nanoTime() val output = mse.forward(input, target) val gradInput = mse.backward(input, target) val end = System.nanoTime() val scalaTime = end - start val code = "mse = nn.MarginRankingCriterion()\n" + "output = mse:forward(input,-1)\n" + "gradInput = mse:backward(input,-1)" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "target" -> target), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Double] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be (output) gradInput should equal (luaOutput2) println("Test case : MarginRankingCriterion, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } "A MarginRankingCriterion " should "generate correct output and grad with Tensor target" in { torchCheck() val mse = new MarginRankingCriterion[Double]() val input1 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val target = new Table() val target1 = Tensor[Double](5).apply1(e => Random.nextDouble()) target(1.toDouble) = target1 val start = System.nanoTime() val output = mse.forward(input, target) val gradInput = mse.backward(input, target) val end = System.nanoTime() val scalaTime = end - start val code = "mse = nn.MarginRankingCriterion()\n" + "output = mse:forward(input, target)\n" + "gradInput = mse:backward(input, target)" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "target" -> target1), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Double] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be (output) gradInput should equal (luaOutput2) println("Test case : MarginRankingCriterion, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 22
Source File: MaskedSelectSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.MaskedSelect import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class MaskedSelectSpec extends TorchSpec { "A MaskedSelect Module " should "generate correct output and grad" in { torchCheck() val module = new MaskedSelect[Double]() val input1 = Tensor[Double](2, 2).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](2, 2) input2(Array(1, 1)) = 1 input2(Array(1, 2)) = 0 input2(Array(2, 1)) = 0 input2(Array(2, 2)) = 1 val input = new Table() input(1.0) = input1 input(2.0) = input2 val gradOutput = Tensor[Double](5).apply1(e => Random.nextDouble()) val code = "module = nn.MaskedSelect()\n" + "mask = torch.ByteTensor({{1, 0}, {0, 1}})\n" + "output = module:forward({input1, mask})\n" + "gradInput = module:backward({input1, mask}, gradOutput)\n" + "gradInput[2] = gradInput[2]:double()" val (luaTime, torchResult) = TH.run(code, Map("input1" -> input1, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start output should be (luaOutput1) gradInput should equal (luaOutput2) println("Test case : MaskedSelect, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 23
Source File: CMinTableSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.CMinTable import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CMinTableSpec extends TorchSpec { "A CMaxTable Module" should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val module = new CMinTable[Double]() val input1 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](5).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](5).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CMinTable()\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)\n" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be (output) luaOutput2 should be (gradInput) println("Test case : CMinTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 24
Source File: MixtureTableSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.MixtureTable import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class MixtureTableSpec extends TorchSpec { "A MixtureTable " should "generate correct output and grad with table expertInput" in { torchCheck() val mse = new MixtureTable[Double] val expertInput = Tensor[Double](5, 3, 6).apply1(e => Random.nextDouble()) val expertTable = new Table() expertTable(1.0) = Tensor[Double](5, 6).apply1(e => Random.nextDouble()) expertTable(2.0) = Tensor[Double](5, 6).apply1(e => Random.nextDouble()) expertTable(3.0) = Tensor[Double](5, 6).apply1(e => Random.nextDouble()) val input1 = Tensor[Double](5, 3).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](5, 6).apply1(e => Random.nextDouble()) val input = new Table() input(1.0) = input1 input(2.0) = expertTable val start = System.nanoTime() val output = mse.forward(input) val gradInput = mse.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start val code = "mse = nn.MixtureTable()\n" + "input = {input1, expertTable}\n" + "output = mse:forward(input)\n" + "gradInput = mse:backward(input,gradOutput)" val (luaTime, torchResult) = TH.run(code, Map("input1" -> input1, "expertTable" -> expertTable, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] output should be (luaOutput1) luaOutput2 should be (gradInput) println("Test case : MixtureTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } "A MixtureTable " should "generate correct output and grad with tensor expertInput" in { torchCheck() val mse = new MixtureTable[Double] val expertInput = Tensor[Double](5, 3, 6).apply1(e => Random.nextDouble()) val input1 = Tensor[Double](5, 3).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](5, 6).apply1(e => Random.nextDouble()) val input = new Table() input(1.0) = input1 input(2.0) = expertInput val code = "mse = nn.MixtureTable()\n" + "output = mse:forward(input)\n" + "gradInput = mse:backward(input,gradOutput)\n" + "size = mse.size\n" + "dim = mse.dim" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput", "size", "dim")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] val start = System.nanoTime() val output = mse.forward(input) val gradInput = mse.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start output should be (luaOutput1) gradInput should be (luaOutput2) println("Test case : MixtureTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 25
Source File: IndexSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.nn.Index import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class IndexSpec extends TorchSpec { "A Index " should "generate correct output and grad with one dimension" in { torchCheck() val seed = 100 RNG.setSeed(seed) val input1 = Tensor[Double](3).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](4) input2(Array(1)) = 1 input2(Array(2)) = 2 input2(Array(3)) = 2 input2(Array(4)) = 3 val gradOutput = Tensor[Double](4).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val code = "torch.manualSeed(" + seed + ")\n" + "input = {input1, torch.LongTensor{1, 2, 2, 3}}\n" + "module = nn.Index(1)\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)\n" val (luaTime, torchResult) = TH.run(code, Map("input1" -> input1, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] val module = new Index[Double](1) val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start output should be(luaOutput1) luaOutput2 should be (gradInput) println("Test case : Index, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } "A Index " should "generate correct output and grad with two dimension" in { torchCheck() val seed = 100 RNG.setSeed(seed) val input1 = Tensor[Double](3, 3).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](4) input2(Array(1)) = 1 input2(Array(2)) = 2 input2(Array(3)) = 3 input2(Array(4)) = 1 val gradOutput = Tensor[Double](3, 4).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val code = "torch.manualSeed(" + seed + ")\n" + "input = {input1, torch.LongTensor{1, 2, 3, 1}}\n" + "module = nn.Index(2)\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)\n" val (luaTime, torchResult) = TH.run(code, Map("input1" -> input1, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] val module = new Index[Double](2) val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start output should be(luaOutput1) luaOutput2 should be (gradInput) println("Test case : Index, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 26
Source File: CSubTableSpec.scala From BigDL with Apache License 2.0 | 5 votes |
package com.intel.analytics.bigdl.integration.torch import com.intel.analytics.bigdl.tensor.Tensor import com.intel.analytics.bigdl.utils.RandomGenerator._ import com.intel.analytics.bigdl.utils.Table import com.intel.analytics.bigdl.nn.CSubTable import scala.collection.mutable.HashMap import scala.util.Random @com.intel.analytics.bigdl.tags.Serial class CSubTableSpec extends TorchSpec { "A CDivTable Module" should "generate correct output and grad" in { torchCheck() val seed = 100 RNG.setSeed(seed) val module = new CSubTable[Double]() val input1 = Tensor[Double](5).apply1(e => Random.nextDouble()) val input2 = Tensor[Double](5).apply1(e => Random.nextDouble()) val gradOutput = Tensor[Double](5).apply1(e => Random.nextDouble()) val input = new Table() input(1.toDouble) = input1 input(2.toDouble) = input2 val start = System.nanoTime() val output = module.forward(input) val gradInput = module.backward(input, gradOutput) val end = System.nanoTime() val scalaTime = end - start val code = "torch.manualSeed(" + seed + ")\n" + "module = nn.CSubTable()\n" + "output = module:forward(input)\n" + "gradInput = module:backward(input,gradOutput)" val (luaTime, torchResult) = TH.run(code, Map("input" -> input, "gradOutput" -> gradOutput), Array("output", "gradInput")) val luaOutput1 = torchResult("output").asInstanceOf[Tensor[Double]] val luaOutput2 = torchResult("gradInput").asInstanceOf[Table] luaOutput1 should be(output) luaOutput2 should be (gradInput) println("Test case : CSubTable, Torch : " + luaTime + " s, Scala : " + scalaTime / 1e9 + " s") } }
Example 27
Source File: KeyBinder.scala From slide-desktop with GNU General Public License v2.0 | 5 votes |
package gui

import java.awt.event.{KeyEvent, KeyListener}

import scala.collection.mutable.HashMap

abstract class KeyBinder(val keyCodes: Int*) extends KeyListener {
  private val keyMap: HashMap[Int, Boolean] = new HashMap[Int, Boolean]

  override def keyTyped(e: KeyEvent): Unit = {}

  override def keyPressed(e: KeyEvent): Unit = {
    keyMap.put(e.getKeyCode, true)
    if (getKeysDown)
      onKeysDown()
  }

  override def keyReleased(e: KeyEvent): Unit = keyMap.remove(e.getKeyCode)

  private def getKeysDown: Boolean = {
    this.keyCodes.foreach(key =>
      if (keyMap.contains(key)) {
        if (!keyMap.get(key).get) return false
      } else return false
    )
    keyMap.clear()
    true
  }

  def onKeysDown(): Unit
}
Example 28
Source File: TopElementsAggregator.scala From salt-core with Apache License 2.0 | 5 votes |
package software.uncharted.salt.core.analytic.collection

import software.uncharted.salt.core.analytic.Aggregator

import scala.collection.Map
import scala.collection.mutable.HashMap
import scala.collection.mutable.ListBuffer
import scala.collection.mutable.{Map => MutableMap}
import scala.collection.mutable.PriorityQueue
import scala.reflect.ClassTag

class TopElementsAggregator[ET: ClassTag](elementLimit: Int)
  extends Aggregator[Seq[ET], Map[ET, Int], List[(ET, Int)]] {

  def default(): Map[ET, Int] = {
    Map[ET, Int]()
  }

  override def add(current: Map[ET, Int], next: Option[Seq[ET]]): Map[ET, Int] = {
    if (next.isDefined) {
      // If our current map is mutable, add new data in directly.
      // If not, convert to a mutable map, and then add data in
      val sum = current match {
        case hm: MutableMap[ET, Int] => hm
        case _ => {
          // The current value isn't itself a mutable hashmap yet; convert to one.
          val hm = new HashMap[ET, Int]()
          hm ++= current
          hm
        }
      }
      next.get.foreach(t => sum.put(t, sum.getOrElse(t, 0) + 1))
      sum
    } else {
      current
    }
  }

  override def merge(left: Map[ET, Int], right: Map[ET, Int]): Map[ET, Int] = {
    // If either input map is mutable, merge the other into it.
    // If neither is, convert one to mutable, and add the other into it.
    val (to, from) = left match {
      case hm: MutableMap[ET, Int] => (hm, right)
      case _ => right match {
        case hm: MutableMap[ET, Int] => (hm, left)
        case _ =>
          val hm = new HashMap[ET, Int]()
          hm ++= left
          (hm, right)
      }
    }
    from.foreach(t => {
      to.put(t._1, to.getOrElse(t._1, 0) + t._2)
    })
    to
  }

  override def finish(intermediate: Map[ET, Int]): List[(ET, Int)] = {
    val x = new PriorityQueue[(ET, Int)]()(Ordering.by(a => a._2))
    intermediate.foreach(t => {
      x.enqueue(t)
    })
    var result = new ListBuffer[(ET, Int)]
    for (i <- 0 until Math.min(elementLimit, x.size)) {
      result.append(x.dequeue)
    }
    result.toList
  }
}
Example 29
Source File: ConfManager.scala From HadoopLearning with MIT License | 5 votes |
package com.utils

import java.util.regex.Pattern

import org.apache.kafka.clients.consumer.ConsumerConfig
import org.apache.kafka.common.serialization.StringDeserializer

import scala.collection.mutable.HashMap

/**
 * Spark Streaming configuration helper.
 *
 * @author liumm
 * @since 2018-07-27 20:27
 */
object ConfManager {

  /**
   * Maximum number of records written per batch.
   */
  val maxRecords = 1000

  /**
   * Build the Kafka configuration.
   *
   * @param streamConf
   * @return
   */
  def kafkaParam(streamConf: StreamConf): (Map[String, Object], Pattern) = {
    (getConsumerConfig(streamConf.brokers, streamConf.groupId), Pattern.compile(streamConf.topics))
  }

  def kafkaParamForMetadata(streamConf: StreamConf): Map[String, String] = {
    val kafkaParams = new HashMap[String, String]()
    kafkaParams += (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> streamConf.brokers)
    kafkaParams += ("metadata.broker.list" -> streamConf.brokers)
    kafkaParams += (ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "smallest")
    kafkaParams += (ConsumerConfig.GROUP_ID_CONFIG -> streamConf.groupId)
    kafkaParams.toMap
  }

  /**
   * Generate the Kafka consumer configuration.
   *
   * @return the Kafka consumer configuration map
   */
  private def getConsumerConfig(brokers: String, groupId: String): Map[String, Object] = {
    val kafkaParams = new HashMap[String, Object]()
    kafkaParams += (ConsumerConfig.BOOTSTRAP_SERVERS_CONFIG -> brokers)
    kafkaParams += (ConsumerConfig.GROUP_ID_CONFIG -> groupId)
    kafkaParams += (ConsumerConfig.KEY_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])
    kafkaParams += (ConsumerConfig.VALUE_DESERIALIZER_CLASS_CONFIG -> classOf[StringDeserializer])
    kafkaParams += (ConsumerConfig.MAX_PARTITION_FETCH_BYTES_CONFIG -> new Integer(3 * 1024 * 1024))
    kafkaParams += (ConsumerConfig.MAX_POLL_RECORDS_CONFIG -> new Integer(100))
    kafkaParams += (ConsumerConfig.AUTO_OFFSET_RESET_CONFIG -> "latest")
    // Disable Kafka's automatic offset commits.
    kafkaParams += (ConsumerConfig.ENABLE_AUTO_COMMIT_CONFIG -> (false: java.lang.Boolean))
    kafkaParams.toMap
  }

  def newStreamConf() = {
    val conf = new StreamConf()
    conf.zkUrl = "hdp01:2181"
    conf.brokers = "hdp01:9092"
    conf.groupId = "liumm_group"
    conf.topics = "i57_.*"
    conf
  }
}
Example 30
Source File: GlobalPerformer.scala From incubator-retired-iota with Apache License 2.0 | 5 votes |
package org.apache.iota.fey

import akka.actor.SupervisorStrategy.Restart
import akka.actor.{Actor, ActorLogging, ActorRef, OneForOneStrategy, Props, Terminated}
import akka.routing._
import play.api.libs.json.JsObject

import scala.collection.mutable.HashMap
import scala.concurrent.duration._

protected class GlobalPerformer(val orchestrationID: String,
                                val orchestrationName: String,
                                val globalPerformers: List[JsObject],
                                val ensemblesSpec: List[JsObject]) extends Actor with ActorLogging {

  val monitoring_actor = FEY_MONITOR.actorRef
  var global_metadata: Map[String, Performer] = Map.empty[String, Performer]

  override def receive: Receive = {

    case GlobalPerformer.PRINT_GLOBAL =>
      context.actorSelection(s"*") ! FeyGenericActor.PRINT_PATH

    case Terminated(actor) =>
      monitoring_actor ! Monitor.TERMINATE(actor.path.toString, Utils.getTimestamp)
      log.error(s"DEAD Global Performers ${actor.path.name}")
      context.children.foreach { child =>
        context.unwatch(child)
        context.stop(child)
      }
      throw new RestartGlobalPerformers(s"DEAD Global Performer ${actor.path.name}")

    case GetRoutees => //Discard

    case x => log.warning(s"Message $x not treated by Global Performers")
  }

  private def loadClazzFromJar(classPath: String, jarLocation: String, jarName: String): Class[FeyGenericActor] = {
    try {
      Utils.loadActorClassFromJar(jarLocation, classPath, jarName)
    } catch {
      case e: Exception =>
        log.error(e, s"Could not load class $classPath from jar $jarLocation. Please, check the Jar repository path as well the jar name")
        throw e
    }
  }
}

object GlobalPerformer {

  val activeGlobalPerformers: HashMap[String, Map[String, ActorRef]] =
    HashMap.empty[String, Map[String, ActorRef]]

  case object PRINT_GLOBAL
}
Example 31
Source File: TokenAuthorizingInterceptor.scala From meteorite-core with Apache License 2.0 | 5 votes |
package bi.meteorite.core.security.authorization

import java.lang.reflect.Method
import java.util

import org.apache.cxf.security.SecurityContext

import scala.collection.JavaConversions._
import scala.collection.JavaConverters._
import scala.collection.mutable.HashMap
import scala.collection.mutable.ListBuffer

import TokenAuthorizingInterceptor._

object TokenAuthorizingInterceptor {

  private def parseRolesMap(rolesMap: Map[String, String]): scala.collection.mutable.HashMap[String, List[String]] = {
    val map = new scala.collection.mutable.HashMap[String, List[String]]()
    for ((key, value) <- rolesMap) {
      map.put(key, value.split(" ").toList)
    }
    map
  }
}

class TokenAuthorizingInterceptor(uniqueId: Boolean) extends TokenAbstractAutorizingInInterceptor(uniqueId) {

  private val methodRolesMap = new HashMap[String, List[String]]()

  private var userRolesMap = new scala.collection.mutable.HashMap[String, List[String]]

  private var globalRoles = new scala.collection.mutable.ListBuffer[String]

  private var checkConfiguredRolesOnly: Boolean = _

  def this() {
    this(true)
  }

  protected override def isUserInRole(sc: SecurityContext, roles: util.List[String], deny: Boolean): Boolean = {
    if (!checkConfiguredRolesOnly && !super.isUserInRole(sc, roles, deny)) {
      return false
    }
    if (userRolesMap.nonEmpty) {
      // The principal must hold at least one of the expected roles
      val userRoles = userRolesMap.get(sc.getUserPrincipal.getName)
      if (userRoles.isEmpty) {
        return false
      }
      for (role <- roles if userRoles.get.contains(role)) {
        return true
      }
      false
    } else {
      !checkConfiguredRolesOnly
    }
  }

  private def createMethodSig(method: Method): String = {
    val b = new StringBuilder(method.getReturnType.getName)
    b.append(' ').append(method.getName).append('(')
    for (cls <- method.getParameterTypes) {
      b.append(cls.getName)
    }
    b.append(')')
    b.toString
  }

  protected override def getExpectedRoles(method: Method): util.List[String] = {
    var roles = methodRolesMap.get(createMethodSig(method))
    if (roles.isEmpty) {
      roles = methodRolesMap.get(method.getName)
    }
    if (roles.isEmpty) {
      globalRoles.toList
    } else {
      roles.get
    }
  }

  def setMethodRolesMap(rolesMap: java.util.Map[String, String]) =
    methodRolesMap.putAll(parseRolesMap(rolesMap.asScala.toMap))

  def setUserRolesMap(rolesMap: java.util.Map[String, String]) =
    userRolesMap = parseRolesMap(rolesMap.asScala.toMap)

  def setGlobalRoles(roles: String) =
    globalRoles = roles.split(" ").to[ListBuffer]

  def setCheckConfiguredRolesOnly(checkConfiguredRolesOnly: Boolean) =
    this.checkConfiguredRolesOnly = checkConfiguredRolesOnly
}
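A hedged wiring sketch for the interceptor using only the setters defined above; the method and role names are illustrative, and TokenAbstractAutorizingInInterceptor is assumed to be available via meteorite-core:

import java.util.{HashMap => JHashMap}
import bi.meteorite.core.security.authorization.TokenAuthorizingInterceptor

object InterceptorWiring {
  def build(): TokenAuthorizingInterceptor = {
    val interceptor = new TokenAuthorizingInterceptor()

    // Method name (or full signature) -> space-separated role list
    val methodRoles = new JHashMap[String, String]()
    methodRoles.put("getUsers", "ROLE_ADMIN ROLE_VIEWER")
    methodRoles.put("deleteUser", "ROLE_ADMIN")
    interceptor.setMethodRolesMap(methodRoles)

    // Fallback roles for methods with no explicit entry
    interceptor.setGlobalRoles("ROLE_ADMIN")
    interceptor.setCheckConfiguredRolesOnly(true)
    interceptor
  }
}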
Example 32
Source File: KernelMatrix.scala From keystone with Apache License 2.0 | 5 votes |
package keystoneml.nodes.learning import scala.collection.mutable.HashMap import scala.reflect.ClassTag import breeze.linalg._ import org.apache.spark.rdd.RDD import keystoneml.utils.{MatrixUtils, Stats} import keystoneml.workflow.{Transformer, LabelEstimator} class BlockKernelMatrix[T: ClassTag]( val kernelGen: KernelTransformer[T], val data: RDD[T], val cacheKernel: Boolean) extends KernelMatrix { val colBlockCache = HashMap.empty[Seq[Int], RDD[DenseMatrix[Double]]] val diagBlockCache = HashMap.empty[Seq[Int], DenseMatrix[Double]] def apply(colIdxs: Seq[Int]): RDD[DenseMatrix[Double]] = { if (colBlockCache.contains(colIdxs)) { colBlockCache(colIdxs) } else { val (kBlock, diagBlock) = kernelGen.computeKernel(data, colIdxs) if (cacheKernel) { colBlockCache += (colIdxs -> kBlock) diagBlockCache += (colIdxs -> diagBlock) } kBlock } } def unpersist(colIdxs: Seq[Int]): Unit = { if (colBlockCache.contains(colIdxs) && !cacheKernel) { colBlockCache(colIdxs).unpersist(true) } } def diagBlock(idxs: Seq[Int]): DenseMatrix[Double] = { if (!diagBlockCache.contains(idxs)) { val (kBlock, diagBlock) = kernelGen.computeKernel(data, idxs) if (cacheKernel) { colBlockCache += (idxs -> kBlock) diagBlockCache += (idxs -> diagBlock) } diagBlock } else { diagBlockCache(idxs) } } }
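The apply and diagBlock methods above implement a compute-or-cache pattern keyed by the requested column indices. A minimal standalone sketch of the same idiom, with the RDD-backed kernel computation replaced by a plain placeholder function; BlockCache and compute are illustrative names, not KeystoneML API:

import scala.collection.mutable.HashMap

// Cache-or-compute helper in the spirit of BlockKernelMatrix, without the Spark dependency
class BlockCache[B](compute: Seq[Int] => B) {
  private val cache = HashMap.empty[Seq[Int], B]

  def apply(colIdxs: Seq[Int], cacheBlock: Boolean): B =
    if (cacheBlock) cache.getOrElseUpdate(colIdxs, compute(colIdxs))
    else cache.getOrElse(colIdxs, compute(colIdxs))

  def evict(colIdxs: Seq[Int]): Unit = cache.remove(colIdxs)
}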
Example 33
Source File: ParameterTest.scala From maha with Apache License 2.0 | 5 votes |
// Copyright 2017, Yahoo Holdings Inc. // Licensed under the terms of the Apache License 2.0. Please see LICENSE file in project root for terms. package com.yahoo.maha.core.request import com.yahoo.maha.core.request.ReportFormatType.CSVFormat import com.yahoo.maha.core.{Engine, HiveEngine} import org.json4s._ import org.json4s.jackson.JsonMethods import org.scalatest.{FunSuite, Matchers} import scala.collection.mutable.HashMap class ParameterTest extends FunSuite with Matchers { test("SerializeParameters should serialize a map of parameters into a List") { val map_parameters = new HashMap[Parameter, ParameterValue[_]] map_parameters.put(Parameter.ReportFormat, ReportFormatValue(ReportFormatType.CSVFormat)) map_parameters.put(Parameter.DryRun, DryRunValue(false)) map_parameters.put(Parameter.GeneratedQuery, GeneratedQueryValue("GeneratedQuery")) map_parameters.put(Parameter.QueryEngine, QueryEngineValue(Engine.from("hive").get)) map_parameters.put(Parameter.Debug, DebugValue(true)) map_parameters.put(Parameter.RequestId, RequestIdValue("RequestId")) map_parameters.put(Parameter.UserId, UserIdValue("UserId")) map_parameters.put(Parameter.TimeZone, TimeZoneValue("TimeZone")) map_parameters.put(Parameter.Schema, SchemaValue("Schema")) map_parameters.put(Parameter.Distinct, DistinctValue(true)) map_parameters.put(Parameter.JobName, JobNameValue("tools_1")) map_parameters.put(Parameter.RegistryName, RegistryNameValue("mahaRegistry")) map_parameters.put(Parameter.HostName, HostNameValue("127.0.0.1")) val result = Parameter.serializeParameters(map_parameters.toMap) result.length shouldBe map_parameters.size val newMap = result.map(t=> t._1 -> t._2).toMap for((k,v) <- map_parameters) { newMap.get(k.entryName).get match{ case JString(x) => v.value match { case CSVFormat => x shouldBe "csv" case HiveEngine => x shouldBe "Hive" case _ => x shouldBe v.value } case JBool(x) => x shouldBe v.value case _ => fail } } } test("DeserializeParameters should deserialize a JSON into a Map of parameter values") { val inputJson= """ |{ | "Report-Format": "csv", | "Dry-Run": false, | "Generated-Query": "Generated-Query", | "Query-Engine": "oracle", | "debug": true, | "Request-Id": "Request-Id", | "User-Id": "User-Id", | "TimeZone": "TimeZone", | "Schema": "Schema", | "Distinct": true, | "Job-Name": "Job-Name", | "RegistryName": "mahaRegistry", | "HostName": "127.0.0.1" |} |""".stripMargin val result = Parameter.deserializeParameters(JsonMethods.parse(inputJson)) result.getOrElse() match{ case m: Map[Parameter, ParameterValue[_]] => { m.size shouldBe 13 m.get(Parameter.ReportFormat).get shouldBe ReportFormatValue(ReportFormatType.CSVFormat) m.get(Parameter.DryRun).get shouldBe DryRunValue(false) m.get(Parameter.GeneratedQuery).get shouldBe GeneratedQueryValue("Generated-Query") m.get(Parameter.QueryEngine).get shouldBe QueryEngineValue(Engine.from("oracle").get) m.get(Parameter.Debug).get shouldBe DebugValue(true) m.get(Parameter.RequestId).get shouldBe RequestIdValue("Request-Id") m.get(Parameter.UserId).get shouldBe UserIdValue("User-Id") m.get(Parameter.TimeZone).get shouldBe TimeZoneValue("TimeZone") m.get(Parameter.Schema).get shouldBe SchemaValue("Schema") m.get(Parameter.Distinct).get shouldBe DistinctValue(true) m.get(Parameter.JobName).get shouldBe JobNameValue("Job-Name") m.get(Parameter.RegistryName).get shouldBe RegistryNameValue("mahaRegistry") m.get(Parameter.HostName).get shouldBe HostNameValue("127.0.0.1") } case _ => fail } } }
Example 34
Source File: depgraph.scala From sbt-blockade with Apache License 2.0 | 5 votes |
//: ---------------------------------------------------------------------------- //: Copyright 2015 Johannes Rudolph //: //: Distributed under the Apache 2.0 License, please see the NOTICE //: file in the root of the project for further details. //: ---------------------------------------------------------------------------- package verizon.build object depgraph { import java.io.File import sbt._ import scala.collection.mutable.{HashMap, MultiMap, Set} import scala.language.reflectiveCalls object SbtUpdateReport { type OrganizationArtifactReport = { def modules: Seq[ModuleReport] } def fromConfigurationReport(report: ConfigurationReport, rootInfo: sbt.ModuleID): ModuleGraph = { implicit def id(sbtId: sbt.ModuleID): ModuleId = ModuleId(sbtId.organization, sbtId.name, sbtId.revision) def moduleEdges(orgArt: OrganizationArtifactReport): Seq[(Module, Seq[Edge])] = { val chosenVersion = orgArt.modules.find(!_.evicted).map(_.module.revision) orgArt.modules.map(moduleEdge(chosenVersion)) } def moduleEdge(chosenVersion: Option[String])(report: ModuleReport): (Module, Seq[Edge]) = { val evictedByVersion = if (report.evicted) chosenVersion else None val jarFile = report.artifacts.find(_._1.`type` == "jar").orElse(report.artifacts.find(_._1.extension == "jar")).map(_._2) (Module( id = report.module, license = report.licenses.headOption.map(_._1), evictedByVersion = evictedByVersion, jarFile = jarFile, error = report.problem ), report.callers.map(caller ⇒ Edge(caller.caller, report.module))) } val (nodes, edges) = report.details.flatMap(moduleEdges).unzip val root = Module(rootInfo) ModuleGraph(root +: nodes, edges.flatten) } } type Edge = (ModuleId, ModuleId) def Edge(from: ModuleId, to: ModuleId): Edge = from -> to case class ModuleId(organisation: String, name: String, version: String) { def idString: String = organisation + ":" + name + ":" + version } case class Module(id: ModuleId, license: Option[String] = None, extraInfo: String = "", evictedByVersion: Option[String] = None, jarFile: Option[File] = None, error: Option[String] = None) { def hadError: Boolean = error.isDefined def isUsed: Boolean = !isEvicted def isEvicted: Boolean = evictedByVersion.isDefined } case class ModuleGraph(nodes: Seq[Module], edges: Seq[Edge]) { lazy val modules: Map[ModuleId, Module] = nodes.map(n ⇒ (n.id, n)).toMap def module(id: ModuleId): Module = modules(id) lazy val dependencyMap: Map[ModuleId, Seq[Module]] = createMap(identity) lazy val reverseDependencyMap: Map[ModuleId, Seq[Module]] = createMap { case (a, b) ⇒ (b, a) } def createMap(bindingFor: ((ModuleId, ModuleId)) ⇒ (ModuleId, ModuleId)): Map[ModuleId, Seq[Module]] = { val m = new HashMap[ModuleId, Set[Module]] with MultiMap[ModuleId, Module] edges.foreach { entry ⇒ val (f, t) = bindingFor(entry) m.addBinding(f, module(t)) } m.toMap.mapValues(_.toSeq.sortBy(_.id.idString)).withDefaultValue(Nil) } def roots: Seq[Module] = nodes.filter(n ⇒ !edges.exists(_._2 == n.id)).sortBy(_.id.idString) def isEmpty: Boolean = nodes.isEmpty } }
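createMap builds its dependency maps by mixing MultiMap into a mutable HashMap of Sets, so addBinding accumulates several values per key. A self-contained sketch of that idiom; the module strings are made up:

import scala.collection.mutable.{HashMap, MultiMap, Set}

object MultiMapSketch extends App {
  // One key, many values: addBinding grows a Set per key
  val callers = new HashMap[String, Set[String]] with MultiMap[String, String]
  callers.addBinding("org:core:1.0", "org:app:2.1")
  callers.addBinding("org:core:1.0", "org:tools:0.9")
  callers.addBinding("org:util:3.2", "org:app:2.1")

  // e.g. Map(org:core:1.0 -> List(org:app:2.1, org:tools:0.9), org:util:3.2 -> List(org:app:2.1))
  println(callers.mapValues(_.toSeq.sorted).toMap)
}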
Example 35
Source File: Checksum.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.dsl.transformations import java.security.MessageDigest import org.apache.hadoop.fs.{FileStatus, Path} import org.schedoscope.Schedoscope import org.schedoscope.scheduler.driver.FilesystemDriver._ import scala.Array.canBuildFrom import scala.collection.mutable.HashMap object Checksum { private def md5 = MessageDigest.getInstance("MD5") private def listFiles(path: String): Array[FileStatus] = { val files = fileSystem(path, Schedoscope.settings.hadoopConf).globStatus(new Path(path)) if (files != null) files else Array() } private def fileChecksum(path: String) = if (path == null) "null-checksum" else if (path.endsWith(".jar")) path else try { val cs = fileSystem(path, Schedoscope.settings.hadoopConf).getFileChecksum(new Path(path)) if (cs == null) path else cs.toString() } catch { case _: Throwable => path } def fileChecksums(paths: List[String], recursive: Boolean): List[String] = paths.flatMap(path => { if (fileSystem(path, Schedoscope.settings.hadoopConf).isFile(new Path(path))) List(fileChecksum(path)) else if (recursive) fileChecksums(listFiles(path + "/*").map(f => f.getPath.toString()).toList, recursive) else List() }).sorted val resourceHashCache = new HashMap[List[String], List[String]]() def resourceHashes(resources: List[String]): List[String] = synchronized { resourceHashCache.getOrElseUpdate(resources, fileChecksums(resources, true)) } val defaultDigest = "0" def digest(stringsToDigest: String*): String = if (stringsToDigest.isEmpty) defaultDigest else md5.digest(stringsToDigest.sorted.mkString.toCharArray().map(_.toByte)).map("%02X" format _).mkString object SchemaChecksum { val checksumProperty = "schema.checksum" } object TransformationChecksum { val checksumProperty = "transformation.checksum" val timestampProperty = "transformation.timestamp" } }
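resourceHashes memoizes checksum computations per resource list with getOrElseUpdate inside a synchronized block, so each distinct list is hashed once. A generic sketch of the same idiom; expensive is a stand-in for a call such as fileChecksums:

import scala.collection.mutable.HashMap

object MemoSketch {
  private val cache = new HashMap[List[String], List[String]]()

  // Stand-in for an expensive, deterministic computation keyed by its input list
  private def expensive(paths: List[String]): List[String] = paths.map(p => s"checksum-of-$p")

  def cached(paths: List[String]): List[String] = synchronized {
    cache.getOrElseUpdate(paths, expensive(paths))
  }
}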
Example 36
Source File: BackOffSupervision.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.scheduler.utils import akka.actor.{ActorRef, ActorSystem} import org.slf4j.LoggerFactory import scala.collection.mutable.HashMap import scala.concurrent.duration.{FiniteDuration, _} def manageActorLifecycle(managedActor: ActorRef, backOffSlotTime: FiniteDuration = null, backOffMinimumDelay: FiniteDuration = null): FiniteDuration = { val managedActorName = managedActor.path.toStringWithoutAddress if (actorBackOffWaitTime.contains(managedActorName)) { val newBackOff = actorBackOffWaitTime(managedActorName).nextBackOff actorBackOffWaitTime.put(managedActorName, newBackOff) log.warn(s"$managerName: Set new back-off waiting " + s"time to value ${newBackOff.backOffWaitTime} for rebooted actor ${managedActorName}; " + s"(retries=${newBackOff.retries}, resets=${newBackOff.resets}, total-retries=${newBackOff.totalRetries})") //schedule tick response based on backoff newBackOff.backOffWaitTime } else { val backOff = ExponentialBackOff(backOffSlotTime = backOffSlotTime, constantDelay = backOffMinimumDelay) log.debug(s"$managerName: Set initial back-off waiting " + s"time to value ${backOff.backOffWaitTime} for booted actor ${managedActorName}; " + s"(retries=${backOff.retries}, resets=${backOff.resets}, total-retries=${backOff.totalRetries})") actorBackOffWaitTime.put(managedActorName, backOff) //schedule immediate tick response 0 millis } } }
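manageActorLifecycle keeps one back-off record per actor path and grows the delay on every restart, returning zero delay on first boot. A simplified, self-contained sketch of that bookkeeping; BackOff here is a toy stand-in for the project's ExponentialBackOff:

import scala.collection.mutable.HashMap
import scala.concurrent.duration._

object BackOffSketch {
  // Toy back-off state: doubles the wait on every restart
  final case class BackOff(waitTime: FiniteDuration, retries: Int) {
    def next: BackOff = BackOff(waitTime * 2, retries + 1)
  }

  private val waitTimes = new HashMap[String, BackOff]()

  def nextDelay(actorPath: String, slot: FiniteDuration): FiniteDuration =
    waitTimes.get(actorPath) match {
      case Some(state) =>
        val bumped = state.next
        waitTimes.put(actorPath, bumped)
        bumped.waitTime          // restarted actor: wait longer each time
      case None =>
        waitTimes.put(actorPath, BackOff(slot, 0))
        0.millis                 // first boot: respond immediately
    }
}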
Example 37
Source File: Database.scala From schedoscope with Apache License 2.0 | 5 votes |
package org.schedoscope.test import java.sql.{Connection, ResultSet, Statement} import org.schedoscope.dsl.{FieldLike, View} import org.schedoscope.schema.ddl.HiveQl import scala.collection.mutable.{HashMap, ListBuffer} class Database(conn: Connection, url: String) { def selectForViewByQuery(v: View, query: String, orderByField: Option[FieldLike[_]]): List[Map[String, Any]] = { val res = ListBuffer[Map[String, Any]]() var statement: Statement = null var rs: ResultSet = null try { statement = conn.createStatement() rs = statement.executeQuery(query) while (rs.next()) { val row = HashMap[String, Any]() v.fields.view.zipWithIndex.foreach(f => { row.put(f._1.n, ViewSerDe.deserializeField(f._1.t, rs.getString(f._2 + 1))) }) res.append(row.toMap) } } finally { if (rs != null) try { rs.close() } catch { case _: Throwable => } if (statement != null) try { statement.close() } catch { case _: Throwable => } } orderByField match { case Some(f) => res.sortBy { _ (f.n) match { case null => "" case other => other.toString } } toList case None => res.toList } } def selectView(v: View, orderByField: Option[FieldLike[_]]): List[Map[String, Any]] = selectForViewByQuery(v, HiveQl.selectAll(v), orderByField) }
Example 38
Source File: AETest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.unsupervised.dl.autoencoder import org.scalaml.{Logging, Resource} import org.scalaml.Predef.DblVec import org.scalaml.trading.GoogleFinancials.close import org.scalaml.workflow.data.DataSource import org.scalatest.{FlatSpec, Matchers} final class AETest extends FlatSpec with Matchers with Logging with Resource { protected val name: String = "Auto-Encoder" it should s"$name single hidden layer" in { show( "Single hidden layer") val REL_PATH = "unsupervised/ae/" val ALPHA = 0.8 val ETA = 0.05 val NUM_EPOCHS = 2500 val EPS = 1e-6 val THRESHOLD = 0.25 val LAMBDA = 0.18 val BETA = 0.3 val symbols = Array[String]( "FXE", "FXA", "SPY", "GLD", "FXB", "FXF", "FXC", "FXY", "CYB" ) val STUDIES = List[Array[String]]( Array[String]("FXY", "FXC", "GLD", "FXA"), Array[String]("FXE", "FXF", "FXB", "CYB"), Array[String]("FXE", "FXC", "GLD", "FXA", "FXY", "FXB"), Array[String]("FXC", "FXY", "FXA"), Array[String]("CYB", "GLD", "FXY"), symbols ) def index: Map[String, Int] = { import scala.collection.mutable.HashMap symbols.zipWithIndex./:(HashMap[String, Int]())((mp, si) => mp += ((si._1, si._2))).toMap } val path: String = getPath(REL_PATH).getOrElse(".") val prices = symbols.map(s => DataSource(s"$path$s.csv", true, true, 1)) .map( _.flatMap(_.get(close))).filter(_.isSuccess).map(_.get) val config = AEConfig(ALPHA, ETA, LAMBDA, BETA, NUM_EPOCHS, EPS) val obs = symbols.flatMap( index.get(_)).map(prices(_).toArray) val xv = obs.tail.transpose.dropRight(1) val ae = AE(config, 8, xv.toVector) ae.model match { case Some(aeModel) => if(aeModel.synapses.nonEmpty) { val inputSynapse = aeModel.synapses.head show(s"$name output synapse(0)(0) ${inputSynapse(0)(0)}") show(s"$name output synapse(0)(1) ${inputSynapse(0)(1)}") show(s"$name output synapse(1)(0) ${inputSynapse(1)(0)}") show(s"$name output synapse(1)(1) ${inputSynapse(1)(1)}") } else fail(s"$name Model weights with improper size") case None => fail(s"$name could not generate a model") } } } // --------------------------------- EOF ----------------------------------------------------------------------------
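The index helper above folds the symbol array into a mutable HashMap and then freezes it; zipWithIndex.toMap produces the same mapping directly. A small sketch contrasting the two, with the symbol list trimmed for brevity:

import scala.collection.mutable.HashMap

object IndexSketch extends App {
  val symbols = Array("FXE", "FXA", "SPY")

  // Fold into a mutable HashMap, then freeze it, as in the test above
  val viaFold: Map[String, Int] =
    symbols.zipWithIndex./:(HashMap[String, Int]())((mp, si) => mp += ((si._1, si._2))).toMap

  // Equivalent one-liner on the immutable side
  val direct: Map[String, Int] = symbols.zipWithIndex.toMap

  assert(viaFold == direct)
}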
Example 39
Source File: ParallelismTest.scala From Scala-for-Machine-Learning-Second-Edition with MIT License | 5 votes |
package org.scalaml.scalability.scala import org.scalaml.Logging import org.scalatest.{FlatSpec, Matchers} final class ParallelismTest extends FlatSpec with Matchers with Logging { import scala.collection.mutable.HashMap import scala.collection.parallel.mutable.{ParArray, ParHashMap} import scala.util.Random protected[this] val name: String = "Scala parallel collections" final private val SZ = 100000 final private val NUM_TASKS = 8 final private val evalRange = Range(1, NUM_TASKS) final private val TIMES = 20 // Arbitrary map function final val mapF = (x: Double) => Math.sin(x * 0.01) + Math.exp(-x) // Arbitrary filter function final val filterF = (x: Double) => x > 0.8 // Arbitrary reduce function final val reduceF = (x: Double, y: Double) => (x + y) * x it should s"$name: arrays" in { show(s"Evaluation of arrays") // Generate random vector for both the non-parallel and parallel array val data = Array.fill(SZ)(Random.nextDouble) val pData = ParArray.fill(SZ)(Random.nextDouble) // Initialized and execute the benchmark for the parallel array val benchmark = new ParallelArray[Double](data, pData, TIMES) val ratios = new Array[Double](NUM_TASKS) evalRange.foreach(n => ratios.update(n, benchmark.map(mapF)(n))) val resultMap = ratios.tail resultMap.sum / resultMap.size < 1.0 should be(true) display(resultMap, "ParArray.map") evalRange.foreach(n => ratios.update(n, benchmark.filter(filterF)(n))) val resultfilter = ratios.tail resultfilter.sum / resultfilter.size < 1.0 should be(true) display(resultfilter, "ParArray.filter") } it should s"$name: maps" in { show("Evaluation of maps") val mapData = new HashMap[Int, Double] Range(0, SZ).foreach(n => mapData.put(n, Random.nextDouble)) val parMapData = new ParHashMap[Int, Double] Range(0, SZ).foreach(n => parMapData.put(n, Random.nextDouble)) // Initialized and execute the benchmark for the parallel map val benchmark = new ParallelMap[Double](mapData.toMap, parMapData, TIMES) val ratios = new Array[Double](NUM_TASKS) evalRange.foreach(n => ratios.update(n, benchmark.map(mapF)(n))) val resultMap = ratios.tail resultMap.sum / resultMap.size < 1.0 should be(true) display(resultMap, "ParMap.map") evalRange.foreach(n => ratios.update(n, benchmark.filter(filterF)(n))) val resultfilter = ratios.tail resultfilter.sum / resultfilter.size < 1.0 should be(true) } private def display(x: Array[Double], label: String): Unit = { import org.scalaml.plots.{Legend, LightPlotTheme, LinePlot} val labels = Legend( name, "Scala parallel collections", s"Scala parallel computation for $label", "Relative timing" ) LinePlot.display(x.toVector, labels, new LightPlotTheme) } } // ------------------------------------------- EOF --------------------------------------------------
Example 40
Source File: OrderedClustering.scala From nn_coref with GNU General Public License v3.0 | 5 votes |
package edu.berkeley.nlp.coref import scala.collection.mutable.HashMap import scala.collection.JavaConverters._ import scala.collection.mutable.ArrayBuffer class OrderedClustering(val clusters: Seq[Seq[Int]]) { // Elements must be consecutive integers from 0 up to n private val allIndicesSorted = clusters.foldLeft(new ArrayBuffer[Int])(_ ++ _).sorted; require(allIndicesSorted.sameElements((0 until allIndicesSorted.size).toSeq), allIndicesSorted); private val mentionToClusterMap = new HashMap[Int,Seq[Int]]; for (cluster <- clusters) { for (i <- cluster) { mentionToClusterMap.put(i, cluster); } } def getCluster(idx: Int) = mentionToClusterMap(idx); def isSingleton(idx: Int) = mentionToClusterMap(idx).size == 1; def startsCluster(idx: Int) = mentionToClusterMap(idx)(0) == idx; def areInSameCluster(idx1: Int, idx2: Int) = mentionToClusterMap(idx1).contains(idx2); def getImmediateAntecedent(idx: Int) = { val cluster = mentionToClusterMap(idx); val mentIdxInCluster = cluster.indexOf(idx); if (mentIdxInCluster == 0) { -1 } else { cluster(mentIdxInCluster - 1); } } def getAllAntecedents(idx: Int) = { val cluster = mentionToClusterMap(idx); cluster.slice(0, cluster.indexOf(idx)); } def getAllConsequents(idx: Int) = { val cluster = mentionToClusterMap(idx); cluster.slice(cluster.indexOf(idx) + 1, cluster.size); } // Needed for output printing def getClusterIdx(idx: Int) = { var clusterIdx = 0; for (i <- 0 until clusters.size) { if (clusters(i).sameElements(mentionToClusterMap(idx))) { clusterIdx = i; } } clusterIdx; } def getSubclustering(mentIdxsToKeep: Seq[Int]): OrderedClustering = { val oldIndicesToNewIndicesMap = new HashMap[Int,Int](); (0 until mentIdxsToKeep.size).map(i => oldIndicesToNewIndicesMap.put(mentIdxsToKeep(i), i)); val filteredConvertedClusters = clusters.map(cluster => cluster.filter(mentIdxsToKeep.contains(_)).map(mentIdx => oldIndicesToNewIndicesMap(mentIdx))); val filteredConvertedClustersNoEmpties = filteredConvertedClusters.filter(cluster => !cluster.isEmpty); new OrderedClustering(filteredConvertedClustersNoEmpties); } } object OrderedClustering { def createFromClusterIds(clusterIds: Seq[Int]) = { val mentIdAndClusterId = (0 until clusterIds.size).map(i => (i, clusterIds(i))); val clustersUnsorted = mentIdAndClusterId.groupBy(_._2).values; val finalClusters = clustersUnsorted.toSeq.sortBy(_.head).map(clusterWithClusterId => clusterWithClusterId.map(_._1)); new OrderedClustering(finalClusters.toSeq); } def createFromBackpointers(backpointers: Seq[Int]) = { var nextClusterID = 0; val clusters = new ArrayBuffer[ArrayBuffer[Int]](); val mentionToCluster = new HashMap[Int,ArrayBuffer[Int]](); for (i <- 0 until backpointers.size) { if (backpointers(i) == i) { val cluster = ArrayBuffer(i); clusters += cluster; mentionToCluster.put(i, cluster); } else { val cluster = mentionToCluster(backpointers(i)); cluster += i; mentionToCluster.put(i, cluster); } } new OrderedClustering(clusters); } }
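A short sketch of building a clustering from backpointers with the companion object above; indices 0 through 4 stand for a made-up five-mention document:

import edu.berkeley.nlp.coref.OrderedClustering

object ClusteringSketch extends App {
  // Mentions 0, 1 and 4 form one cluster (1 and 4 point back to 0); 2 and 3 form another
  val clustering = OrderedClustering.createFromBackpointers(Seq(0, 0, 2, 2, 0))

  println(clustering.getCluster(4))             // ArrayBuffer(0, 1, 4)
  println(clustering.areInSameCluster(1, 4))    // true
  println(clustering.getImmediateAntecedent(4)) // 1
  println(clustering.isSingleton(2))            // false
}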
Example 41
Source File: FeatureIndexer.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.ml @SerialVersionUID(1L) class HashedFeatureIndexer[Feature] private( val maxFeatureSize: Int, val hasher: (Feature => Int)) extends FeatureIndexer[Feature] { def size = maxFeatureSize def getIndex(key: Feature) = (math.abs(hasher(key)) % maxFeatureSize) } object HashedFeatureIndexer { def apply[Feature]( maxFeatureSize: Int = (2 << 23), hasher: (Feature => Int) = {f: Feature => f.hashCode()}) = { val biggestPrimeBelow = primes.takeWhile(maxFeatureSize > _).last new HashedFeatureIndexer[Feature](biggestPrimeBelow, hasher) } private lazy val primes = 2 #:: sieve(3) private def sieve(n: Int): Stream[Int] = if (primes.takeWhile(p => p*p <= n).exists(n % _ == 0)) sieve(n + 2) else n #:: sieve(n + 2) }
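A brief usage sketch: feature hashing maps an unbounded feature space into a fixed index range (rounded down to a prime), so distinct features may collide. The feature strings and size are illustrative:

import jigg.ml.HashedFeatureIndexer

object HashingSketch extends App {
  // Hash string features into at most ~2^20 slots with the default hashCode-based hasher
  val indexer = HashedFeatureIndexer[String](maxFeatureSize = 1 << 20)

  val i = indexer.getIndex("word=bank_pos=NN")
  val j = indexer.getIndex("word=river_pos=NN")
  assert(i >= 0 && i < indexer.size && j >= 0 && j < indexer.size)
}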
Example 42
Source File: OutputCategoryList.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg import java.io.FileWriter import scala.collection.mutable.ArrayBuffer import scala.sys.process.Process import scala.collection.mutable.HashMap import lexicon._ import breeze.config.CommandLineParser object OutputCategoryList { case class Params( bank: Opts.BankInfo, dict: Opts.DictParams ) case class CategoryInfo(sentence: GoldSuperTaggedSentence, position: Int, num: Int = 1) { def increment(): CategoryInfo = this.copy(num = num + 1) def replace(_sentence: GoldSuperTaggedSentence, _p: Int) = CategoryInfo(_sentence, _p, num + 1) } def main(args:Array[String]) = { val params = CommandLineParser.readIn[Params](args) val dict = new JapaneseDictionary(params.dict.categoryDictinoary) val bank = CCGBank.select(params.bank, dict) val trainSentences: Array[GoldSuperTaggedSentence] = bank.trainSentences val stats = new HashMap[Category, CategoryInfo] trainSentences foreach { sentence => (0 until sentence.size) foreach { i => val cat = sentence.cat(i) stats.get(cat) match { case Some(info) => if (sentence.size > info.sentence.size) stats += ((cat, info.replace(sentence, i))) else stats += ((cat, info.increment())) case None => stats += ((cat, CategoryInfo(sentence, i))) case _ => } } } def highlight(sentence: Sentence, i: Int) = { val tokens = sentence.wordSeq // tokens.take(i).mkString("") + s"\\x1b[1;31m{${tokens(i)}}\\x1b[0m" + tokens.drop(i+1).mkString("") tokens.slice(i-5, i).mkString("") + s"[01;31m${tokens(i)}[00m" + tokens.slice(i+1, i+6).mkString("") } var fw = new FileWriter("./category.lst") stats.toSeq.sortBy(_._2.num).reverse.foreach { case (cat, CategoryInfo(sentence, i, num)) => fw.write("%s\t%s\t%s\t%s\n" .format(num, cat, sentence.pos(i), highlight(sentence, i))) } fw.flush fw.close val noFeatureCategories = new HashMap[String, CategoryInfo] stats foreach { case (cat, CategoryInfo(sentence, i, numWithFeat)) => val noFeature = cat.toStringNoFeature noFeatureCategories.get(noFeature) match { case Some(exist) => val newNum = numWithFeat + exist.num val newInfo = exist.copy(num = newNum) noFeatureCategories += (noFeature -> newInfo) case None => noFeatureCategories += (noFeature -> CategoryInfo(sentence, i, numWithFeat)) case _ => } } fw = new FileWriter("./category.nofeature.lst") noFeatureCategories.toSeq.sortBy(_._2.num).reverse.foreach { case (cat, CategoryInfo(sentence, i, num)) => fw.write("%s\t%s\t%s\t%s\n" .format(num, cat, sentence.pos(i), highlight(sentence, i))) } fw.flush fw.close } }
Example 43
Source File: HeadFinder.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.parser import scala.collection.mutable.HashMap import jigg.nlp.ccg.lexicon.{PoS, JapanesePoS, Category} import jigg.nlp.ccg.lexicon.Direction._ trait HeadFinder extends Serializable { type NodeInfo = HeadFinder.NodeInfo def get(left:NodeInfo, right:NodeInfo): Direction } object HeadFinder { case class NodeInfo(pos:PoS, category:Category, headCategory:Category) } case class EnglishHeadFinder(children2dir: Map[(Int, Int), Direction]) extends HeadFinder { def get(left:NodeInfo, right:NodeInfo) = children2dir.get(left.category.id, right.category.id) match { case Some(dir) => dir case _ => Left } } object EnglishHeadFinder { import jigg.nlp.ccg.lexicon.{ParseTree, NodeLabel, BinaryTree, NonterminalLabel} def createFromParseTrees(trees: Seq[ParseTree[NodeLabel]]): EnglishHeadFinder = { val map = new HashMap[(Int, Int), Direction] trees.foreach { _.foreachTree { _ match { case BinaryTree(left, right, NonterminalLabel(dir, _, _)) => map += (left.label.category.id, right.label.category.id) -> dir case _ => }}} EnglishHeadFinder(map.toMap) } } object JapaneseHeadFinder extends HeadFinder { val Symbol = "記号" def get(left:NodeInfo, right:NodeInfo) = { val leftPos = left.pos.first.v val rightPos = right.pos.first.v if (rightPos == Symbol) Left else Right } }
Example 44
Source File: Rule.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.parser import jigg.nlp.ccg.lexicon.{Category, Derivation, Point, UnaryChildPoint, BinaryChildrenPoints, AppliedRule} import scala.collection.mutable.{HashMap, HashSet} import java.io.{ObjectOutputStream, ObjectInputStream} trait Rule { def unify(left:Category, right:Category): Option[Array[(Category, String)]] def raise(child:Category): Option[Array[(Category, String)]] def headFinder:HeadFinder } // rules are restricted to CFG rules extracted from the training CCGBank case class CFGRule(val binaryRules:Map[(Int,Int), Array[(Category, String)]], // category ids -> (category, ruleType) val unaryRules:Map[Int, Array[(Category, String)]], override val headFinder:HeadFinder) extends Rule { def unify(left:Category, right:Category):Option[Array[(Category, String)]] = binaryRules.get((left.id, right.id)) def raise(child:Category):Option[Array[(Category, String)]] = unaryRules.get(child.id) } object CFGRule { def extractRulesFromDerivations(derivations: Array[Derivation], headFinder:HeadFinder): CFGRule = { val binaryRules = new HashMap[(Int, Int), HashSet[(Category, String)]] val unaryRules = new HashMap[Int, HashSet[(Category, String)]] derivations.foreach { deriv => deriv.foreachPoint({ point:Point => deriv.get(point) match { case Some(AppliedRule(UnaryChildPoint(child), ruleType)) => val parents = unaryRules.getOrElseUpdate(child.category.id, new HashSet[(Category, String)]) parents += ((point.category, ruleType)) case Some(AppliedRule(BinaryChildrenPoints(left, right), ruleType)) => val parents = binaryRules.getOrElseUpdate((left.category.id, right.category.id), new HashSet[(Category, String)]) parents += ((point.category, ruleType)) case _ => }}) } new CFGRule(binaryRules.map { case (k, v) => k -> v.toArray }.toMap, unaryRules.map { case (k, v) => k -> v.toArray }.toMap, headFinder) } }
Example 45
Source File: SuperTaggerModel.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg import tagger.{LF=>Feature, MaxEntMultiTagger, MaxEntMultiTaggerTrainer, FeatureExtractors} import lexicon._ import jigg.ml._ import scala.collection.mutable.HashMap case class SuperTaggerModel( dict: Dictionary, featureMap: HashMap[Feature, Int], weights: WeightVec, extractors: FeatureExtractors) { self => def reduceFeatures(): SuperTaggerModel = { val buffer = weights.asInstanceOf[GrowableWeightVector[Float]].array // 0 1.0 2.0 0 0 1.0 ... val activeIdxs = buffer.zipWithIndex filter (_._1 != 0) map (_._2) // 1 2 5 println(s"# features reduced from ${buffer.size} to ${activeIdxs.size}") val idxMap = activeIdxs.zipWithIndex.toMap // {1->0, 2->1 5->2} val newFeatureMap = featureMap collect { case (f, oldIdx) if idxMap.isDefinedAt(oldIdx) => (f, idxMap(oldIdx)) } val newWeights = new FixedWeightVector[Float](activeIdxs.map(buffer).toArray) this copy (featureMap = newFeatureMap, weights = newWeights) } def mkMultiTaggerTrainer(classifierTrainer: OnlineLogLinearTrainer[Int]) = new MaxEntMultiTaggerTrainer(mkIndexer(), extractors, classifierTrainer, dict) def mkMultiTagger() = new MaxEntMultiTagger(mkIndexer(), extractors, mkClassifier(), dict) def mkClassifier() = new LogLinearClassifier[Int] { override val weights = self.weights } private def mkIndexer() = new ExactFeatureIndexer(featureMap) } object SuperTaggerModel { def saveTo(path: String, model: SuperTaggerModel) = { System.err.println("Saving tagger model to " + path) val os = jigg.util.IOUtil.openBinOut(path) os.writeObject(model) os.close } def loadFrom(path: String): SuperTaggerModel = { jigg.util.LogUtil.track("Loading supertagger model ...") { val in = jigg.util.IOUtil.openBinIn(path) val model = in.readObject.asInstanceOf[SuperTaggerModel] in.close model } } }
Example 46
Source File: CategoryManager.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.lexicon import scala.collection.mutable.HashMap import scala.collection.mutable.ArrayBuffer class CategoryManager extends StringBaseNumberedManager[Category] with OptionReturner[Category] { override def createWithId(original:Category): Category = original match { case AtomicCategory(id, base, avm) => AtomicCategory(newId, base, avm) case ComplexCategory(id, left, right, slash) => val leftWithId = assignID(left) val rightWithId = assignID(right) ComplexCategory(newId, leftWithId, rightWithId, slash) } override def getOrNone(str:String): Option[Category] = str2objIndex.get(str) match { case Some(i) => Some(objects(i)) case None => canonicalMap.get(createCanonicalInstance(str)) } override def createCanonicalInstance(str:String): Category = JapaneseCategoryParser.parse(str) // This is used when candidate shift category is empty // It sometimes happen if for example, PoS not registered in the dictionary is detected. val unkCategory = getOrCreate("UNK") }
Example 47
Source File: CategoryDictionary.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.lexicon import scala.collection.mutable.HashMap @SerialVersionUID(1L) sealed trait CategoryDictionary extends Serializable { type Key type UnkKey val categoryMap = new HashMap[Key, Array[Category]] val unkCategoryMap = new HashMap[UnkKey, Array[Category]] def key(word:Word, pos:PoS):Key def unkKey(pos:PoS):UnkKey def getCandidates(word:Word, pos:PoS):Array[Category] = categoryMap.get(key(word, pos)) match { case Some(categories) => categories case None => unkCategoryMap.get(unkKey(pos)) match { case Some(categories) => categories case None => Array[Category]() } } def registCandidates(word:Word, pos:PoS, candidates:Array[Category]) = key(word, pos) match { case k => categoryMap += (k -> (categoryMap.get(k) match { case Some(alreadyExist) => (candidates ++ alreadyExist).distinct case None => candidates.distinct })) } def registUnkCandiates(pos:PoS, candidates:Array[Category]) = unkKey(pos) match { case k => unkCategoryMap += (k -> (unkCategoryMap.get(k) match { case Some(alreadyExist) => (candidates ++ alreadyExist).distinct case None => candidates.distinct })) } def resetWithSentences(sentences: Seq[GoldSuperTaggedSentence], unkThreathold: Int) = { val counts = new HashMap[Key, Int] sentences foreach { sentence => (0 until sentence.size) foreach { i => val k = key(sentence.base(i), sentence.pos(i)) counts.getOrElseUpdate(k, 0) counts(k) += 1 }} sentences foreach { sentence => (0 until sentence.size) foreach { i => val k = key(sentence.base(i), sentence.pos(i)) if (counts(k) >= unkThreathold) registCandidates(sentence.base(i), sentence.pos(i), Array(sentence.cat(i))) registUnkCandiates(sentence.pos(i), Array(sentence.cat(i))) }} } } class Word2CategoryDictionary extends CategoryDictionary { type Key = Int type UnkKey = Int override def key(word:Word, pos:PoS) = word.id override def unkKey(pos:PoS) = pos.id } class WordPoS2CategoryDictionary extends CategoryDictionary { type Key = (Int, Int) type UnkKey = Int override def key(word:Word, pos:PoS) = (word.id, pos.id) override def unkKey(pos:PoS) = pos.id } class WordSecondFineTag2CategoryDictionary extends CategoryDictionary { override type Key = (Int, Int) override type UnkKey = Int override def key(word:Word, pos:PoS) = (word.id, pos.second.id) override def unkKey(pos:PoS) = pos.second.id } class WordSecondWithConj2CategoryDictionary extends CategoryDictionary { override type Key = (Int, Int) override type UnkKey = Int override def key(word:Word, pos:PoS) = (word.id, pos.secondWithConj.id) override def unkKey(pos:PoS) = pos.secondWithConj.id }
Example 48
Source File: CategoryFeature.scala From jigg with Apache License 2.0 | 5 votes |
package jigg.nlp.ccg.lexicon trait CategoryFeature { def kvs: Seq[(String, String)] def unify(lhs: CategoryFeature): Boolean = false // TODO: implement } @SerialVersionUID(-8236395926230742650L) case class JPCategoryFeature(values: Seq[String]) extends CategoryFeature { import JPCategoryFeature._ override def kvs = keys zip values override def toString = kvs.filter(_._2 != "").map { case (k, v) => k + "=" + v }.mkString(",") } object JPCategoryFeature { // This is a hard-coded mapping of feature structure of Japanese category. private val k2vals = Map( "mod" -> Array("adv", "adn", "nm"), "form" -> Array("attr", "base", "cont", "hyp", "imp", "beg", "stem", "ta", "te", "pre", "r", "neg", "s", "da"), "case" -> Array("ga", "o", "ni", "to", "nc", "caus"), "fin" -> Array("f", "t")) private val keys = k2vals.keys.toSeq private val v2keyIdx = { val key2idx = keys.zipWithIndex.toMap k2vals.flatMap { case (key, vals) => vals.map { v => v -> key2idx(key) } } } val kvpair = """\w+=(\w+)""".r def createFromValues(values: Seq[String]) = values match { case Seq() => emptyFeature case _ => val sortedValues = Array.fill(keys.size)("") values.filter(_!="").foreach { value => val v = value match { case kvpair(v) => v; case v => v } if (v(0) != 'X') v2keyIdx(v) match { case i => sortedValues(i) = v } } JPCategoryFeature(sortedValues) } // We cache this because most categories don't have a feature private val emptyFeature = JPCategoryFeature(Array.fill(keys.size)("")) } case class EnCategoryFeature(values: Seq[String]) extends CategoryFeature { override def kvs = values.zipWithIndex.map { case (v, k) => (k.toString, v) } override def toString = values.mkString(",") } object EnCategoryFeature { def createFromValues(values: Seq[String]) = EnCategoryFeature(values.sortWith(_ < _)) }
Example 49
Source File: LocalKMeans.scala From sparkoscope with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
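closestPoint expects the centers keyed 1 through K in a mutable HashMap. A tiny hedged sketch calling it directly with two hand-picked centers, assuming breeze is on the classpath:

import breeze.linalg.{DenseVector, Vector}
import scala.collection.mutable.HashMap
import org.apache.spark.examples.LocalKMeans

object ClosestPointSketch extends App {
  val centers = new HashMap[Int, Vector[Double]]
  centers.put(1, DenseVector(0.0, 0.0))
  centers.put(2, DenseVector(10.0, 10.0))

  // The point (9, 9) is nearest to center 2
  println(LocalKMeans.closestPoint(DenseVector(9.0, 9.0), centers)) // 2
}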
Example 50
Source File: JsonUtils.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import scala.collection.mutable.HashMap import scala.util.control.NonFatal import org.apache.kafka.common.TopicPartition import org.json4s.NoTypeHints import org.json4s.jackson.Serialization def partitionOffsets(partitionOffsets: Map[TopicPartition, Long]): String = { val result = new HashMap[String, HashMap[Int, Long]]() implicit val ordering = new Ordering[TopicPartition] { override def compare(x: TopicPartition, y: TopicPartition): Int = { Ordering.Tuple2[String, Int].compare((x.topic, x.partition), (y.topic, y.partition)) } } val partitions = partitionOffsets.keySet.toSeq.sorted // sort for more determinism partitions.foreach { tp => val off = partitionOffsets(tp) val parts = result.getOrElse(tp.topic, new HashMap[Int, Long]) parts += tp.partition -> off result += tp.topic -> parts } Serialization.write(result) } }
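partitionOffsets groups offsets into a topic -> (partition -> offset) HashMap before handing it to json4s. Spark keeps this helper package-private, so the sketch below re-creates the grouping standalone; the topics and offsets are made up:

import scala.collection.mutable.HashMap
import org.apache.kafka.common.TopicPartition
import org.json4s.NoTypeHints
import org.json4s.jackson.Serialization

object OffsetsJsonSketch extends App {
  implicit val formats = Serialization.formats(NoTypeHints)

  val partitionOffsets = Map(
    new TopicPartition("events", 0) -> 25L,
    new TopicPartition("events", 1) -> 17L,
    new TopicPartition("audit", 0) -> 3L)

  // Same nesting as above: topic -> (partition -> offset)
  val result = new HashMap[String, HashMap[Int, Long]]()
  partitionOffsets.foreach { case (tp, off) =>
    val parts = result.getOrElse(tp.topic, new HashMap[Int, Long])
    parts += tp.partition -> off
    result += tp.topic -> parts
  }

  // e.g. {"audit":{"0":3},"events":{"0":25,"1":17}}
  println(Serialization.write(result))
}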
Example 51
Source File: MasterWebUI.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import scala.collection.mutable.HashMap import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.deploy.master.Master import org.apache.spark.internal.Logging import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ def initialize() { val masterPage = new MasterPage(this) attachPage(new ApplicationPage(this)) attachPage(masterPage) attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static")) attachHandler(createRedirectHandler( "/app/kill", "/", masterPage.handleAppKillRequest, httpMethods = Set("POST"))) attachHandler(createRedirectHandler( "/driver/kill", "/", masterPage.handleDriverKillRequest, httpMethods = Set("POST"))) } def addProxyTargets(id: String, target: String): Unit = { var endTarget = target.stripSuffix("/") val handler = createProxyHandler("/proxy/" + id, endTarget) attachHandler(handler) proxyHandlers(id) = handler } def removeProxyTargets(id: String): Unit = { proxyHandlers.remove(id).foreach(detachHandler) } } private[master] object MasterWebUI { private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR }
Example 52
Source File: PoolTable.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import java.net.URLEncoder import scala.collection.mutable.HashMap import scala.xml.Node import org.apache.spark.scheduler.{Schedulable, StageInfo} import org.apache.spark.ui.UIUtils private[ui] class PoolTable(pools: Seq[Schedulable], parent: StagesTab) { private val listener = parent.progressListener def toNodeSeq: Seq[Node] = { listener.synchronized { poolTable(poolRow, pools) } } private def poolTable( makeRow: (Schedulable, HashMap[String, HashMap[Int, StageInfo]]) => Seq[Node], rows: Seq[Schedulable]): Seq[Node] = { <table class="table table-bordered table-striped table-condensed sortable table-fixed"> <thead> <th>Pool Name</th> <th>Minimum Share</th> <th>Pool Weight</th> <th>Active Stages</th> <th>Running Tasks</th> <th>SchedulingMode</th> </thead> <tbody> {rows.map(r => makeRow(r, listener.poolToActiveStages))} </tbody> </table> } private def poolRow( p: Schedulable, poolToActiveStages: HashMap[String, HashMap[Int, StageInfo]]): Seq[Node] = { val activeStages = poolToActiveStages.get(p.name) match { case Some(stages) => stages.size case None => 0 } val href = "%s/stages/pool?poolname=%s" .format(UIUtils.prependBaseUri(parent.basePath), URLEncoder.encode(p.name, "UTF-8")) <tr> <td> <a href={href}>{p.name}</a> </td> <td>{p.minShare}</td> <td>{p.weight}</td> <td>{activeStages}</td> <td>{p.runningTasks}</td> <td>{p.schedulingMode}</td> </tr> } }
Example 53
Source File: ConfigReader.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.internal.config import java.util.{Map => JMap} import java.util.regex.Pattern import scala.collection.mutable.HashMap import scala.util.matching.Regex private object ConfigReader { private val REF_RE = "\\$\\{(?:(\\w+?):)?(\\S+?)\\}".r } def substitute(input: String): String = substitute(input, Set()) private def substitute(input: String, usedRefs: Set[String]): String = { if (input != null) { ConfigReader.REF_RE.replaceAllIn(input, { m => val prefix = m.group(1) val name = m.group(2) val ref = if (prefix == null) name else s"$prefix:$name" require(!usedRefs.contains(ref), s"Circular reference in $input: $ref") val replacement = bindings.get(prefix) .flatMap(_.get(name)) .map { v => substitute(v, usedRefs + ref) } .getOrElse(m.matched) Regex.quoteReplacement(replacement) }) } else { input } } }
Example 54
Source File: StageInfo.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.RDDInfo def fromStage( stage: Stage, attemptId: Int, numTasks: Option[Int] = None, taskMetrics: TaskMetrics = null, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details, taskMetrics, taskLocalityPreferences) } }
Example 55
Source File: GroupedCountEvaluator.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] { private var outputsMerged = 0 private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap } } }
Example 56
Source File: MasterWebUISuite.scala From sparkoscope with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import java.io.DataOutputStream import java.net.{HttpURLConnection, URL} import java.nio.charset.StandardCharsets import java.util.Date import scala.collection.mutable.HashMap import org.mockito.Mockito.{mock, times, verify, when} import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.deploy.DeployMessages.{KillDriverResponse, RequestKillDriver} import org.apache.spark.deploy.DeployTestUtils._ import org.apache.spark.deploy.master._ import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv} class MasterWebUISuite extends SparkFunSuite with BeforeAndAfterAll { val conf = new SparkConf val securityMgr = new SecurityManager(conf) val rpcEnv = mock(classOf[RpcEnv]) val master = mock(classOf[Master]) val masterEndpointRef = mock(classOf[RpcEndpointRef]) when(master.securityMgr).thenReturn(securityMgr) when(master.conf).thenReturn(conf) when(master.rpcEnv).thenReturn(rpcEnv) when(master.self).thenReturn(masterEndpointRef) val masterWebUI = new MasterWebUI(master, 0) override def beforeAll() { super.beforeAll() masterWebUI.bind() } override def afterAll() { masterWebUI.stop() super.afterAll() } test("kill application") { val appDesc = createAppDesc() // use new start date so it isn't filtered by UI val activeApp = new ApplicationInfo( new Date().getTime, "app-0", appDesc, new Date(), null, Int.MaxValue) when(master.idToApp).thenReturn(HashMap[String, ApplicationInfo]((activeApp.id, activeApp))) val url = s"http://localhost:${masterWebUI.boundPort}/app/kill/" val body = convPostDataToString(Map(("id", activeApp.id), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify the master was called to remove the active app verify(master, times(1)).removeApplication(activeApp, ApplicationState.KILLED) } test("kill driver") { val activeDriverId = "driver-0" val url = s"http://localhost:${masterWebUI.boundPort}/driver/kill/" val body = convPostDataToString(Map(("id", activeDriverId), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify that master was asked to kill driver with the correct id verify(masterEndpointRef, times(1)).ask[KillDriverResponse](RequestKillDriver(activeDriverId)) } private def convPostDataToString(data: Map[String, String]): String = { (for ((name, value) <- data) yield s"$name=$value").mkString("&") } private def sendHttpRequest( url: String, method: String, body: String = ""): HttpURLConnection = { val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] conn.setRequestMethod(method) if (body.nonEmpty) { conn.setDoOutput(true) conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded") conn.setRequestProperty("Content-Length", Integer.toString(body.length)) val out = new DataOutputStream(conn.getOutputStream) out.write(body.getBytes(StandardCharsets.UTF_8)) out.close() } conn } }
Example 57
Source File: exercise02.scala From scala-for-the-Impatient with MIT License | 5 votes |
import scala.collection.mutable.{HashMap, ListBuffer}

def mapStrIndex(str: String) = {
  val indexMap = new HashMap[Char, ListBuffer[Int]]()
  var i = 0
  str.toCharArray.foreach { c =>
    indexMap.get(c) match {
      case Some(result) => result += i
      case None => indexMap += (c -> ListBuffer(i))
    }
    i += 1
  }
  indexMap
}

println(mapStrIndex("Mississippi"))
Example 58
Source File: exercise01.scala From scala-for-the-Impatient with MIT License | 5 votes |
import scala.collection.SortedSet
import scala.collection.mutable.HashMap

def mapStrIndex(str: String) = {
  val indexMap = new HashMap[Char, SortedSet[Int]]()
  var i = 0
  str.toCharArray.foreach { c =>
    indexMap.get(c) match {
      case Some(result) => indexMap(c) = result + i
      case None => indexMap += (c -> SortedSet(i))
    }
    i += 1
  }
  indexMap
}

println(mapStrIndex("Mississippi"))
Example 59
Source File: BlockStoreShuffleFetcher.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import scala.util.{Failure, Success, Try} import org.apache.spark._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId} import org.apache.spark.util.CompletionIterator private[hash] object BlockStoreShuffleFetcher extends Logging { def fetch[T]( shuffleId: Int, reduceId: Int, context: TaskContext, serializer: Serializer) : Iterator[T] = { logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId)) val blockManager = SparkEnv.get.blockManager val startTime = System.currentTimeMillis val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId) logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format( shuffleId, reduceId, System.currentTimeMillis - startTime)) val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]] for (((address, size), index) <- statuses.zipWithIndex) { splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size)) } val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map { case (address, splits) => (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2))) } def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = { val blockId = blockPair._1 val blockOption = blockPair._2 blockOption match { case Success(block) => { block.asInstanceOf[Iterator[T]] } case Failure(e) => { blockId match { case ShuffleBlockId(shufId, mapId, _) => val address = statuses(mapId.toInt)._1 throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e) case _ => throw new SparkException( "Failed to get block " + blockId + ", which is not a shuffle block", e) } } } } val blockFetcherItr = new ShuffleBlockFetcherIterator( context, SparkEnv.get.blockManager.shuffleClient, blockManager, blocksByAddress, serializer, SparkEnv.get.conf.getLong("spark.reducer.maxMbInFlight", 48) * 1024 * 1024) val itr = blockFetcherItr.flatMap(unpackBlock) val completionIter = CompletionIterator[T, Iterator[T]](itr, { context.taskMetrics.updateShuffleReadMetrics() }) new InterruptibleIterator[T](context, completionIter) { val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency() override def next(): T = { readMetrics.incRecordsRead(1) delegate.next() } } } }
Example 60
Source File: ExecutorsTab.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.exec import scala.collection.mutable.HashMap import org.apache.spark.ExceptionFailure import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.storage.StorageStatusListener import org.apache.spark.ui.{SparkUI, SparkUITab} private[ui] class ExecutorsTab(parent: SparkUI) extends SparkUITab(parent, "executors") { val listener = parent.executorsListener val sc = parent.sc val threadDumpEnabled = sc.isDefined && parent.conf.getBoolean("spark.ui.threadDumpsEnabled", true) attachPage(new ExecutorsPage(this, threadDumpEnabled)) if (threadDumpEnabled) { attachPage(new ExecutorThreadDumpPage(this)) } } @DeveloperApi class ExecutorsListener(storageStatusListener: StorageStatusListener) extends SparkListener { val executorToTasksActive = HashMap[String, Int]() val executorToTasksComplete = HashMap[String, Int]() val executorToTasksFailed = HashMap[String, Int]() val executorToDuration = HashMap[String, Long]() val executorToInputBytes = HashMap[String, Long]() val executorToInputRecords = HashMap[String, Long]() val executorToOutputBytes = HashMap[String, Long]() val executorToOutputRecords = HashMap[String, Long]() val executorToShuffleRead = HashMap[String, Long]() val executorToShuffleWrite = HashMap[String, Long]() val executorToLogUrls = HashMap[String, Map[String, String]]() def storageStatusList = storageStatusListener.storageStatusList override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded) = synchronized { val eid = executorAdded.executorId executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap } override def onTaskStart(taskStart: SparkListenerTaskStart) = synchronized { val eid = taskStart.taskInfo.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1 } override def onTaskEnd(taskEnd: SparkListenerTaskEnd) = synchronized { val info = taskEnd.taskInfo if (info != null) { val eid = info.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 1) - 1 executorToDuration(eid) = executorToDuration.getOrElse(eid, 0L) + info.duration taskEnd.reason match { case e: ExceptionFailure => executorToTasksFailed(eid) = executorToTasksFailed.getOrElse(eid, 0) + 1 case _ => executorToTasksComplete(eid) = executorToTasksComplete.getOrElse(eid, 0) + 1 } // Update shuffle read/write val metrics = taskEnd.taskMetrics if (metrics != null) { metrics.inputMetrics.foreach { inputMetrics => executorToInputBytes(eid) = executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead executorToInputRecords(eid) = executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead } metrics.outputMetrics.foreach { outputMetrics => executorToOutputBytes(eid) = executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten executorToOutputRecords(eid) = executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten } metrics.shuffleReadMetrics.foreach { shuffleRead => executorToShuffleRead(eid) = executorToShuffleRead.getOrElse(eid, 0L) + shuffleRead.remoteBytesRead } metrics.shuffleWriteMetrics.foreach { shuffleWrite => executorToShuffleWrite(eid) = executorToShuffleWrite.getOrElse(eid, 0L) + shuffleWrite.shuffleBytesWritten } } } } }
Example 61
Source File: UIData.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import org.apache.spark.JobExecutionStatus import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo} import org.apache.spark.util.collection.OpenHashSet import scala.collection.mutable.HashMap private[jobs] object UIData { class ExecutorSummary { var taskTime : Long = 0 var failedTasks : Int = 0 var succeededTasks : Int = 0 var inputBytes : Long = 0 var inputRecords : Long = 0 var outputBytes : Long = 0 var outputRecords : Long = 0 var shuffleRead : Long = 0 var shuffleReadRecords : Long = 0 var shuffleWrite : Long = 0 var shuffleWriteRecords : Long = 0 var memoryBytesSpilled : Long = 0 var diskBytesSpilled : Long = 0 } class JobUIData( var jobId: Int = -1, var submissionTime: Option[Long] = None, var completionTime: Option[Long] = None, var stageIds: Seq[Int] = Seq.empty, var jobGroup: Option[String] = None, var status: JobExecutionStatus = JobExecutionStatus.UNKNOWN, case class TaskUIData( var taskInfo: TaskInfo, var taskMetrics: Option[TaskMetrics] = None, var errorMessage: Option[String] = None) }
Example 62
Source File: PoolTable.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import scala.collection.mutable.HashMap import scala.xml.Node import org.apache.spark.scheduler.{Schedulable, StageInfo} import org.apache.spark.ui.UIUtils private[ui] class PoolTable(pools: Seq[Schedulable], parent: StagesTab) { private val listener = parent.listener def toNodeSeq: Seq[Node] = { listener.synchronized { poolTable(poolRow, pools) } } private def poolTable( makeRow: (Schedulable, HashMap[String, HashMap[Int, StageInfo]]) => Seq[Node], rows: Seq[Schedulable]): Seq[Node] = { <table class="table table-bordered table-striped table-condensed sortable table-fixed"> <thead> <th>Pool Name</th> <th>Minimum Share</th> <th>Pool Weight</th> <th>Active Stages</th> <th>Running Tasks</th> <th>SchedulingMode</th> </thead> <tbody> {rows.map(r => makeRow(r, listener.poolToActiveStages))} </tbody> </table> } private def poolRow( p: Schedulable, poolToActiveStages: HashMap[String, HashMap[Int, StageInfo]]): Seq[Node] = { val activeStages = poolToActiveStages.get(p.name) match { case Some(stages) => stages.size case None => 0 } val href = "%s/stages/pool?poolname=%s" .format(UIUtils.prependBaseUri(parent.basePath), p.name) <tr> <td> <a href={href}>{p.name}</a> </td> <td>{p.minShare}</td> <td>{p.weight}</td> <td>{activeStages}</td> <td>{p.runningTasks}</td> <td>{p.schedulingMode}</td> </tr> } }
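PoolTable reads a nested HashMap[String, HashMap[Int, StageInfo]] and falls back to 0 when a pool has no active stages. A stripped-down sketch of that nested lookup, with Strings standing in for StageInfo and invented pool names:

import scala.collection.mutable.HashMap

object PoolLookupSketch {
  val poolToActiveStages = HashMap[String, HashMap[Int, String]]()
  poolToActiveStages("default") = HashMap(1 -> "stage-1", 2 -> "stage-2")

  // same shape as poolRow above: size of the inner map, or 0 for an unknown pool
  def activeStageCount(pool: String): Int =
    poolToActiveStages.get(pool).map(_.size).getOrElse(0)

  def main(args: Array[String]): Unit = {
    println(activeStageCount("default"))    // 2
    println(activeStageCount("production")) // 0
  }
}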
Example 63
Source File: StageInfo.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.RDDInfo def fromStage(stage: Stage, numTasks: Option[Int] = None): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, stage.attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.details) } }
Example 64
Source File: GroupedSumEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
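The evaluator merges per-key StatCounters coming from each task, creating an entry the first time a key is seen and merging afterwards. A Scala-collections sketch of that merge step, using getOrElseUpdate in place of the Java-map get/null check; the Stat class below is only a stand-in for Spark's StatCounter:

import scala.collection.mutable.HashMap

object StatMergeSketch {
  // minimal stand-in for org.apache.spark.util.StatCounter
  final class Stat(var count: Long = 0L, var sum: Double = 0.0) {
    def merge(other: Stat): Unit = { count += other.count; sum += other.sum }
  }

  val sums = HashMap[String, Stat]()

  def merge(taskResult: Map[String, Stat]): Unit =
    taskResult.foreach { case (key, stat) =>
      sums.getOrElseUpdate(key, new Stat()).merge(stat)
    }

  def main(args: Array[String]): Unit = {
    merge(Map("a" -> new Stat(2, 3.0)))
    merge(Map("a" -> new Stat(1, 1.5), "b" -> new Stat(1, 4.0)))
    println(sums("a").sum) // 4.5
  }
}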
Example 65
Source File: GroupedCountEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T,Long], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new OpenHashMap[T,Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T,Long]) { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => result(key) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val confFactor = new NormalDistribution(). inverseCumulativeProbability(1 - (1 - confidence) / 2) val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => val mean = (sum + 1 - p) / p val variance = (sum + 1) * (1 - p) / (p * p) val stdev = math.sqrt(variance) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(key) = new BoundedDouble(mean, confidence, low, high) } result } } }
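With only a fraction p of the outputs merged, the count for each key is extrapolated as (sum + 1 - p) / p with a normal-approximation interval. A plain-Scala sketch of that arithmetic, hard-coding the 95% z-value instead of calling commons-math3; the (mean, low, high) tuple stands in for BoundedDouble:

import scala.collection.mutable.HashMap

object PartialCountSketch {
  def estimate(sums: HashMap[String, Long],
               outputsMerged: Int,
               totalOutputs: Int): Map[String, (Double, Double, Double)] = {
    val p = outputsMerged.toDouble / totalOutputs
    val confFactor = 1.96 // approx. inverseCumulativeProbability(0.975) for 95% confidence
    sums.map { case (key, sum) =>
      val mean = (sum + 1 - p) / p
      val stdev = math.sqrt((sum + 1) * (1 - p) / (p * p))
      key -> (mean, mean - confFactor * stdev, mean + confFactor * stdev)
    }.toMap
  }

  def main(args: Array[String]): Unit = {
    val sums = HashMap("cat" -> 40L, "dog" -> 10L) // invented partial counts
    println(estimate(sums, outputsMerged = 2, totalOutputs = 4))
  }
}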
Example 66
Source File: GroupedMeanEvaluator.scala From SparkCore with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 67
Source File: filter_errorcode.scala From scalabpe with Apache License 2.0 | 5 votes |
package scalabpe.plugin import scala.collection.mutable.HashMap import scala.xml.Node import scalabpe.core.DummyActor import scalabpe.core.HashMapStringAny import scalabpe.core.Logging import scalabpe.core.Request import scalabpe.core.Response import scalabpe.core.ResponseFilter import scalabpe.core.Router class ErrorCodeDefine(val resultCodeName: String, val resultMsgName: String); class ErrorDescResponseFilter(val router: Router, val cfgNode: Node) extends ResponseFilter with Logging { val cfgs = new HashMap[Int, ErrorCodeDefine]() var localCacheServiceId = 0 val dummyActor = new DummyActor() init def init() { var s = (cfgNode \ "@localCacheServiceId").toString if (s != "") localCacheServiceId = s.toInt val serviceNodes = (cfgNode \ "Service") for (p <- serviceNodes) { val serviceId = (p \ "@serviceId").toString.toInt val resultCodeName = (p \ "@resultCodeField").toString val resultMsgName = (p \ "@resultMsgField").toString cfgs.put(serviceId, new ErrorCodeDefine(resultCodeName, resultMsgName)) // log.info("serviceId=%d,resultCodeName=%s,resultMsgName=%s".format(serviceId,resultCodeName,resultMsgName)) } log.info("errorcode response filter created") } def filter(res: Response, req: Request): Unit = { // log.info("error response filter called, res={}",res.toString) val rd = cfgs.getOrElse(res.serviceId, null) if (rd == null) return if (rd.resultCodeName != "") { if (res.body.getOrElse(rd.resultCodeName, null) == null) { res.body.put(rd.resultCodeName, res.code) } } if (res.code == 0) return if (rd.resultMsgName == "") return if (res.body.getOrElse(rd.resultMsgName, null) != null) return val body = new HashMapStringAny() body.put("resultCode", res.code) val req = new Request( res.requestId + ":$", Router.DO_NOT_REPLY, res.sequence, res.encoding, localCacheServiceId, 1, new HashMapStringAny(), body, dummyActor) val invokeResult = router.send(req) if (invokeResult == null) return val resultMsg = invokeResult.s("resultMsg", "") if (resultMsg != "") res.body.put(rd.resultMsgName, resultMsg) } }
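The filter keys its per-service configuration by serviceId and silently skips services that were never configured. A small sketch of that registry-lookup pattern; the service ids and field names are invented, and Option replaces the getOrElse(.., null) check used above:

import scala.collection.mutable.HashMap

object ErrorCodeRegistrySketch {
  final case class ErrorCodeDefine(resultCodeName: String, resultMsgName: String)

  val cfgs = new HashMap[Int, ErrorCodeDefine]()
  cfgs.put(100, ErrorCodeDefine("resultCode", "resultMsg")) // hypothetical serviceId 100

  def lookup(serviceId: Int): Option[ErrorCodeDefine] = cfgs.get(serviceId)

  def main(args: Array[String]): Unit = {
    println(lookup(100)) // Some(ErrorCodeDefine(resultCode,resultMsg))
    println(lookup(999)) // None: this service was never configured
  }
}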
Example 68
Source File: httpserverplugin_staticfile.scala From scalabpe with Apache License 2.0 | 5 votes |
package scalabpe.plugin.http import java.io.File import java.net.URLEncoder import java.text.SimpleDateFormat import java.util.Calendar import java.util.GregorianCalendar import java.util.Locale import java.util.TimeZone import scala.collection.mutable.HashMap import org.jboss.netty.handler.codec.http.HttpHeaders import scalabpe.core.HashMapStringAny class StaticFilePlugin extends HttpServerPlugin with HttpServerStaticFilePlugin { val ETAG_TAG = "etag" val EXPIRE_TAG = "expire" val ATTACHMENT = "attachment" val FILENAME = "filename" val HTTP_DATE_FORMAT = "EEE, dd MMM yyyy HH:mm:ss zzz"; val HTTP_DATE_GMT_TIMEZONE = "GMT"; val df_tl = new ThreadLocal[SimpleDateFormat]() { override def initialValue(): SimpleDateFormat = { val df = new SimpleDateFormat(HTTP_DATE_FORMAT, Locale.US) df.setTimeZone(TimeZone.getTimeZone(HTTP_DATE_GMT_TIMEZONE)); df } } def generateStaticFile(serviceId: Int, msgId: Int, errorCode: Int, errorMessage: String, body: HashMapStringAny, pluginParam: String, headers: HashMap[String, String]): String = { if (body.ns(FILENAME) == "") { return null } val filename = body.ns(FILENAME) if (!new File(filename).exists()) { return null } if (body.ns(ETAG_TAG) != "") { headers.put("ETag", body.ns(ETAG_TAG)) } if (body.ns(EXPIRE_TAG) != "") { body.i(EXPIRE_TAG) match { case 0 | -1 => headers.put(HttpHeaders.Names.CACHE_CONTROL, "no-cache") case n => // seconds val time = new GregorianCalendar(); time.add(Calendar.SECOND, n); headers.put(HttpHeaders.Names.EXPIRES, df_tl.get.format(time.getTime())); headers.put(HttpHeaders.Names.CACHE_CONTROL, "max-age=" + n); } } val ext = parseExt(filename) if (ext != "") body.put("__file_ext__", ext) if (body.ns(ATTACHMENT, "1") == "1") { val filename = body.ns(FILENAME) val v = "attachment; filename=\"%s\"".format(URLEncoder.encode(parseFilename(filename), "UTF-8")) headers.put("Content-Disposition", v) } filename } def parseFilename(name: String): String = { val p = name.lastIndexOf("/") if (p < 0) return name name.substring(p + 1) } def parseExt(name: String): String = { val p = name.lastIndexOf(".") if (p < 0) return "" name.substring(p + 1).toLowerCase() } }
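generateStaticFile collects response headers into a mutable HashMap[String, String], switching between no-cache and max-age depending on the expire value. A sketch of just that cache-header decision, with the Expires date handling left out:

import scala.collection.mutable.HashMap

object CacheHeaderSketch {
  // mirrors the expire handling above: 0 or -1 disables caching, n > 0 sets max-age
  def cacheHeaders(expireSeconds: Int): HashMap[String, String] = {
    val headers = new HashMap[String, String]()
    expireSeconds match {
      case 0 | -1 => headers.put("Cache-Control", "no-cache")
      case n      => headers.put("Cache-Control", "max-age=" + n)
    }
    headers
  }

  def main(args: Array[String]): Unit = {
    println(cacheHeaders(-1)) // e.g. Map(Cache-Control -> no-cache)
    println(cacheHeaders(60)) // e.g. Map(Cache-Control -> max-age=60)
  }
}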
Example 69
Source File: format_flow.scala From scalabpe with Apache License 2.0 | 5 votes |
package scalabpe import java.io._ import scala.collection.mutable.HashMap import scala.collection.mutable.ArrayBuffer import scala.io.Source import org.apache.commons.io.FileUtils import scala.xml._ import scala.collection.mutable._ import scalabpe.core._ import org.apache.commons.lang.StringUtils import Tools._ object FormatFlowTool { def help() { println( """ usage: scalabpe.FormatFlowTool [options] dirname options: -h|--help show this help message """) } def parseArgs(args:Array[String]):HashMapStringAny = { val map = HashMapStringAny() var i = 0 val files = ArrayBufferString() while(i < args.size) { args(i) match { case "-h" | "--help" => return null case s if s.startsWith("-") => println("invalid option "+s) return null case _ => files += args(i) i += 1 } } map.put("files",files) map } def main(args:Array[String]) { var params = parseArgs(args) if( params == null ) { help() return } var files = params.nls("files") if( files.size == 0 ) { help() return } var dir = files(0) if( !new File(dir).exists() ) { val p1 = "compose_conf"+File.separator+dir if( new File(p1).exists ) { dir = p1 } else { println("not a valid dir, dir="+dir) return } } processDir(dir,params) } def processDir(dir:String,params:HashMapStringAny) { val files = new File(dir).listFiles.filter(_.getName.endsWith(".flow")) for(f <- files ) { processFile(dir,f.getName,params) } } def processFile(dir:String,f:String,params:HashMapStringAny) { val lines = readAllLines(dir+File.separator+f) // TODO } }
Example 70
Source File: GraphMap.scala From stellar-random-walk with Apache License 2.0 | 5 votes |
package au.csiro.data61.randomwalk.algorithm import scala.collection.mutable import scala.collection.mutable.{ArrayBuffer, HashMap} def reset { indexCounter = 0 offsetCounter = 0 srcVertexMap.clear() offsets.clear() lengths.clear() edges.clear() vertexPartitionMap.clear } def getNeighbors(vid: Int): Array[(Int, Float)] = { srcVertexMap.get(vid) match { case Some(index) => if (index == -1) { return Array.empty[(Int, Float)] } val offset = offsets(index) val length = lengths(index) edges.slice(offset, offset + length).toArray case None => null } } }
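getNeighbors resolves a vertex id through srcVertexMap and then slices a flat, CSR-style edge array using the recorded offset and length. A condensed, self-contained sketch of that layout with made-up vertices; the index == -1 case is omitted and an unknown vertex yields an empty array rather than null:

import scala.collection.mutable.{ArrayBuffer, HashMap}

object GraphMapSketch {
  val srcVertexMap = HashMap[Int, Int]()  // vertex id -> index into offsets/lengths
  val offsets = ArrayBuffer[Int]()
  val lengths = ArrayBuffer[Int]()
  val edges = ArrayBuffer[(Int, Float)]() // all adjacency lists, concatenated

  def addVertex(vid: Int, neighbors: Seq[(Int, Float)]): Unit = {
    srcVertexMap(vid) = offsets.length
    offsets += edges.length
    lengths += neighbors.length
    edges ++= neighbors
  }

  def getNeighbors(vid: Int): Array[(Int, Float)] =
    srcVertexMap.get(vid) match {
      case Some(index) => edges.slice(offsets(index), offsets(index) + lengths(index)).toArray
      case None        => Array.empty[(Int, Float)]
    }

  def main(args: Array[String]): Unit = {
    addVertex(1, Seq((2, 0.5f), (3, 1.0f)))
    println(getNeighbors(1).toList) // List((2,0.5), (3,1.0))
    println(getNeighbors(9).toList) // List()
  }
}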
Example 71
Source File: HeaderEnum.scala From testchipip with BSD 3-Clause "New" or "Revised" License | 5 votes |
package testchipip import chisel3._ import chisel3.util.log2Up import scala.collection.mutable.{HashMap, ListBuffer} class HeaderEnum(val prefix: String) { val h = new HashMap[String,Int] def makeHeader(): String = { h.toSeq.sortBy(_._2).map { case (n,i) => s"#define ${prefix.toUpperCase}_${n.toUpperCase} $i\n" } mkString } def apply(s: String): UInt = h(s).U(log2Up(h.size).W) } object HeaderEnum { val contents = new ListBuffer[String] def apply(prefix: String, names: String*): HeaderEnum = { val e = new HeaderEnum(prefix) names.zipWithIndex.foreach { case (n,i) => e.h.put(n,i) } val header = e.makeHeader() if(!HeaderEnum.contents.contains(header)) HeaderEnum.contents += header e } }
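HeaderEnum is a name-to-index HashMap that can also emit the matching C #define lines. A Chisel-free sketch of the header-generation half; the prefix and names are invented:

import scala.collection.mutable.HashMap

object HeaderGenSketch {
  // same idea as HeaderEnum.makeHeader, minus the UInt conversion
  def makeHeader(prefix: String, names: Seq[String]): String = {
    val h = new HashMap[String, Int]()
    names.zipWithIndex.foreach { case (n, i) => h.put(n, i) }
    h.toSeq.sortBy(_._2).map { case (n, i) =>
      s"#define ${prefix.toUpperCase}_${n.toUpperCase} $i\n"
    }.mkString
  }

  def main(args: Array[String]): Unit =
    print(makeHeader("cmd", Seq("read", "write", "flush")))
  // #define CMD_READ 0
  // #define CMD_WRITE 1
  // #define CMD_FLUSH 2
}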
Example 72
Source File: Mapper.scala From CSYE7200_Old with MIT License | 5 votes |
package edu.neu.coe.csye7200.mapreduce import akka.actor.{Actor, ActorLogging, ActorRef} import scala.collection.mutable import scala.collection.mutable.HashMap import scala.util._ class Mapper_Forgiving[K1,V1,K2,V2](f: (K1,V1)=>(K2,V2)) extends Mapper[K1,V1,K2,V2](f) { override def prepareReply(v2k2ts: Seq[Try[(K2,V2)]]) = { val v2sK2m = mutable.HashMap[K2,Seq[V2]]() // mutable val xs = mutable.ListBuffer[Throwable]() // mutable, so appended failures are actually retained for (v2k2t <- v2k2ts; v2k2e = Master.sequence(v2k2t)) v2k2e match { case Right((k2,v2)) => v2sK2m put(k2, v2+:v2sK2m.getOrElse((k2), (Nil))) case Left(x) => xs += x } (v2sK2m.toMap, xs) } } case class Incoming[K, V](m: Seq[(K,V)]) { override def toString = s"Incoming: with ${m.size} elements" } object Incoming { def sequence[K,V](vs: Seq[V]): Incoming[K,V] = Incoming((vs zip Stream.continually(null.asInstanceOf[K])).map{_.swap}) def map[K, V](vKm: Map[K,V]): Incoming[K,V] = Incoming(vKm.toSeq) } object Mapper { }
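prepareReply folds a sequence of (key, value) results into a HashMap of key to values by prepending to whatever is already stored for that key. A minimal sketch of that group-by step on its own:

import scala.collection.mutable.HashMap

object GroupByKeySketch {
  def groupPairs[K, V](pairs: Seq[(K, V)]): Map[K, Seq[V]] = {
    val grouped = HashMap[K, Seq[V]]()
    for ((k, v) <- pairs) grouped.put(k, v +: grouped.getOrElse(k, Nil))
    grouped.toMap
  }

  def main(args: Array[String]): Unit =
    println(groupPairs(Seq("a" -> 1, "b" -> 2, "a" -> 3)))
  // e.g. Map(a -> List(3, 1), b -> List(2))
}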
Example 73
Source File: Labels.scala From jgo with GNU General Public License v3.0 | 5 votes |
package jgo.tools.compiler package parser.stmts import parser.exprs._ import parser.scoped._ import parser.funcs._ import interm._ import types._ import symbol._ import codeseq._ import instr._ import scala.collection.mutable.{HashMap, HashSet, ListBuffer} import scala.{collection => coll} import coll.{immutable => imm} trait Labels { private val seenDefs = HashSet[String]() private val unseenDefs = HashMap[String, ListBuffer[Pos]]() private val lbls = HashMap[String, UserLabel]() def defLabel(name: String, pos: Pos): (String, Err[UserLabel]) = if (seenDefs contains name) (name, problem("label %s already defined", name)(pos)) else { seenDefs += name unseenDefs -= name val label = lbls getOrElseUpdate (name, new UserLabel(name)) (name, result(label)) } def useLabel(pos: Pos, name: String): UserLabel = { if (!(seenDefs contains name)) unseenDefs.getOrElseUpdate(name, new ListBuffer) += pos lbls getOrElseUpdate (name, new UserLabel(name)) } def procGoto(pos: Pos, name: String): Err[CodeBuilder] = { result(Goto(useLabel(pos, name))) } def checkForUndefedLabels: Err[Unit] = { var issues: Err[Unit] = result(()) for ((lblName, positions) <- unseenDefs; pos <- positions) { issues = issues then problem("target label not found: %s", lblName)(pos) } issues } }
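defLabel and useLabel both go through getOrElseUpdate, so exactly one UserLabel instance is created per name regardless of whether the definition or a goto is seen first. A tiny sketch of that interning pattern with a plain Label class:

import scala.collection.mutable.HashMap

object LabelInternSketch {
  final class Label(val name: String)

  val lbls = HashMap[String, Label]()

  // returns the same Label instance for repeated lookups of one name
  def labelFor(name: String): Label = lbls.getOrElseUpdate(name, new Label(name))

  def main(args: Array[String]): Unit = {
    val a = labelFor("loop")
    val b = labelFor("loop")
    println(a eq b) // true: created once, then reused
  }
}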
Example 74
Source File: LocalKMeans.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
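closestPoint scans the centers stored in a HashMap keyed 1..K and keeps the key of the smallest squared distance. A one-dimensional, breeze-free sketch of the same scan with invented centers:

import scala.collection.mutable.HashMap

object ClosestCenterSketch {
  def closestCenter(p: Double, centers: HashMap[Int, Double]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity
    for ((i, c) <- centers) {
      val dist = (p - c) * (p - c)
      if (dist < closest) { closest = dist; bestIndex = i }
    }
    bestIndex
  }

  def main(args: Array[String]): Unit = {
    val centers = HashMap(1 -> 0.0, 2 -> 5.0, 3 -> 10.0)
    println(closestCenter(6.2, centers)) // 2
  }
}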
Example 75
Source File: JsonUtils.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import scala.collection.mutable.HashMap import scala.util.control.NonFatal import org.apache.kafka.common.TopicPartition import org.json4s.NoTypeHints import org.json4s.jackson.Serialization def partitionOffsets(partitionOffsets: Map[TopicPartition, Long]): String = { val result = new HashMap[String, HashMap[Int, Long]]() implicit val ordering = new Ordering[TopicPartition] { override def compare(x: TopicPartition, y: TopicPartition): Int = { Ordering.Tuple2[String, Int].compare((x.topic, x.partition), (y.topic, y.partition)) } } val partitions = partitionOffsets.keySet.toSeq.sorted // sort for more determinism partitions.foreach { tp => val off = partitionOffsets(tp) val parts = result.getOrElse(tp.topic, new HashMap[Int, Long]) parts += tp.partition -> off result += tp.topic -> parts } Serialization.write(result) } }
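partitionOffsets regroups a flat map keyed by (topic, partition) into a nested HashMap keyed by topic and then partition before handing it to json4s. A sketch of the regrouping step alone, with plain tuples standing in for TopicPartition:

import scala.collection.mutable.HashMap

object RegroupSketch {
  def regroup(offsets: Map[(String, Int), Long]): HashMap[String, HashMap[Int, Long]] = {
    val result = new HashMap[String, HashMap[Int, Long]]()
    offsets.foreach { case ((topic, partition), off) =>
      val parts = result.getOrElse(topic, new HashMap[Int, Long])
      parts += partition -> off
      result += topic -> parts
    }
    result
  }

  def main(args: Array[String]): Unit =
    println(regroup(Map(("t", 0) -> 10L, ("t", 1) -> 20L, ("u", 0) -> 5L)))
  // e.g. Map(t -> Map(0 -> 10, 1 -> 20), u -> Map(0 -> 5))
}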
Example 76
Source File: ThriftServerMonitor.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.hive.thriftserver.monitor import scala.collection.mutable.HashMap import org.apache.spark.SparkException import org.apache.spark.internal.Logging import org.apache.spark.sql.hive.thriftserver.ui.ThriftServerTab object ThriftServerMonitor extends Logging { private[this] val uiTabs = new HashMap[String, ThriftServerTab]() private[this] val listeners = new HashMap[String, ThriftServerListener]() def setListener(user: String, sparkListener: ThriftServerListener): Unit = { listeners.put(user, sparkListener) } def getListener(user: String): ThriftServerListener = { listeners.getOrElse(user, throw new SparkException(s"Listener does not init for user[$user]")) } def addUITab(user: String, ui: ThriftServerTab): Unit = { uiTabs.put(user, ui) } def detachUITab(user: String): Unit = { listeners.remove(user) uiTabs.get(user).foreach(_.detach()) } def detachAllUITabs(): Unit = { uiTabs.values.foreach(_.detach()) } }
Example 77
Source File: MasterWebUI.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import scala.collection.mutable.HashMap import org.eclipse.jetty.servlet.ServletContextHandler import org.apache.spark.deploy.master.Master import org.apache.spark.internal.Logging import org.apache.spark.ui.{SparkUI, WebUI} import org.apache.spark.ui.JettyUtils._ def initialize() { val masterPage = new MasterPage(this) attachPage(new ApplicationPage(this)) attachPage(masterPage) attachHandler(createStaticHandler(MasterWebUI.STATIC_RESOURCE_DIR, "/static")) attachHandler(createRedirectHandler( "/app/kill", "/", masterPage.handleAppKillRequest, httpMethods = Set("POST"))) attachHandler(createRedirectHandler( "/driver/kill", "/", masterPage.handleDriverKillRequest, httpMethods = Set("POST"))) } def addProxyTargets(id: String, target: String): Unit = { var endTarget = target.stripSuffix("/") val handler = createProxyHandler("/proxy/" + id, endTarget) attachHandler(handler) proxyHandlers(id) = handler } def removeProxyTargets(id: String): Unit = { proxyHandlers.remove(id).foreach(detachHandler) } } private[master] object MasterWebUI { private val STATIC_RESOURCE_DIR = SparkUI.STATIC_RESOURCE_DIR }
Example 78
Source File: PoolTable.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import java.net.URLEncoder import scala.collection.mutable.HashMap import scala.xml.Node import org.apache.spark.scheduler.{Schedulable, StageInfo} import org.apache.spark.ui.UIUtils private[ui] class PoolTable(pools: Seq[Schedulable], parent: StagesTab) { private val listener = parent.progressListener def toNodeSeq: Seq[Node] = { listener.synchronized { poolTable(poolRow, pools) } } private def poolTable( makeRow: (Schedulable, HashMap[String, HashMap[Int, StageInfo]]) => Seq[Node], rows: Seq[Schedulable]): Seq[Node] = { <table class="table table-bordered table-striped table-condensed sortable table-fixed"> <thead> <th>Pool Name</th> <th>Minimum Share</th> <th>Pool Weight</th> <th>Active Stages</th> <th>Running Tasks</th> <th>SchedulingMode</th> </thead> <tbody> {rows.map(r => makeRow(r, listener.poolToActiveStages))} </tbody> </table> } private def poolRow( p: Schedulable, poolToActiveStages: HashMap[String, HashMap[Int, StageInfo]]): Seq[Node] = { val activeStages = poolToActiveStages.get(p.name) match { case Some(stages) => stages.size case None => 0 } val href = "%s/stages/pool?poolname=%s" .format(UIUtils.prependBaseUri(parent.basePath, sparkUser = parent.sparkUser), URLEncoder.encode(p.name, "UTF-8")) <tr> <td> <a href={href}>{p.name}</a> </td> <td>{p.minShare}</td> <td>{p.weight}</td> <td>{activeStages}</td> <td>{p.runningTasks}</td> <td>{p.schedulingMode}</td> </tr> } }
Example 79
Source File: ConfigReader.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.internal.config import java.util.{Map => JMap} import java.util.regex.Pattern import scala.collection.mutable.HashMap import scala.util.matching.Regex private object ConfigReader { private val REF_RE = "\\$\\{(?:(\\w+?):)?(\\S+?)\\}".r } def substitute(input: String): String = substitute(input, Set()) private def substitute(input: String, usedRefs: Set[String]): String = { if (input != null) { ConfigReader.REF_RE.replaceAllIn(input, { m => val prefix = m.group(1) val name = m.group(2) val ref = if (prefix == null) name else s"$prefix:$name" require(!usedRefs.contains(ref), s"Circular reference in $input: $ref") val replacement = bindings.get(prefix) .flatMap(_.get(name)) .map { v => substitute(v, usedRefs + ref) } .getOrElse(m.matched) Regex.quoteReplacement(replacement) }) } else { input } } }
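substitute resolves ${prefix:name} references by looking the name up in the bound maps and recursing, while the usedRefs set guards against cycles. A self-contained sketch of that loop over a single HashMap of bindings, dropping the prefix handling; the binding names are invented:

import scala.collection.mutable.HashMap
import scala.util.matching.Regex

object SubstitutionSketch {
  private val RefRe = "\\$\\{(\\w+)\\}".r

  def substitute(input: String, bindings: HashMap[String, String], used: Set[String] = Set()): String =
    RefRe.replaceAllIn(input, m => {
      val name = m.group(1)
      require(!used.contains(name), s"Circular reference in $input: $name")
      val replacement = bindings.get(name)
        .map(v => substitute(v, bindings, used + name))
        .getOrElse(m.matched) // unknown references are left untouched
      Regex.quoteReplacement(replacement)
    })

  def main(args: Array[String]): Unit = {
    val bindings = HashMap("user" -> "alice", "home" -> "/home/${user}")
    println(substitute("dir=${home}/data", bindings)) // dir=/home/alice/data
  }
}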
Example 80
Source File: StageInfo.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.RDDInfo def fromStage( stage: Stage, attemptId: Int, numTasks: Option[Int] = None, taskMetrics: TaskMetrics = null, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details, taskMetrics, taskLocalityPreferences) } }
Example 81
Source File: GroupedCountEvaluator.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] { private var outputsMerged = 0 private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap } } }
Example 82
Source File: MasterWebUISuite.scala From multi-tenancy-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import java.io.DataOutputStream import java.net.{HttpURLConnection, URL} import java.nio.charset.StandardCharsets import java.util.Date import scala.collection.mutable.HashMap import org.mockito.Mockito.{mock, times, verify, when} import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.deploy.DeployMessages.{KillDriverResponse, RequestKillDriver} import org.apache.spark.deploy.DeployTestUtils._ import org.apache.spark.deploy.master._ import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv} class MasterWebUISuite extends SparkFunSuite with BeforeAndAfterAll { val conf = new SparkConf val securityMgr = new SecurityManager(conf) val rpcEnv = mock(classOf[RpcEnv]) val master = mock(classOf[Master]) val masterEndpointRef = mock(classOf[RpcEndpointRef]) when(master.securityMgr).thenReturn(securityMgr) when(master.conf).thenReturn(conf) when(master.rpcEnv).thenReturn(rpcEnv) when(master.self).thenReturn(masterEndpointRef) val masterWebUI = new MasterWebUI(master, 0) override def beforeAll() { super.beforeAll() masterWebUI.bind() } override def afterAll() { masterWebUI.stop() super.afterAll() } test("kill application") { val appDesc = createAppDesc() // use new start date so it isn't filtered by UI val activeApp = new ApplicationInfo( new Date().getTime, "app-0", appDesc, new Date(), null, Int.MaxValue) when(master.idToApp).thenReturn(HashMap[String, ApplicationInfo]((activeApp.id, activeApp))) val url = s"http://localhost:${masterWebUI.boundPort}/app/kill/" val body = convPostDataToString(Map(("id", activeApp.id), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify the master was called to remove the active app verify(master, times(1)).removeApplication(activeApp, ApplicationState.KILLED) } test("kill driver") { val activeDriverId = "driver-0" val url = s"http://localhost:${masterWebUI.boundPort}/driver/kill/" val body = convPostDataToString(Map(("id", activeDriverId), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify that master was asked to kill driver with the correct id verify(masterEndpointRef, times(1)).ask[KillDriverResponse](RequestKillDriver(activeDriverId)) } private def convPostDataToString(data: Map[String, String]): String = { (for ((name, value) <- data) yield s"$name=$value").mkString("&") } private def sendHttpRequest( url: String, method: String, body: String = ""): HttpURLConnection = { val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] conn.setRequestMethod(method) if (body.nonEmpty) { conn.setDoOutput(true) conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded") conn.setRequestProperty("Content-Length", Integer.toString(body.length)) val out = new DataOutputStream(conn.getOutputStream) out.write(body.getBytes(StandardCharsets.UTF_8)) out.close() } conn } }
Example 83
Source File: LocalKMeans.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 84
Source File: BlockStoreShuffleFetcher.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.shuffle.hash import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import scala.util.{Failure, Success, Try} import org.apache.spark._ import org.apache.spark.serializer.Serializer import org.apache.spark.shuffle.FetchFailedException import org.apache.spark.storage.{BlockId, BlockManagerId, ShuffleBlockFetcherIterator, ShuffleBlockId} import org.apache.spark.util.CompletionIterator private[hash] object BlockStoreShuffleFetcher extends Logging { def fetch[T]( shuffleId: Int, reduceId: Int, context: TaskContext, serializer: Serializer) : Iterator[T] = { logDebug("Fetching outputs for shuffle %d, reduce %d".format(shuffleId, reduceId)) val blockManager = SparkEnv.get.blockManager val startTime = System.currentTimeMillis val statuses = SparkEnv.get.mapOutputTracker.getServerStatuses(shuffleId, reduceId) logDebug("Fetching map output location for shuffle %d, reduce %d took %d ms".format( shuffleId, reduceId, System.currentTimeMillis - startTime)) val splitsByAddress = new HashMap[BlockManagerId, ArrayBuffer[(Int, Long)]] for (((address, size), index) <- statuses.zipWithIndex) { splitsByAddress.getOrElseUpdate(address, ArrayBuffer()) += ((index, size)) } val blocksByAddress: Seq[(BlockManagerId, Seq[(BlockId, Long)])] = splitsByAddress.toSeq.map { case (address, splits) => (address, splits.map(s => (ShuffleBlockId(shuffleId, s._1, reduceId), s._2))) } def unpackBlock(blockPair: (BlockId, Try[Iterator[Any]])) : Iterator[T] = { val blockId = blockPair._1 val blockOption = blockPair._2 blockOption match { case Success(block) => { block.asInstanceOf[Iterator[T]] } case Failure(e) => { blockId match { case ShuffleBlockId(shufId, mapId, _) => val address = statuses(mapId.toInt)._1 throw new FetchFailedException(address, shufId.toInt, mapId.toInt, reduceId, e) case _ => throw new SparkException( "Failed to get block " + blockId + ", which is not a shuffle block", e) } } } } val blockFetcherItr = new ShuffleBlockFetcherIterator( context, SparkEnv.get.blockManager.shuffleClient, blockManager, blocksByAddress, serializer, // Note: we use getSizeAsMb when no suffix is provided for backwards compatibility SparkEnv.get.conf.getSizeAsMb("spark.reducer.maxSizeInFlight", "48m") * 1024 * 1024) val itr = blockFetcherItr.flatMap(unpackBlock) val completionIter = CompletionIterator[T, Iterator[T]](itr, { context.taskMetrics.updateShuffleReadMetrics() }) new InterruptibleIterator[T](context, completionIter) { val readMetrics = context.taskMetrics.createShuffleReadMetricsForDependency() override def next(): T = { readMetrics.incRecordsRead(1) delegate.next() } } } }
Example 85
Source File: ExecutorsTab.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.exec import scala.collection.mutable.HashMap import org.apache.spark.ExceptionFailure import org.apache.spark.annotation.DeveloperApi import org.apache.spark.scheduler._ import org.apache.spark.storage.{StorageStatus, StorageStatusListener} import org.apache.spark.ui.{SparkUI, SparkUITab} import org.apache.spark.ui.jobs.UIData.ExecutorUIData private[ui] class ExecutorsTab(parent: SparkUI) extends SparkUITab(parent, "executors") { val listener = parent.executorsListener val sc = parent.sc val threadDumpEnabled = sc.isDefined && parent.conf.getBoolean("spark.ui.threadDumpsEnabled", true) attachPage(new ExecutorsPage(this, threadDumpEnabled)) if (threadDumpEnabled) { attachPage(new ExecutorThreadDumpPage(this)) } } @DeveloperApi class ExecutorsListener(storageStatusListener: StorageStatusListener) extends SparkListener { val executorToTasksActive = HashMap[String, Int]() val executorToTasksComplete = HashMap[String, Int]() val executorToTasksFailed = HashMap[String, Int]() val executorToDuration = HashMap[String, Long]() val executorToInputBytes = HashMap[String, Long]() val executorToInputRecords = HashMap[String, Long]() val executorToOutputBytes = HashMap[String, Long]() val executorToOutputRecords = HashMap[String, Long]() val executorToShuffleRead = HashMap[String, Long]() val executorToShuffleWrite = HashMap[String, Long]() val executorToLogUrls = HashMap[String, Map[String, String]]() val executorIdToData = HashMap[String, ExecutorUIData]() def storageStatusList: Seq[StorageStatus] = storageStatusListener.storageStatusList override def onExecutorAdded(executorAdded: SparkListenerExecutorAdded): Unit = synchronized { val eid = executorAdded.executorId executorToLogUrls(eid) = executorAdded.executorInfo.logUrlMap executorIdToData(eid) = ExecutorUIData(executorAdded.time) } override def onExecutorRemoved( executorRemoved: SparkListenerExecutorRemoved): Unit = synchronized { val eid = executorRemoved.executorId val uiData = executorIdToData(eid) uiData.finishTime = Some(executorRemoved.time) uiData.finishReason = Some(executorRemoved.reason) } override def onTaskStart(taskStart: SparkListenerTaskStart): Unit = synchronized { val eid = taskStart.taskInfo.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 0) + 1 } override def onTaskEnd(taskEnd: SparkListenerTaskEnd): Unit = synchronized { val info = taskEnd.taskInfo if (info != null) { val eid = info.executorId executorToTasksActive(eid) = executorToTasksActive.getOrElse(eid, 1) - 1 executorToDuration(eid) = executorToDuration.getOrElse(eid, 0L) + info.duration taskEnd.reason match { case e: ExceptionFailure => executorToTasksFailed(eid) = executorToTasksFailed.getOrElse(eid, 0) + 1 case _ => executorToTasksComplete(eid) = executorToTasksComplete.getOrElse(eid, 0) + 1 } // Update shuffle read/write val metrics = taskEnd.taskMetrics if (metrics != null) { metrics.inputMetrics.foreach { inputMetrics => executorToInputBytes(eid) = executorToInputBytes.getOrElse(eid, 0L) + inputMetrics.bytesRead executorToInputRecords(eid) = executorToInputRecords.getOrElse(eid, 0L) + inputMetrics.recordsRead } metrics.outputMetrics.foreach { outputMetrics => executorToOutputBytes(eid) = executorToOutputBytes.getOrElse(eid, 0L) + outputMetrics.bytesWritten executorToOutputRecords(eid) = executorToOutputRecords.getOrElse(eid, 0L) + outputMetrics.recordsWritten } metrics.shuffleReadMetrics.foreach { shuffleRead => executorToShuffleRead(eid) = executorToShuffleRead.getOrElse(eid, 0L) + 
shuffleRead.remoteBytesRead } metrics.shuffleWriteMetrics.foreach { shuffleWrite => executorToShuffleWrite(eid) = executorToShuffleWrite.getOrElse(eid, 0L) + shuffleWrite.shuffleBytesWritten } } } } }
Example 86
Source File: UIData.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import org.apache.spark.JobExecutionStatus import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo} import org.apache.spark.util.collection.OpenHashSet import scala.collection.mutable.HashMap private[spark] object UIData { class ExecutorSummary { var taskTime : Long = 0 var failedTasks : Int = 0 var succeededTasks : Int = 0 var inputBytes : Long = 0 var inputRecords : Long = 0 var outputBytes : Long = 0 var outputRecords : Long = 0 var shuffleRead : Long = 0 var shuffleReadRecords : Long = 0 var shuffleWrite : Long = 0 var shuffleWriteRecords : Long = 0 var memoryBytesSpilled : Long = 0 var diskBytesSpilled : Long = 0 } class JobUIData( var jobId: Int = -1, var submissionTime: Option[Long] = None, var completionTime: Option[Long] = None, var stageIds: Seq[Int] = Seq.empty, var jobGroup: Option[String] = None, var status: JobExecutionStatus = JobExecutionStatus.UNKNOWN, case class TaskUIData( var taskInfo: TaskInfo, var taskMetrics: Option[TaskMetrics] = None, var errorMessage: Option[String] = None) case class ExecutorUIData( val startTime: Long, var finishTime: Option[Long] = None, var finishReason: Option[String] = None) }
Example 87
Source File: PoolTable.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import scala.collection.mutable.HashMap import scala.xml.Node import org.apache.spark.scheduler.{Schedulable, StageInfo} import org.apache.spark.ui.UIUtils private[ui] class PoolTable(pools: Seq[Schedulable], parent: StagesTab) { private val listener = parent.progressListener def toNodeSeq: Seq[Node] = { listener.synchronized { poolTable(poolRow, pools) } } private def poolTable( makeRow: (Schedulable, HashMap[String, HashMap[Int, StageInfo]]) => Seq[Node], rows: Seq[Schedulable]): Seq[Node] = { <table class="table table-bordered table-striped table-condensed sortable table-fixed"> <thead> <th>Pool Name</th> <th>Minimum Share</th> <th>Pool Weight</th> <th>Active Stages</th> <th>Running Tasks</th> <th>SchedulingMode</th> </thead> <tbody> {rows.map(r => makeRow(r, listener.poolToActiveStages))} </tbody> </table> } private def poolRow( p: Schedulable, poolToActiveStages: HashMap[String, HashMap[Int, StageInfo]]): Seq[Node] = { val activeStages = poolToActiveStages.get(p.name) match { case Some(stages) => stages.size case None => 0 } val href = "%s/stages/pool?poolname=%s" .format(UIUtils.prependBaseUri(parent.basePath), p.name) <tr> <td> <a href={href}>{p.name}</a> </td> <td>{p.minShare}</td> <td>{p.weight}</td> <td>{activeStages}</td> <td>{p.runningTasks}</td> <td>{p.schedulingMode}</td> </tr> } }
Example 88
Source File: StageInfo.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.RDDInfo def fromStage(stage: Stage, numTasks: Option[Int] = None): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, stage.attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details) } }
Example 89
Source File: GroupedSumEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
Example 90
Source File: GroupedCountEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new OpenHashMap[T, Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => result(key) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val confFactor = new NormalDistribution(). inverseCumulativeProbability(1 - (1 - confidence) / 2) val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => val mean = (sum + 1 - p) / p val variance = (sum + 1) * (1 - p) / (p * p) val stdev = math.sqrt(variance) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(key) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 91
Source File: GroupedMeanEvaluator.scala From iolap with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 92
Source File: FeatureSelection.scala From aerosolve with Apache License 2.0 | 5 votes |
package com.airbnb.aerosolve.training import java.io.BufferedWriter import java.io.OutputStreamWriter import java.util import com.airbnb.aerosolve.core.{ModelRecord, ModelHeader, FeatureVector, Example} import com.airbnb.aerosolve.core.models.LinearModel import com.airbnb.aerosolve.core.util.Util import com.typesafe.config.Config import org.slf4j.{LoggerFactory, Logger} import org.apache.spark.SparkContext import org.apache.spark.SparkContext._ import org.apache.spark.rdd.RDD import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.Buffer import scala.collection.JavaConversions._ import scala.collection.JavaConverters._ import scala.util.Random import scala.math.abs import org.apache.hadoop.fs.FileSystem import org.apache.hadoop.fs.Path import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.Path object FeatureSelection { private final val log: Logger = LoggerFactory.getLogger("FeatureSelection") val allKey : (String, String) = ("$ALL", "$POS") // Given a RDD compute the pointwise mutual information between // the positive label and the discrete features. def pointwiseMutualInformation(examples : RDD[Example], config : Config, key : String, rankKey : String, posThreshold : Double, minPosCount : Double, newCrosses : Boolean) : RDD[((String, String), Double)] = { val pointwise = LinearRankerUtils.makePointwise(examples, config, key, rankKey) val features = pointwise .mapPartitions(part => { // The tuple2 is var, var | positive val output = scala.collection.mutable.HashMap[(String, String), (Double, Double)]() part.foreach(example =>{ val featureVector = example.example.get(0) val isPos = if (featureVector.floatFeatures.get(rankKey).asScala.head._2 > posThreshold) 1.0 else 0.0 val all : (Double, Double) = output.getOrElse(allKey, (0.0, 0.0)) output.put(allKey, (all._1 + 1.0, all._2 + 1.0 * isPos)) val features : Array[(String, String)] = LinearRankerUtils.getFeatures(featureVector) if (newCrosses) { for (i <- features) { for (j <- features) { if (i._1 < j._1) { val key = ("%s<NEW>%s".format(i._1, j._1), "%s<NEW>%s".format(i._2, j._2)) val x = output.getOrElse(key, (0.0, 0.0)) output.put(key, (x._1 + 1.0, x._2 + 1.0 * isPos)) } } } } for (feature <- features) { val x = output.getOrElse(feature, (0.0, 0.0)) output.put(feature, (x._1 + 1.0, x._2 + 1.0 * isPos)) } }) output.iterator }) .reduceByKey((a, b) => (a._1 + b._1, a._2 + b._2)) .filter(x => x._2._2 >= minPosCount) val allCount = features.filter(x => x._1.equals(allKey)).take(1).head features.map(x => { val prob = x._2._1 / allCount._2._1 val probPos = x._2._2 / allCount._2._2 (x._1, math.log(probPos / prob) / math.log(2.0)) }) } // Returns the maximum entropy per family def maxEntropy(input : RDD[((String, String), Double)]) : RDD[((String, String), Double)] = { input .map(x => (x._1._1, (x._1._2, x._2))) .reduceByKey((a, b) => if (math.abs(a._2) > math.abs(b._2)) a else b) .map(x => ((x._1, x._2._1), x._2._2)) } }
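Inside each partition the job accumulates a (count, positiveCount) pair per feature, plus a global pair under allKey, and the pointwise mutual information is log2(P(feature | positive) / P(feature)). A local, RDD-free sketch of that accumulation and formula; the feature tuples and labels are invented and the minPosCount filter is omitted:

import scala.collection.mutable.HashMap

object PmiSketch {
  val allKey: (String, String) = ("$ALL", "$POS")

  def pmi(examples: Seq[(Seq[(String, String)], Boolean)]): Map[(String, String), Double] = {
    val counts = HashMap[(String, String), (Double, Double)]() // (count, positive count)
    examples.foreach { case (features, isPos) =>
      val pos = if (isPos) 1.0 else 0.0
      val all = counts.getOrElse(allKey, (0.0, 0.0))
      counts(allKey) = (all._1 + 1.0, all._2 + pos)
      features.foreach { f =>
        val x = counts.getOrElse(f, (0.0, 0.0))
        counts(f) = (x._1 + 1.0, x._2 + pos)
      }
    }
    val (totalAll, totalPos) = counts(allKey)
    counts.collect { case (f, (n, nPos)) if f != allKey =>
      f -> math.log((nPos / totalPos) / (n / totalAll)) / math.log(2.0)
    }.toMap
  }

  def main(args: Array[String]): Unit =
    println(pmi(Seq(
      (Seq(("color", "red")), true),
      (Seq(("color", "red")), false),
      (Seq(("color", "blue")), true)
    )))
}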
Example 93
Source File: package.scala From kyuubi with Apache License 2.0 | 5 votes |
package yaooqinn.kyuubi import scala.collection.mutable.HashMap import org.apache.hadoop.fs.permission.FsPermission package object yarn { type EnvMap = HashMap[String, String] val KYUUBI_YARN_APP_NAME = "KYUUBI SERVER" val KYUUBI_YARN_APP_TYPE = "KYUUBI" // Staging directory for any temporary jars or files val KYUUBI_STAGING: String = ".kyuubiStaging" // Staging directory is private! -> rwx-------- val STAGING_DIR_PERMISSION: FsPermission = FsPermission.createImmutable(Integer.parseInt("700", 8).toShort) // App files are world-wide readable and owner writable -> rw-r--r-- val APP_FILE_PERMISSION: FsPermission = FsPermission.createImmutable(Integer.parseInt("644", 8).toShort) val SPARK_CONF_DIR = "__spark_conf__" val SPARK_CONF_FILE = "__spark_conf__.properties" // Subdirectory in the conf directory containing Hadoop config files. val HADOOP_CONF_DIR = "__hadoop_conf__" // File containing the conf archive in the AM. See prepareLocalResources(). val SPARK_CONF_ARCHIVE: String = SPARK_CONF_DIR + ".zip" val SPARK_LIB_DIR = "__spark_libs__" val LOCAL_SCHEME = "local" }
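EnvMap is just a type alias for mutable.HashMap[String, String]. A sketch of how such an alias might be filled when assembling a launch environment; the helper and the CLASSPATH entries are illustrative only, not part of Kyuubi:

import scala.collection.mutable.HashMap

object EnvMapSketch {
  type EnvMap = HashMap[String, String]

  // append with ':' when the variable already has a value, PATH-style
  def addPathEntry(env: EnvMap, key: String, value: String): Unit = {
    env(key) = env.get(key).map(_ + ":" + value).getOrElse(value)
  }

  def main(args: Array[String]): Unit = {
    val env: EnvMap = new HashMap[String, String]()
    addPathEntry(env, "CLASSPATH", "/opt/kyuubi/jars/*")
    addPathEntry(env, "CLASSPATH", "/etc/hadoop/conf")
    println(env("CLASSPATH")) // /opt/kyuubi/jars/*:/etc/hadoop/conf
  }
}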
Example 94
Source File: KyuubiDistributedCacheManager.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.net.URI import scala.collection.mutable.{HashMap, Map} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType} def addResource( fs: FileSystem, conf: Configuration, destPath: Path, localResources: HashMap[String, LocalResource], resourceType: LocalResourceType, link: String, statCache: Map[URI, FileStatus]): Unit = { cacheManager.addResource(fs, conf, destPath, localResources, resourceType, link, statCache, appMasterOnly = true) } }
Example 95
Source File: KyuubiDistributedCacheManagerSuite.scala From kyuubi with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import java.net.URI import scala.collection.mutable.{HashMap, Map} import org.apache.hadoop.conf.Configuration import org.apache.hadoop.fs.{FileStatus, FileSystem, Path} import org.apache.hadoop.yarn.api.records.{LocalResource, LocalResourceType, LocalResourceVisibility} import org.apache.hadoop.yarn.util.ConverterUtils import org.apache.spark.{KyuubiSparkUtil, SparkFunSuite} import org.mockito.Mockito.when import org.scalatest.mock.MockitoSugar import yaooqinn.kyuubi.utils.ReflectUtils class KyuubiDistributedCacheManagerSuite extends SparkFunSuite with MockitoSugar { class MockClientDistributedCacheManager extends ClientDistributedCacheManager { override def getVisibility(conf: Configuration, uri: URI, statCache: Map[URI, FileStatus]): LocalResourceVisibility = { LocalResourceVisibility.PRIVATE } } test("add resource") { val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() val status = new FileStatus() when(fs.getFileStatus(destPath)).thenReturn(status) val fileLink = "link" ReflectUtils.setFieldValue( KyuubiDistributedCacheManager, "cacheManager", new MockClientDistributedCacheManager) KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.FILE, fileLink, statCache) val res = localResources(fileLink) assert(res.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(res.getResource) === destPath) assert(res.getSize === 0) assert(res.getTimestamp === 0) assert(res.getType === LocalResourceType.FILE) val status2 = new FileStatus( 10, false, 1, 1024, 10, 10, null, KyuubiSparkUtil.getCurrentUserName, null, new Path("/tmp/testing2")) val destPath2 = new Path("file:///foo.bar.com:8080/tmp/testing2") when(fs.getFileStatus(destPath2)).thenReturn(status2) val fileLink2 = "link2" KyuubiDistributedCacheManager.addResource( fs, conf, destPath2, localResources, LocalResourceType.FILE, fileLink2, statCache) val res2 = localResources(fileLink2) assert(res2.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(res2.getResource) === destPath2) assert(res2.getSize === 10) assert(res2.getTimestamp === 10) assert(res2.getType === LocalResourceType.FILE) } test("add resource when link null") { val distMgr = new MockClientDistributedCacheManager() val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr) val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() when(fs.getFileStatus(destPath)).thenReturn(new FileStatus()) intercept[Exception] { KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.FILE, null, statCache) } assert(localResources.get("link") === None) assert(localResources.size === 0) } test("test addResource archive") { val distMgr = new MockClientDistributedCacheManager() ReflectUtils.setFieldValue(KyuubiDistributedCacheManager, "cacheManager", distMgr) val fs = mock[FileSystem] val conf = new Configuration() val destPath = new Path("file:///foo.bar.com:8080/tmp/testing") val localResources = HashMap[String, LocalResource]() val statCache = HashMap[URI, FileStatus]() val realFileStatus = new FileStatus(10, false, 1, 1024, 10, 10, null, "testOwner", null, new Path("/tmp/testing")) when(fs.getFileStatus(destPath)).thenReturn(realFileStatus) KyuubiDistributedCacheManager.addResource( fs, conf, destPath, localResources, LocalResourceType.ARCHIVE, "link", statCache) val resource = localResources("link") assert(resource.getVisibility === LocalResourceVisibility.PRIVATE) assert(ConverterUtils.getPathFromYarnURL(resource.getResource) === destPath) assert(resource.getTimestamp === 10) assert(resource.getSize === 10) assert(resource.getType === LocalResourceType.ARCHIVE) } }
Example 96
Source File: UIData.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import org.apache.spark.JobExecutionStatus import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo} import org.apache.spark.util.collection.OpenHashSet import scala.collection.mutable import scala.collection.mutable.HashMap private[spark] object UIData { class ExecutorSummary { var taskTime : Long = 0 // task time var failedTasks : Int = 0 // number of failed tasks var succeededTasks : Int = 0 // number of succeeded tasks var inputBytes : Long = 0 var inputRecords : Long = 0 var outputBytes : Long = 0 var outputRecords : Long = 0 var shuffleRead : Long = 0 var shuffleReadRecords : Long = 0 var shuffleWrite : Long = 0 var shuffleWriteRecords : Long = 0 var memoryBytesSpilled : Long = 0 var diskBytesSpilled : Long = 0 } class JobUIData( var jobId: Int = -1, var submissionTime: Option[Long] = None, // submission time var completionTime: Option[Long] = None, // completion time var stageIds: Seq[Int] = Seq.empty, var jobGroup: Option[String] = None, var status: JobExecutionStatus = JobExecutionStatus.UNKNOWN, case class TaskUIData( var taskInfo: TaskInfo, var taskMetrics: Option[TaskMetrics] = None, var errorMessage: Option[String] = None) case class ExecutorUIData( val startTime: Long, var finishTime: Option[Long] = None, var finishReason: Option[String] = None) }
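UI classes like ExecutorSummary are typically held in a mutable HashMap keyed by executor id so counters can be updated in place as task events arrive. The sketch below illustrates that getOrElseUpdate pattern with a trimmed-down summary class; ExecSummary and the two fields kept here are illustrative, not the full Spark class.

import scala.collection.mutable.HashMap

// Minimal stand-in for ExecutorSummary; only two of the counters are kept.
class ExecSummary { var taskTime: Long = 0L; var succeededTasks: Int = 0 }

object ExecutorSummarySketch {
  def main(args: Array[String]): Unit = {
    val byExecutor = HashMap[String, ExecSummary]()
    // getOrElseUpdate creates the summary the first time an executor id is seen.
    val s = byExecutor.getOrElseUpdate("exec-1", new ExecSummary)
    s.taskTime += 120L
    s.succeededTasks += 1
    println(s"exec-1 ran ${byExecutor("exec-1").succeededTasks} task(s)")
  }
}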
Example 97
Source File: PoolTable.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import scala.collection.mutable.HashMap import scala.xml.Node import org.apache.spark.scheduler.{Schedulable, StageInfo} import org.apache.spark.ui.UIUtils private[ui] class PoolTable(pools: Seq[Schedulable], parent: StagesTab) { private val listener = parent.progressListener def toNodeSeq: Seq[Node] = { listener.synchronized { poolTable(poolRow, pools) } } private def poolTable( makeRow: (Schedulable, HashMap[String, HashMap[Int, StageInfo]]) => Seq[Node], rows: Seq[Schedulable]): Seq[Node] = { <table class="table table-bordered table-striped table-condensed sortable table-fixed"> <thead> <th>Pool Name</th> <th>Minimum Share</th> <th>Pool Weight</th> <th>Active Stages</th> <th>Running Tasks</th> <th>SchedulingMode</th> </thead> <tbody> {rows.map(r => makeRow(r, listener.poolToActiveStages))} </tbody> </table> } private def poolRow( p: Schedulable, poolToActiveStages: HashMap[String, HashMap[Int, StageInfo]]): Seq[Node] = { val activeStages = poolToActiveStages.get(p.name) match { case Some(stages) => stages.size case None => 0 } val href = "%s/stages/pool?poolname=%s" .format(UIUtils.prependBaseUri(parent.basePath), p.name) <tr> <td> <a href={href}>{p.name}</a> </td> <td>{p.minShare}</td> <td>{p.weight}</td> <td>{activeStages}</td> <td>{p.runningTasks}</td> <td>{p.schedulingMode}</td> </tr> } }
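PoolTable reads a nested HashMap[String, HashMap[Int, StageInfo]] and treats a missing pool as having zero active stages. A small sketch of that Option-based lookup, with stage names standing in for StageInfo objects:

import scala.collection.mutable.HashMap

object PoolLookupSketch {
  def main(args: Array[String]): Unit = {
    // Simplified: stage id -> stage name stands in for Int -> StageInfo.
    val poolToActiveStages = HashMap[String, HashMap[Int, String]]()
    poolToActiveStages("default") = HashMap(1 -> "stage-1", 2 -> "stage-2")

    // Same shape as poolRow: get returns an Option, missing pools count as 0.
    val activeStages = poolToActiveStages.get("default") match {
      case Some(stages) => stages.size
      case None         => 0
    }
    println(s"default pool has $activeStages active stage(s)")
  }
}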
Example 98
Source File: StageInfo.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.RDDInfo def fromStage( stage: Stage, attemptId: Int, // None is an object rather than a class: use None when there is no value and Some to wrap a value when there is one; both are subclasses of Option numTasks: Option[Int] = None, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty ): StageInfo = { // getNarrowAncestors returns every RDD this stage's RDD depends on through direct or indirect narrow dependencies // RDDInfo.fromRdd builds an RDDInfo, including the RDD's parent dependency information val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) // also build an RDDInfo for the current stage's RDD, then combine all RDDInfos into rddInfos val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details, taskLocalityPreferences) } }
Example 99
Source File: GroupedSumEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result(entry.getKey) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result(entry.getKey) = new BoundedDouble(sumEstimate, confidence, low, high) } result } } }
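The evaluator's merge step folds each task's per-key partial results into a running map, inserting the key on first sight and merging otherwise. A simplified sketch of that merge-or-insert step, using plain Double sums instead of StatCounter:

import scala.collection.mutable.HashMap

object GroupedMergeSketch {
  // Merge one task's per-key partial sums into the running totals,
  // the same merge-or-insert step the evaluator performs per entry.
  def merge(totals: HashMap[String, Double], taskResult: Map[String, Double]): Unit =
    taskResult.foreach { case (k, v) =>
      totals(k) = totals.getOrElse(k, 0.0) + v
    }

  def main(args: Array[String]): Unit = {
    val totals = HashMap[String, Double]()
    merge(totals, Map("a" -> 1.5, "b" -> 2.0))
    merge(totals, Map("a" -> 0.5))
    println(totals) // HashMap(a -> 2.0, b -> 2.0)
  }
}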
Example 100
Source File: GroupedCountEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new OpenHashMap[T, Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => result(key) = new BoundedDouble(sum, 1.0, sum, sum) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val confFactor = new NormalDistribution(). inverseCumulativeProbability(1 - (1 - confidence) / 2) val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => val mean = (sum + 1 - p) / p val variance = (sum + 1) * (1 - p) / (p * p) val stdev = math.sqrt(variance) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(key) = new BoundedDouble(mean, confidence, low, high) } result } } }
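When only a fraction p of the outputs has been merged, the evaluator extrapolates each partial count to an estimate with a normal-approximation confidence interval. The sketch below reproduces those mean/variance formulas with a fixed z value of 1.96 standing in for the exact quantile that the real code obtains from commons-math:

object CountExtrapolationSketch {
  // p is the fraction of outputs seen so far; sum is the partial count for one key.
  // Mirrors the mean/variance formulas in GroupedCountEvaluator above.
  def bound(sum: Long, p: Double): (Double, Double, Double) = {
    val mean = (sum + 1 - p) / p
    val stdev = math.sqrt((sum + 1) * (1 - p) / (p * p))
    val z = 1.96 // ~ normal quantile for 95% confidence; the real code computes it exactly
    (mean, mean - z * stdev, mean + z * stdev)
  }

  def main(args: Array[String]): Unit = {
    val (mean, low, high) = bound(40L, 0.5)
    println(f"estimate $mean%.1f in [$low%.1f, $high%.1f]")
  }
}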
Example 101
Source File: GroupedMeanEvaluator.scala From spark1.52 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConversions.mapAsScalaMap import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result(entry.getKey) = new BoundedDouble(mean, 1.0, mean, mean) } result } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result(entry.getKey) = new BoundedDouble(mean, confidence, low, high) } result } } }
Example 102
Source File: Mapper.scala From CSYE7200 with MIT License | 5 votes |
package edu.neu.coe.csye7200.mapreduce import akka.actor.{Actor, ActorLogging, ActorRef} import scala.collection.mutable import scala.collection.mutable.HashMap import scala.util._ class Mapper_Forgiving[K1,V1,K2,V2](f: (K1,V1)=>(K2,V2)) extends Mapper[K1,V1,K2,V2](f) { override def prepareReply(v2k2ts: Seq[Try[(K2,V2)]]): (Map[K2, Seq[V2]], Seq[Throwable]) = { val v2sK2m = mutable.HashMap[K2,Seq[V2]]() // mutable val xs = Seq[Throwable]() // mutable // CONSIDER using traverse for (v2k2t <- v2k2ts; v2k2e = Master.sequence(v2k2t)) v2k2e match { case Right((k2,v2)) => v2sK2m put(k2, v2+:v2sK2m.getOrElse(k2, Nil)) case Left(x) => xs :+ x } (v2sK2m.toMap, xs) } } case class Incoming[K, V](m: Seq[(K,V)]) { override def toString = s"Incoming: with ${m.size} elements" } object Incoming { def sequence[K,V](vs: Seq[V]): Incoming[K,V] = Incoming((vs zip Stream.continually(null.asInstanceOf[K])).map{_.swap}) def map[K, V](vKm: Map[K,V]): Incoming[K,V] = Incoming(vKm.toSeq) } object Mapper { }
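prepareReply groups successful (K2, V2) pairs by key into a mutable HashMap while collecting failures on the side. Note that in the code above xs is an immutable Seq and the result of xs :+ x is discarded, so throwables are silently dropped; the sketch below uses a ListBuffer so the errors actually accumulate. It is a standalone illustration, not the Mapper class itself:

import scala.collection.mutable
import scala.util.{Failure, Success, Try}

object PrepareReplySketch {
  // Group successful (K, V) pairs by key and collect failures separately.
  def prepareReply[K, V](results: Seq[Try[(K, V)]]): (Map[K, Seq[V]], Seq[Throwable]) = {
    val grouped = mutable.HashMap[K, Seq[V]]()
    val errors  = mutable.ListBuffer[Throwable]()
    results.foreach {
      case Success((k, v)) => grouped(k) = v +: grouped.getOrElse(k, Nil)
      case Failure(x)      => errors += x
    }
    (grouped.toMap, errors.toList)
  }

  def main(args: Array[String]): Unit = {
    val in = Seq(Success("a" -> 1), Success("a" -> 2), Failure(new RuntimeException("bad")))
    println(prepareReply(in)) // (Map(a -> List(2, 1)), List(java.lang.RuntimeException: bad))
  }
}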
Example 103
Source File: LocalKMeans.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{squaredDistance, DenseVector, Vector} object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D) {rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers(i) val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use org.apache.spark.ml.clustering.KMeans |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData val points = new HashSet[Vector[Double]] val kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println(s"Initial centers: $kPoints") while(tempDist > convergeDist) { val closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) val mappings = closest.groupBy[Int] (x => x._1) val pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints(mapping._1), mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println(s"Final centers: $kPoints") } } // scalastyle:on println
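The k-means loop keeps its centers in a HashMap[Int, Vector[Double]] keyed 1..K and scans them for the nearest one. A self-contained version of that scan using plain arrays instead of breeze vectors (squaredDistance here is a simple re-implementation, not the breeze function):

import scala.collection.mutable.HashMap

object ClosestCenterSketch {
  def squaredDistance(a: Array[Double], b: Array[Double]): Double =
    a.zip(b).map { case (x, y) => (x - y) * (x - y) }.sum

  // Same scan as closestPoint above, with plain arrays instead of breeze vectors.
  def closestPoint(p: Array[Double], centers: HashMap[Int, Array[Double]]): Int = {
    var bestIndex = 0
    var closest = Double.PositiveInfinity
    for (i <- 1 to centers.size) {
      val d = squaredDistance(p, centers(i))
      if (d < closest) { closest = d; bestIndex = i }
    }
    bestIndex
  }

  def main(args: Array[String]): Unit = {
    val centers = HashMap(1 -> Array(0.0, 0.0), 2 -> Array(5.0, 5.0))
    println(closestPoint(Array(4.0, 4.5), centers)) // 2
  }
}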
Example 104
Source File: LocalityPlacementStrategySuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.yarn import scala.collection.JavaConverters._ import scala.collection.mutable.{HashMap, HashSet, Set} import org.apache.hadoop.yarn.api.records._ import org.apache.hadoop.yarn.conf.YarnConfiguration import org.mockito.Mockito._ import org.apache.spark.{SparkConf, SparkFunSuite} class LocalityPlacementStrategySuite extends SparkFunSuite { test("handle large number of containers and tasks (SPARK-18750)") { // Run the test in a thread with a small stack size, since the original issue // surfaced as a StackOverflowError. var error: Throwable = null val runnable = new Runnable() { override def run(): Unit = try { runTest() } catch { case e: Throwable => error = e } } val thread = new Thread(new ThreadGroup("test"), runnable, "test-thread", 32 * 1024) thread.start() thread.join() assert(error === null) } private def runTest(): Unit = { val yarnConf = new YarnConfiguration() // The numbers below have been chosen to balance being large enough to replicate the // original issue while not taking too long to run when the issue is fixed. The main // goal is to create enough requests for localized containers (so there should be many // tasks on several hosts that have no allocated containers). val resource = Resource.newInstance(8 * 1024, 4) val strategy = new LocalityPreferredContainerPlacementStrategy(new SparkConf(), yarnConf, resource, new MockResolver()) val totalTasks = 32 * 1024 val totalContainers = totalTasks / 16 val totalHosts = totalContainers / 16 val mockId = mock(classOf[ContainerId]) val hosts = (1 to totalHosts).map { i => (s"host_$i", totalTasks % i) }.toMap val containers = (1 to totalContainers).map { i => mockId } val count = containers.size / hosts.size / 2 val hostToContainerMap = new HashMap[String, Set[ContainerId]]() hosts.keys.take(hosts.size / 2).zipWithIndex.foreach { case (host, i) => val hostContainers = new HashSet[ContainerId]() containers.drop(count * i).take(i).foreach { c => hostContainers += c } hostToContainerMap(host) = hostContainers } strategy.localityOfRequestedContainers(containers.size * 2, totalTasks, hosts, hostToContainerMap, Nil) } }
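The suite builds hostToContainerMap, a HashMap from host name to a mutable Set of container ids, before handing it to the placement strategy. A small sketch of that map construction with strings standing in for YARN ContainerIds:

import scala.collection.mutable.{HashMap, HashSet, Set}

object HostContainerMapSketch {
  def main(args: Array[String]): Unit = {
    // Strings stand in for YARN ContainerIds; the shape of the map matches the suite above.
    val containers = (1 to 6).map(i => s"container_$i")
    val hostToContainerMap = new HashMap[String, Set[String]]()
    Seq("host_1", "host_2").zipWithIndex.foreach { case (host, i) =>
      val hostContainers = new HashSet[String]()
      containers.drop(3 * i).take(3).foreach(hostContainers += _)
      hostToContainerMap(host) = hostContainers
    }
    hostToContainerMap.foreach { case (h, cs) => println(s"$h -> $cs") }
  }
}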
Example 105
Source File: JsonUtils.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kafka010 import scala.collection.mutable.HashMap import scala.util.control.NonFatal import org.apache.kafka.common.TopicPartition import org.json4s.NoTypeHints import org.json4s.jackson.Serialization def partitionOffsets(partitionOffsets: Map[TopicPartition, Long]): String = { val result = new HashMap[String, HashMap[Int, Long]]() implicit val ordering = new Ordering[TopicPartition] { override def compare(x: TopicPartition, y: TopicPartition): Int = { Ordering.Tuple2[String, Int].compare((x.topic, x.partition), (y.topic, y.partition)) } } val partitions = partitionOffsets.keySet.toSeq.sorted // sort for more determinism partitions.foreach { tp => val off = partitionOffsets(tp) val parts = result.getOrElse(tp.topic, new HashMap[Int, Long]) parts += tp.partition -> off result += tp.topic -> parts } Serialization.write(result) } }
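partitionOffsets flattens a Map[TopicPartition, Long] into a nested HashMap keyed by topic and then by partition, sorting first so the JSON output is deterministic. A sketch of that grouping with (topic, partition) tuples in place of Kafka's TopicPartition:

import scala.collection.mutable.HashMap

object PartitionOffsetsSketch {
  // Group (topic, partition) -> offset into topic -> (partition -> offset),
  // the same nesting JsonUtils.partitionOffsets builds before serializing.
  def group(offsets: Map[(String, Int), Long]): HashMap[String, HashMap[Int, Long]] = {
    val result = new HashMap[String, HashMap[Int, Long]]()
    offsets.toSeq.sortBy { case ((t, p), _) => (t, p) }.foreach { case ((topic, part), off) =>
      val parts = result.getOrElse(topic, new HashMap[Int, Long])
      parts += part -> off
      result += topic -> parts
    }
    result
  }

  def main(args: Array[String]): Unit = {
    println(group(Map(("t1", 0) -> 10L, ("t1", 1) -> 20L, ("t2", 0) -> 5L)))
  }
}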
Example 106
Source File: StageInfo.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.executor.TaskMetrics import org.apache.spark.storage.RDDInfo def fromStage( stage: Stage, attemptId: Int, numTasks: Option[Int] = None, taskMetrics: TaskMetrics = null, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details, taskMetrics, taskLocalityPreferences) } }
Example 107
Source File: GroupedCountEvaluator.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] { private var outputsMerged = 0 private val sums = new OpenHashMap[T, Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]): Unit = { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { sums.map { case (key, sum) => (key, new BoundedDouble(sum, 1.0, sum, sum)) }.toMap } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs sums.map { case (key, sum) => (key, CountEvaluator.bound(confidence, sum, p)) }.toMap } } }
Example 108
Source File: MasterWebUISuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.deploy.master.ui import java.io.DataOutputStream import java.net.{HttpURLConnection, URL} import java.nio.charset.StandardCharsets import java.util.Date import scala.collection.mutable.HashMap import org.mockito.Mockito.{mock, times, verify, when} import org.scalatest.BeforeAndAfterAll import org.apache.spark.{SecurityManager, SparkConf, SparkFunSuite} import org.apache.spark.deploy.DeployMessages.{KillDriverResponse, RequestKillDriver} import org.apache.spark.deploy.DeployTestUtils._ import org.apache.spark.deploy.master._ import org.apache.spark.rpc.{RpcEndpointRef, RpcEnv} class MasterWebUISuite extends SparkFunSuite with BeforeAndAfterAll { val conf = new SparkConf val securityMgr = new SecurityManager(conf) val rpcEnv = mock(classOf[RpcEnv]) val master = mock(classOf[Master]) val masterEndpointRef = mock(classOf[RpcEndpointRef]) when(master.securityMgr).thenReturn(securityMgr) when(master.conf).thenReturn(conf) when(master.rpcEnv).thenReturn(rpcEnv) when(master.self).thenReturn(masterEndpointRef) val masterWebUI = new MasterWebUI(master, 0) override def beforeAll() { super.beforeAll() masterWebUI.bind() } override def afterAll() { masterWebUI.stop() super.afterAll() } test("kill application") { val appDesc = createAppDesc() // use new start date so it isn't filtered by UI val activeApp = new ApplicationInfo( new Date().getTime, "app-0", appDesc, new Date(), null, Int.MaxValue) when(master.idToApp).thenReturn(HashMap[String, ApplicationInfo]((activeApp.id, activeApp))) val url = s"http://localhost:${masterWebUI.boundPort}/app/kill/" val body = convPostDataToString(Map(("id", activeApp.id), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify the master was called to remove the active app verify(master, times(1)).removeApplication(activeApp, ApplicationState.KILLED) } test("kill driver") { val activeDriverId = "driver-0" val url = s"http://localhost:${masterWebUI.boundPort}/driver/kill/" val body = convPostDataToString(Map(("id", activeDriverId), ("terminate", "true"))) val conn = sendHttpRequest(url, "POST", body) conn.getResponseCode // Verify that master was asked to kill driver with the correct id verify(masterEndpointRef, times(1)).ask[KillDriverResponse](RequestKillDriver(activeDriverId)) } private def convPostDataToString(data: Map[String, String]): String = { (for ((name, value) <- data) yield s"$name=$value").mkString("&") } private def sendHttpRequest( url: String, method: String, body: String = ""): HttpURLConnection = { val conn = new URL(url).openConnection().asInstanceOf[HttpURLConnection] conn.setRequestMethod(method) if (body.nonEmpty) { conn.setDoOutput(true) conn.setRequestProperty("Content-Type", "application/x-www-form-urlencoded") conn.setRequestProperty("Content-Length", Integer.toString(body.length)) val out = new DataOutputStream(conn.getOutputStream) out.write(body.getBytes(StandardCharsets.UTF_8)) out.close() } conn } }
Example 109
Source File: TaskDescriptionSuite.scala From Spark-2.3.1 with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import java.io.{ByteArrayOutputStream, DataOutputStream, UTFDataFormatException} import java.nio.ByteBuffer import java.util.Properties import scala.collection.mutable.HashMap import org.apache.spark.SparkFunSuite class TaskDescriptionSuite extends SparkFunSuite { test("encoding and then decoding a TaskDescription results in the same TaskDescription") { val originalFiles = new HashMap[String, Long]() originalFiles.put("fileUrl1", 1824) originalFiles.put("fileUrl2", 2) val originalJars = new HashMap[String, Long]() originalJars.put("jar1", 3) val originalProperties = new Properties() originalProperties.put("property1", "18") originalProperties.put("property2", "test value") // SPARK-19796 -- large property values (like a large job description for a long sql query) // can cause problems for DataOutputStream, make sure we handle correctly val sb = new StringBuilder() (0 to 10000).foreach(_ => sb.append("1234567890")) val largeString = sb.toString() originalProperties.put("property3", largeString) // make sure we've got a good test case intercept[UTFDataFormatException] { val out = new DataOutputStream(new ByteArrayOutputStream()) try { out.writeUTF(largeString) } finally { out.close() } } // Create a dummy byte buffer for the task. val taskBuffer = ByteBuffer.wrap(Array[Byte](1, 2, 3, 4)) val originalTaskDescription = new TaskDescription( taskId = 1520589, attemptNumber = 2, executorId = "testExecutor", name = "task for test", index = 19, originalFiles, originalJars, originalProperties, taskBuffer ) val serializedTaskDescription = TaskDescription.encode(originalTaskDescription) val decodedTaskDescription = TaskDescription.decode(serializedTaskDescription) // Make sure that all of the fields in the decoded task description match the original. assert(decodedTaskDescription.taskId === originalTaskDescription.taskId) assert(decodedTaskDescription.attemptNumber === originalTaskDescription.attemptNumber) assert(decodedTaskDescription.executorId === originalTaskDescription.executorId) assert(decodedTaskDescription.name === originalTaskDescription.name) assert(decodedTaskDescription.index === originalTaskDescription.index) assert(decodedTaskDescription.addedFiles.equals(originalFiles)) assert(decodedTaskDescription.addedJars.equals(originalJars)) assert(decodedTaskDescription.properties.equals(originalTaskDescription.properties)) assert(decodedTaskDescription.serializedTask.equals(taskBuffer)) } }
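The assertions rely on the fact that mutable HashMaps compare by value, so the decoded addedFiles/addedJars maps are equal to the originals as long as they hold the same entries. A tiny sketch of that equality check:

import scala.collection.mutable.HashMap

object HashMapEqualitySketch {
  def main(args: Array[String]): Unit = {
    val originalFiles = new HashMap[String, Long]()
    originalFiles.put("fileUrl1", 1824L)
    originalFiles.put("fileUrl2", 2L)

    // A decoded copy with the same entries compares equal by value,
    // which is what the addedFiles/addedJars assertions above rely on.
    val decodedFiles = HashMap("fileUrl2" -> 2L, "fileUrl1" -> 1824L)
    assert(decodedFiles.equals(originalFiles))
    println("maps are equal: " + (decodedFiles == originalFiles))
  }
}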
Example 110
Source File: KinesisSourceOffset.scala From kinesis-sql with Apache License 2.0 | 5 votes |
package org.apache.spark.sql.kinesis import org.json4s.NoTypeHints import org.json4s.jackson.Serialization import scala.collection.mutable.HashMap import scala.util.control.NonFatal import org.apache.spark.sql.execution.streaming.Offset import org.apache.spark.sql.execution.streaming.SerializedOffset import org.apache.spark.sql.sources.v2.reader.streaming.{Offset => OffsetV2, PartitionOffset} def apply(json: String): KinesisSourceOffset = { try { val readObj = Serialization.read[ Map[ String, Map[ String, String ] ] ](json) val metadata = readObj.get("metadata") val shardInfoMap: Map[String, ShardInfo ] = readObj.filter(_._1 != "metadata").map { case (shardId, value) => shardId.toString -> new ShardInfo(shardId.toString, value.get("iteratorType").get, value.get("iteratorPosition").get) }.toMap KinesisSourceOffset( new ShardOffsets( metadata.get("batchId").toLong, metadata.get("streamName"), shardInfoMap)) } catch { case NonFatal(x) => throw new IllegalArgumentException(x) } } def getMap(shardInfos: Array[ShardInfo]): Map[String, ShardInfo] = { shardInfos.map { s: ShardInfo => (s.shardId -> s) }.toMap } }
Example 111
Source File: LocalKMeans.scala From BigDatalog with Apache License 2.0 | 5 votes |
// scalastyle:off println package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData: Array[DenseVector[Double]] = { def generatePoint(i: Int): DenseVector[Double] = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (p1, c1)), (id2, (p2, c2))) => (id1, (p1 + p2, c1 + c2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } } // scalastyle:on println
Example 112
Source File: RecursivePlanDetails.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.sql.catalyst.plans.logical.LogicalPlan import scala.collection.mutable.HashMap class RecursivePlanDetails extends Serializable { private val baseRelationsByName = new HashMap[String, LogicalPlan] val recursiveRelations = new HashMap[String, LogicalPlan] val aggregateRelations = new HashMap[String, LogicalPlan] def addBaseRelation(name: String, obj: LogicalPlan) = { baseRelationsByName.put(name, obj) } def containsBaseRelation(name: String): Boolean = { baseRelationsByName.contains(name) } }
Example 113
Source File: CachedRDDManager.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark.execution.recursion import org.apache.spark.Logging import org.apache.spark.rdd.RDD import org.apache.spark.storage.StorageLevel import scala.collection.mutable.{HashMap, HashSet, Set} class CachedRDDManager(defaultStorageLevel: StorageLevel) extends Logging with Serializable { val iterationToRDDMap = new HashMap[Int, HashSet[RDD[_]]] var currentIteration : Int = 0 def persist(rdd: RDD[_]): Unit = { persist(rdd, false) } def persist(rdd: RDD[_], doMemoryCheckpoint: Boolean): Unit = { iterationToRDDMap.getOrElseUpdate(currentIteration, new HashSet[RDD[_]]).add(rdd) rdd.persist(defaultStorageLevel) if (doMemoryCheckpoint) rdd.memoryCheckpoint() } def cleanUpIteration(iterationsBackToRemove: Int = 2) = { val start = System.currentTimeMillis() if (currentIteration >= iterationsBackToRemove) { val iterationId = currentIteration - iterationsBackToRemove if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get if (rdds.nonEmpty) logInfo("Unpersisting "+rdds.size+" rdds for iteration " + iterationId) rdds.foreach(rdd => rdd.unpersist(false)) } } logInfo("CleanUpIteration took " + (System.currentTimeMillis() - start) + " ms") currentIteration += 1 } def cleanUpIterationById(iterationId: Int) = { if (iterationToRDDMap.contains(iterationId)) { val rdds: HashSet[RDD[_]] = iterationToRDDMap.remove(iterationId).get rdds.foreach(rdd => rdd.unpersist(false)) } } def incrementIteration() { currentIteration += 1} def clear() = { iterationToRDDMap.clear() } def clear(remainCached: Seq[RDD[_]]) = { iterationToRDDMap.keySet.foreach(key => logInfo("key: " + key + " value: " + iterationToRDDMap.get(key))) iterationToRDDMap.keySet .foreach(key => iterationToRDDMap.get(key) .foreach(value => value.foreach(item => {if (!remainCached.contains(item)) item.unpersist(false)}))) iterationToRDDMap.clear() } def unpersist(rdds: Set[RDD[_]]) = { for (rdd <- rdds) { iterationToRDDMap.synchronized { // rdd should only be in 1 iteration val iterations = iterationToRDDMap.filter(x => x._2.contains(rdd)) if (iterations.nonEmpty) { val iteration = iterations.head iteration._2.remove(rdd) rdd.unpersist(false) if (iteration._2.isEmpty) iterationToRDDMap.remove(iteration._1) } } } } override def toString = { val output = new StringBuilder iterationToRDDMap.keySet.toSeq.sorted .foreach(iteration => { val rdds = iterationToRDDMap.get(iteration) rdds.foreach(rdd => output.append(iteration + ":" + rdd + "\n")) }) output.toString() } }
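CachedRDDManager groups the RDDs persisted in each iteration under an iteration number with getOrElseUpdate, and later removes whole iterations to unpersist them. The sketch below mimics that bookkeeping with strings standing in for RDDs:

import scala.collection.mutable.{HashMap, HashSet}

object IterationRegistrySketch {
  def main(args: Array[String]): Unit = {
    // Strings stand in for RDDs; the bookkeeping mirrors CachedRDDManager.
    val iterationToRDDMap = new HashMap[Int, HashSet[String]]
    var currentIteration = 0

    def persist(rdd: String): Unit =
      iterationToRDDMap.getOrElseUpdate(currentIteration, new HashSet[String]).add(rdd)

    persist("rdd-a"); persist("rdd-b")
    currentIteration += 1
    persist("rdd-c")

    // Drop everything registered two iterations back (here: nothing yet).
    iterationToRDDMap.remove(currentIteration - 2).foreach(_.foreach(r => println(s"unpersist $r")))
    println(iterationToRDDMap)
  }
}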
Example 114
Source File: RelationCatalog.scala From BigDatalog with Apache License 2.0 | 5 votes |
package edu.ucla.cs.wis.bigdatalog.spark import org.apache.spark.rdd.RDD import org.apache.spark.sql.catalyst.InternalRow import org.apache.spark.sql.types.StructType import scala.collection.mutable.HashMap class RelationCatalog extends Serializable { val directory = HashMap.empty[String, RelationInfo] def addRelation(name : String, schema : StructType) : Unit = { val relationInfo = new RelationInfo().setSchema(schema) directory.get(name) match { case Some(oldRelationInfo) => // update rdd if already present. Schema should not change oldRelationInfo.setRDD(relationInfo.getRDD()) case None => directory.put(name, relationInfo) } } def setRDD(name : String, rdd : RDD[InternalRow]) : Unit = { directory.get(name) match { case Some(oldRelationInfo) => oldRelationInfo.setRDD(rdd) case None => directory.put(name, new RelationInfo().setRDD(rdd)) } } def getRelationInfo(name : String) : RelationInfo = { if (directory.contains(name)) directory(name) else null } def removeRDD(name : String) : Unit = { directory.remove(name) } def clear() : Unit = { directory.clear() } override def toString(): String = { val output = new StringBuilder() directory.iterator.foreach(f => output.append(f.toString())) output.toString() } } class RelationInfo() extends Serializable { private var schema : StructType = _ private var rdd : RDD[InternalRow] = _ def getSchema() : StructType = schema def setSchema(schema : StructType) : RelationInfo = { this.schema = schema this } def getRDD() : RDD[InternalRow] = rdd def setRDD(rdd : RDD[InternalRow]) : RelationInfo = { this.rdd = rdd this } override def toString() : String = { "schema: " + this.schema + (if (rdd != null) " RDD") } }
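RelationCatalog keeps its directory in a HashMap and uses get with a match to either update an existing entry or insert a new one. A trimmed sketch of that update-or-insert shape, with a one-field Info class standing in for RelationInfo:

import scala.collection.mutable.HashMap

object CatalogSketch {
  // Simplified RelationInfo: just an optional payload instead of schema + RDD.
  final class Info(var payload: Option[String] = None)

  val directory = HashMap.empty[String, Info]

  // Update-or-insert, the same get/match shape used by RelationCatalog.setRDD.
  def set(name: String, payload: String): Unit = directory.get(name) match {
    case Some(info) => info.payload = Some(payload)
    case None       => directory.put(name, new Info(Some(payload)))
  }

  def main(args: Array[String]): Unit = {
    set("edges", "rdd#1")
    set("edges", "rdd#2") // updates in place
    println(directory("edges").payload) // Some(rdd#2)
  }
}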
Example 115
Source File: UIData.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import org.apache.spark.JobExecutionStatus import org.apache.spark.executor.TaskMetrics import org.apache.spark.scheduler.{AccumulableInfo, TaskInfo} import org.apache.spark.util.collection.OpenHashSet import scala.collection.mutable import scala.collection.mutable.HashMap private[spark] object UIData { class ExecutorSummary { var taskTime : Long = 0 var failedTasks : Int = 0 var succeededTasks : Int = 0 var inputBytes : Long = 0 var inputRecords : Long = 0 var outputBytes : Long = 0 var outputRecords : Long = 0 var shuffleRead : Long = 0 var shuffleReadRecords : Long = 0 var shuffleWrite : Long = 0 var shuffleWriteRecords : Long = 0 var memoryBytesSpilled : Long = 0 var diskBytesSpilled : Long = 0 } class JobUIData( var jobId: Int = -1, var submissionTime: Option[Long] = None, var completionTime: Option[Long] = None, var stageIds: Seq[Int] = Seq.empty, var jobGroup: Option[String] = None, var status: JobExecutionStatus = JobExecutionStatus.UNKNOWN, case class TaskUIData( var taskInfo: TaskInfo, var taskMetrics: Option[TaskMetrics] = None, var errorMessage: Option[String] = None) case class ExecutorUIData( val startTime: Long, var finishTime: Option[Long] = None, var finishReason: Option[String] = None) }
Example 116
Source File: PoolTable.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.ui.jobs import java.net.URLEncoder import scala.collection.mutable.HashMap import scala.xml.Node import org.apache.spark.scheduler.{Schedulable, StageInfo} import org.apache.spark.ui.UIUtils private[ui] class PoolTable(pools: Seq[Schedulable], parent: StagesTab) { private val listener = parent.progressListener def toNodeSeq: Seq[Node] = { listener.synchronized { poolTable(poolRow, pools) } } private def poolTable( makeRow: (Schedulable, HashMap[String, HashMap[Int, StageInfo]]) => Seq[Node], rows: Seq[Schedulable]): Seq[Node] = { <table class="table table-bordered table-striped table-condensed sortable table-fixed"> <thead> <th>Pool Name</th> <th>Minimum Share</th> <th>Pool Weight</th> <th>Active Stages</th> <th>Running Tasks</th> <th>SchedulingMode</th> </thead> <tbody> {rows.map(r => makeRow(r, listener.poolToActiveStages))} </tbody> </table> } private def poolRow( p: Schedulable, poolToActiveStages: HashMap[String, HashMap[Int, StageInfo]]): Seq[Node] = { val activeStages = poolToActiveStages.get(p.name) match { case Some(stages) => stages.size case None => 0 } val href = "%s/stages/pool?poolname=%s" .format(UIUtils.prependBaseUri(parent.basePath), URLEncoder.encode(p.name, "UTF-8")) <tr> <td> <a href={href}>{p.name}</a> </td> <td>{p.minShare}</td> <td>{p.weight}</td> <td>{activeStages}</td> <td>{p.runningTasks}</td> <td>{p.schedulingMode}</td> </tr> } }
Example 117
Source File: StageInfo.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler import scala.collection.mutable.HashMap import org.apache.spark.annotation.DeveloperApi import org.apache.spark.storage.RDDInfo def fromStage( stage: Stage, attemptId: Int, numTasks: Option[Int] = None, taskLocalityPreferences: Seq[Seq[TaskLocation]] = Seq.empty ): StageInfo = { val ancestorRddInfos = stage.rdd.getNarrowAncestors.map(RDDInfo.fromRdd) val rddInfos = Seq(RDDInfo.fromRdd(stage.rdd)) ++ ancestorRddInfos new StageInfo( stage.id, attemptId, stage.name, numTasks.getOrElse(stage.numTasks), rddInfos, stage.parents.map(_.id), stage.details, taskLocalityPreferences) } }
Example 118
Source File: FixedPointJobDefinition.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.scheduler.fixedpoint import org.apache.spark.TaskContext import org.apache.spark.rdd.RDD import scala.collection.mutable.{HashSet, HashMap, Set} class FixedPointJobDefinition(val setupIteration: (FixedPointJobDefinition, RDD[_]) => RDD[_], val cleanupIteration: (Int) => Unit) { var _fixedPointEvaluator: (TaskContext, Iterator[_]) => Boolean = null var finalRDD: RDD[_] = null var rddIds = Array.empty[Int] // for all and delta rdd id for FixedPointResultTask execution on worker def fixedPointEvaluator(fixedPointEvaluator: (TaskContext, Iterator[_]) => Boolean) = { _fixedPointEvaluator = fixedPointEvaluator } def getfixedPointEvaluator = _fixedPointEvaluator.asInstanceOf[(TaskContext, Iterator[_]) => _] def getFinalRDD: RDD[_] = finalRDD def setRDDIds(newAllRDDId: Int, oldAllRDDId: Int, newDeltaPrimeRDDId: Int, oldDeltaPrimeRDDId: Int): Unit = { rddIds = Array(newAllRDDId, oldAllRDDId, newDeltaPrimeRDDId, oldDeltaPrimeRDDId) } }
Example 119
Source File: GroupedSumEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedSumEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val sum = entry.getValue.sum result.put(entry.getKey, new BoundedDouble(sum, 1.0, sum, sum)) } result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val meanEstimate = counter.mean val meanVar = counter.sampleVariance / counter.count val countEstimate = (counter.count + 1 - p) / p val countVar = (counter.count + 1) * (1 - p) / (p * p) val sumEstimate = meanEstimate * countEstimate val sumVar = (meanEstimate * meanEstimate * countVar) + (countEstimate * countEstimate * meanVar) + (meanVar * countVar) val sumStdev = math.sqrt(sumVar) val confFactor = studentTCacher.get(counter.count) val low = sumEstimate - confFactor * sumStdev val high = sumEstimate + confFactor * sumStdev result.put(entry.getKey, new BoundedDouble(sumEstimate, confidence, low, high)) } result.asScala } } }
Example 120
Source File: GroupedCountEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import scala.reflect.ClassTag import org.apache.commons.math3.distribution.NormalDistribution import org.apache.spark.util.collection.OpenHashMap private[spark] class GroupedCountEvaluator[T : ClassTag](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[OpenHashMap[T, Long], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new OpenHashMap[T, Long]() // Sum of counts for each key override def merge(outputId: Int, taskResult: OpenHashMap[T, Long]) { outputsMerged += 1 taskResult.foreach { case (key, value) => sums.changeValue(key, value, _ + value) } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => result.put(key, new BoundedDouble(sum, 1.0, sum, sum)) } result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val p = outputsMerged.toDouble / totalOutputs val confFactor = new NormalDistribution(). inverseCumulativeProbability(1 - (1 - confidence) / 2) val result = new JHashMap[T, BoundedDouble](sums.size) sums.foreach { case (key, sum) => val mean = (sum + 1 - p) / p val variance = (sum + 1) * (1 - p) / (p * p) val stdev = math.sqrt(variance) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result.put(key, new BoundedDouble(mean, confidence, low, high)) } result.asScala } } }
Example 121
Source File: GroupedMeanEvaluator.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark.partial import java.util.{HashMap => JHashMap} import scala.collection.JavaConverters._ import scala.collection.Map import scala.collection.mutable.HashMap import org.apache.spark.util.StatCounter private[spark] class GroupedMeanEvaluator[T](totalOutputs: Int, confidence: Double) extends ApproximateEvaluator[JHashMap[T, StatCounter], Map[T, BoundedDouble]] { var outputsMerged = 0 var sums = new JHashMap[T, StatCounter] // Sum of counts for each key override def merge(outputId: Int, taskResult: JHashMap[T, StatCounter]) { outputsMerged += 1 val iter = taskResult.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val old = sums.get(entry.getKey) if (old != null) { old.merge(entry.getValue) } else { sums.put(entry.getKey, entry.getValue) } } } override def currentResult(): Map[T, BoundedDouble] = { if (outputsMerged == totalOutputs) { val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val mean = entry.getValue.mean result.put(entry.getKey, new BoundedDouble(mean, 1.0, mean, mean)) } result.asScala } else if (outputsMerged == 0) { new HashMap[T, BoundedDouble] } else { val studentTCacher = new StudentTCacher(confidence) val result = new JHashMap[T, BoundedDouble](sums.size) val iter = sums.entrySet.iterator() while (iter.hasNext) { val entry = iter.next() val counter = entry.getValue val mean = counter.mean val stdev = math.sqrt(counter.sampleVariance / counter.count) val confFactor = studentTCacher.get(counter.count) val low = mean - confFactor * stdev val high = mean + confFactor * stdev result.put(entry.getKey, new BoundedDouble(mean, confidence, low, high)) } result.asScala } } }
Example 122
Source File: TaskContextImpl.scala From BigDatalog with Apache License 2.0 | 5 votes |
package org.apache.spark import scala.collection.mutable.{ArrayBuffer, HashMap} import org.apache.spark.executor.TaskMetrics import org.apache.spark.memory.TaskMemoryManager import org.apache.spark.metrics.MetricsSystem import org.apache.spark.metrics.source.Source import org.apache.spark.util.{TaskCompletionListener, TaskCompletionListenerException} private[spark] class TaskContextImpl( val stageId: Int, val partitionId: Int, override val taskAttemptId: Long, override val attemptNumber: Int, override val taskMemoryManager: TaskMemoryManager, @transient private val metricsSystem: MetricsSystem, internalAccumulators: Seq[Accumulator[Long]], val runningLocally: Boolean = false, val taskMetrics: TaskMetrics = TaskMetrics.empty) extends TaskContext with Logging { // For backwards-compatibility; this method is now deprecated as of 1.3.0. override def attemptId(): Long = taskAttemptId // List of callback functions to execute when the task completes. @transient private val onCompleteCallbacks = new ArrayBuffer[TaskCompletionListener] // Whether the corresponding task has been killed. @volatile private var interrupted: Boolean = false // Whether the task has completed. @volatile private var completed: Boolean = false override def addTaskCompletionListener(listener: TaskCompletionListener): this.type = { onCompleteCallbacks += listener this } override def addTaskCompletionListener(f: TaskContext => Unit): this.type = { onCompleteCallbacks += new TaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = f(context) } this } @deprecated("use addTaskCompletionListener", "1.1.0") override def addOnCompleteCallback(f: () => Unit) { onCompleteCallbacks += new TaskCompletionListener { override def onTaskCompletion(context: TaskContext): Unit = f() } } private[spark] def markInterrupted(): Unit = { interrupted = true } override def isCompleted(): Boolean = completed override def isRunningLocally(): Boolean = runningLocally override def isInterrupted(): Boolean = interrupted override def getMetricsSources(sourceName: String): Seq[Source] = metricsSystem.getSourcesByName(sourceName) @transient private val accumulators = new HashMap[Long, Accumulable[_, _]] private[spark] override def registerAccumulator(a: Accumulable[_, _]): Unit = synchronized { accumulators(a.id) = a } private[spark] override def collectInternalAccumulators(): Map[Long, Any] = synchronized { accumulators.filter(_._2.isInternal).mapValues(_.localValue).toMap } private[spark] override def collectAccumulators(): Map[Long, Any] = synchronized { accumulators.mapValues(_.localValue).toMap } //private[spark] override val internalMetricsToAccumulators: Map[String, Accumulator[Long]] = { // Explicitly register internal accumulators here because these are // not captured in the task closure and are already deserialized internalAccumulators.foreach(registerAccumulator) internalAccumulators.map { a => (a.name.get, a) }.toMap } }
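TaskContextImpl registers accumulators in a HashMap[Long, Accumulable[_, _]] and guards both registration and the snapshot in synchronized blocks. A minimal sketch of that synchronized registry, with a simplified Acc case class in place of Spark's Accumulable:

import scala.collection.mutable.HashMap

object AccumulatorRegistrySketch {
  // Simplified accumulator: just an id and a local value.
  final case class Acc(id: Long, localValue: Long)

  private val accumulators = new HashMap[Long, Acc]

  // Registration and snapshotting are synchronized, as in TaskContextImpl.
  def register(a: Acc): Unit = synchronized { accumulators(a.id) = a }
  def collect(): Map[Long, Long] = synchronized { accumulators.mapValues(_.localValue).toMap }

  def main(args: Array[String]): Unit = {
    register(Acc(1L, 10L)); register(Acc(2L, 5L))
    println(collect()) // Map(1 -> 10, 2 -> 5)
  }
}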
Example 123
Source File: LocalKMeans.scala From learning-spark with Apache License 2.0 | 5 votes |
package org.apache.spark.examples import java.util.Random import scala.collection.mutable.HashMap import scala.collection.mutable.HashSet import breeze.linalg.{Vector, DenseVector, squaredDistance} import org.apache.spark.SparkContext._ object LocalKMeans { val N = 1000 val R = 1000 // Scaling factor val D = 10 val K = 10 val convergeDist = 0.001 val rand = new Random(42) def generateData = { def generatePoint(i: Int) = { DenseVector.fill(D){rand.nextDouble * R} } Array.tabulate(N)(generatePoint) } def closestPoint(p: Vector[Double], centers: HashMap[Int, Vector[Double]]): Int = { var index = 0 var bestIndex = 0 var closest = Double.PositiveInfinity for (i <- 1 to centers.size) { val vCurr = centers.get(i).get val tempDist = squaredDistance(p, vCurr) if (tempDist < closest) { closest = tempDist bestIndex = i } } bestIndex } def showWarning() { System.err.println( """WARN: This is a naive implementation of KMeans Clustering and is given as an example! |Please use the KMeans method found in org.apache.spark.mllib.clustering |for more conventional use. """.stripMargin) } def main(args: Array[String]) { showWarning() val data = generateData var points = new HashSet[Vector[Double]] var kPoints = new HashMap[Int, Vector[Double]] var tempDist = 1.0 while (points.size < K) { points.add(data(rand.nextInt(N))) } val iter = points.iterator for (i <- 1 to points.size) { kPoints.put(i, iter.next()) } println("Initial centers: " + kPoints) while(tempDist > convergeDist) { var closest = data.map (p => (closestPoint(p, kPoints), (p, 1))) var mappings = closest.groupBy[Int] (x => x._1) var pointStats = mappings.map { pair => pair._2.reduceLeft [(Int, (Vector[Double], Int))] { case ((id1, (x1, y1)), (id2, (x2, y2))) => (id1, (x1 + x2, y1 + y2)) } } var newPoints = pointStats.map {mapping => (mapping._1, mapping._2._1 * (1.0 / mapping._2._2))} tempDist = 0.0 for (mapping <- newPoints) { tempDist += squaredDistance(kPoints.get(mapping._1).get, mapping._2) } for (newP <- newPoints) { kPoints.put(newP._1, newP._2) } } println("Final centers: " + kPoints) } }
Example 124
Source File: CollectionExample.scala From Scala-and-Spark-for-Big-Data-Analytics with MIT License | 5 votes |
package com.chapter3.ScalaFP import scala.collection._ import scala.collection.mutable.Buffer import scala.collection.mutable.HashMap object CollectionExample { def main(args: Array[String]) { val x = 10 val y = 15 val z = 19 Traversable(1, 2, 3) Iterable("x", "y", "z") Map("x" -> 10, "y" -> 13, "z" -> 17) Set("Red", "Green", "Blue") SortedSet("Hello,", "world!") Buffer(x, y, z) IndexedSeq(0.0, 1.0, 2.0) LinearSeq(x, y, z) List(2, 6, 10) HashMap("x" -> 20, "y" -> 19, "z" -> 16) val list = List(1, 2, 3) map (_ + 1) println(list) val set = Set(1, 2, 3) map (_ * 2) println(set) val list2 = List(x, y, z).map(x => x * 3) println(list2) } }
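Since mutable.HashMap is the one collection this page focuses on, a few of its everyday operations are worth spelling out; the following sketch shows creation, insertion, in-place update, lookup, and removal:

import scala.collection.mutable.HashMap

object HashMapBasics {
  def main(args: Array[String]): Unit = {
    val m = HashMap("x" -> 20, "y" -> 19, "z" -> 16)
    m += ("w" -> 7)                    // add an entry
    m("x") = 21                        // update in place
    println(m.get("y"))                // Some(19)
    println(m.getOrElse("missing", 0)) // 0
    m -= "z"                           // remove an entry
    println(m)
  }
}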
Example 125
Source File: NFAStructure.scala From piglet with Apache License 2.0 | 5 votes |
package dbis.piglet.cep.nfa import scala.collection.mutable.ArrayBuffer import scala.reflect.ClassTag import scala.collection.mutable.HashMap import dbis.piglet.backends.{SchemaClass => Event} import scala.collection.mutable.ListBuffer def addEvent(event: T, currentEdge: ForwardEdge[T]): Unit = { events += event //if (relatedValue != null) { // relatedValue.get(currentEdge.name.get) match { // case Some(x) => x.foreach (r => r.updateValue(event)) //case None => Nil //} //} currenState = currentEdge.destState if (currenState.isInstanceOf[FinalState[T]]) complete = true } override def clone(): NFAStructure[T] = { val copyStr = new NFAStructure[T](this.nfaController) copyStr.complete = this.complete copyStr.currenState = this.currenState copyStr.events = this.events.clone() //copyStr.events = this.events copyStr } }
Example 126
Source File: CorefUtils.scala From berkeley-doc-summarizer with GNU General Public License v3.0 | 5 votes |
package edu.berkeley.nlp.summ import scala.collection.mutable.ArrayBuffer import scala.collection.mutable.HashMap import edu.berkeley.nlp.entity.DepConstTree import edu.berkeley.nlp.entity.coref.Mention import edu.berkeley.nlp.entity.coref.PronounDictionary import edu.berkeley.nlp.entity.coref.MentionType import edu.berkeley.nlp.entity.coref.CorefDoc import edu.berkeley.nlp.entity.GUtil import edu.berkeley.nlp.futile.math.SloppyMath object CorefUtils { def getAntecedent(corefDoc: CorefDoc, predictor: edu.berkeley.nlp.entity.coref.PairwiseScorer, index: Int) = { val posteriors = computePosteriors(corefDoc, predictor, Seq(index)) GUtil.argMaxIdx(posteriors(0)) } def computePosteriors(corefDoc: CorefDoc, predictor: edu.berkeley.nlp.entity.coref.PairwiseScorer, indicesOfInterest: Seq[Int]): Array[Array[Double]] = { val docGraph = new edu.berkeley.nlp.entity.coref.DocumentGraph(corefDoc, false) Array.tabulate(indicesOfInterest.size)(idxIdxOfInterest => { val idx = indicesOfInterest(idxIdxOfInterest) val scores = Array.tabulate(idx+1)(antIdx => predictor.score(docGraph, idx, antIdx, false).toDouble) val logNormalizer = scores.foldLeft(Double.NegativeInfinity)(SloppyMath.logAdd(_, _)) for (antIdx <- 0 until scores.size) { scores(antIdx) = scores(antIdx) - logNormalizer } scores }) } def remapMentionType(ment: Mention) = { val newMentionType = if (ment.endIdx - ment.startIdx == 1 && PronounDictionary.isDemonstrative(ment.rawDoc.words(ment.sentIdx)(ment.headIdx))) { MentionType.DEMONSTRATIVE; } else if (ment.endIdx - ment.startIdx == 1 && PronounDictionary.isPronLc(ment.rawDoc.words(ment.sentIdx)(ment.headIdx))) { MentionType.PRONOMINAL; } else if (ment.rawDoc.pos(ment.sentIdx)(ment.headIdx) == "NNS" || ment.rawDoc.pos(ment.sentIdx)(ment.headIdx) == "NNPS") { MentionType.PROPER; } else { MentionType.NOMINAL; } new Mention(ment.rawDoc, ment.mentIdx, ment.sentIdx, ment.startIdx, ment.endIdx, ment.headIdx, ment.allHeadIndices, ment.isCoordinated, newMentionType, ment.nerString, ment.number, ment.gender) } def getMentionText(ment: Mention) = ment.rawDoc.words(ment.sentIdx).slice(ment.startIdx, ment.endIdx) def getMentionNerSpan(ment: Mention): Option[(Int,Int)] = { // Smallest NER chunk that contains the head val conllDoc = ment.rawDoc val matchingChunks = conllDoc.nerChunks(ment.sentIdx).filter(chunk => chunk.start <= ment.headIdx && ment.headIdx < chunk.end); if (!matchingChunks.isEmpty) { val smallestChunk = matchingChunks.sortBy(chunk => chunk.end - chunk.start).head; Some(smallestChunk.start -> smallestChunk.end) } else { None } } def getSpanHeads(tree: DepConstTree, startIdx: Int, endIdx: Int): Seq[Int] = getSpanHeads(tree.childParentDepMap, startIdx, endIdx); def getSpanHeads(childParentDepMap: HashMap[Int,Int], startIdx: Int, endIdx: Int): Seq[Int] = { // If it's a constituent, only one should have a head outside val outsidePointing = new ArrayBuffer[Int]; for (i <- startIdx until endIdx) { val ptr = childParentDepMap(i); if (ptr < startIdx || ptr >= endIdx) { outsidePointing += i; } } outsidePointing } def isDefinitelyPerson(str: String): Boolean = { val canonicalization = PronounDictionary.canonicalize(str) // N.B. Don't check "we" or "they" because those might be used in inanimate cases canonicalization == "i" || canonicalization == "you" || canonicalization == "he" || canonicalization == "she" } }
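getSpanHeads walks a HashMap[Int, Int] from child token index to parent index and keeps the tokens whose parent lies outside the span. A standalone sketch of that rule on a tiny hand-built dependency map:

import scala.collection.mutable.{ArrayBuffer, HashMap}

object SpanHeadsSketch {
  // Tokens in [startIdx, endIdx) whose parent lies outside the span are its heads,
  // the same rule getSpanHeads applies to the dependency map.
  def spanHeads(childParent: HashMap[Int, Int], startIdx: Int, endIdx: Int): Seq[Int] = {
    val outsidePointing = new ArrayBuffer[Int]
    for (i <- startIdx until endIdx) {
      val ptr = childParent(i)
      if (ptr < startIdx || ptr >= endIdx) outsidePointing += i
    }
    outsidePointing
  }

  def main(args: Array[String]): Unit = {
    // 5 tokens; token 2 is the root of the span [1, 4), pointing to token 4 outside it.
    val deps = HashMap(0 -> 2, 1 -> 2, 2 -> 4, 3 -> 2, 4 -> 4)
    println(spanHeads(deps, 1, 4)) // ArrayBuffer(2)
  }
}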